Revert "cmd/compile: use generated loops instead of DUFFCOPY on amd64"

This reverts commit ec9e1176c3 (CL 678620).

Reason for revert: causing regalloc to get into an infinite loop

Change-Id: Ie53c58c6126804af6d6883ea4acdcfb632a172bd
Reviewed-on: https://go-review.googlesource.com/c/go/+/695196
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@golang.org>
Keith Randall 2025-08-12 15:14:13 -07:00 committed by Gopher Robot
parent d2b3c1a504
commit 4e182db5fc
8 changed files with 226 additions and 258 deletions


@@ -142,6 +142,45 @@ func memIdx(a *obj.Addr, v *ssa.Value) {
 	a.Index = i
 }
 
+// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
+// See runtime/mkduff.go.
+const (
+	dzBlocks    = 16 // number of MOV/ADD blocks
+	dzBlockLen  = 4  // number of clears per block
+	dzBlockSize = 23 // size of instructions in a single block
+	dzMovSize   = 5  // size of single MOV instruction w/ offset
+	dzLeaqSize  = 4  // size of single LEAQ instruction
+	dzClearStep = 16 // number of bytes cleared by each MOV instruction
+)
+
+func duffStart(size int64) int64 {
+	x, _ := duff(size)
+	return x
+}
+
+func duffAdj(size int64) int64 {
+	_, x := duff(size)
+	return x
+}
+
+// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
+// required to use the duffzero mechanism for a block of the given size.
+func duff(size int64) (int64, int64) {
+	if size < 32 || size > 1024 || size%dzClearStep != 0 {
+		panic("bad duffzero size")
+	}
+	steps := size / dzClearStep
+	blocks := steps / dzBlockLen
+	steps %= dzBlockLen
+	off := dzBlockSize * (dzBlocks - blocks)
+	var adj int64
+	if steps != 0 {
+		off -= dzLeaqSize
+		off -= dzMovSize * steps
+		adj -= dzClearStep * (dzBlockLen - steps)
+	}
+	return off, adj
+}
+
 func getgFromTLS(s *ssagen.State, r int16) {
 	// See the comments in cmd/internal/obj/x86/obj6.go
 	// near CanUse1InsnTLS for a detailed explanation of these instructions.
@@ -1065,110 +1104,20 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 			zero16(off + n - 16)
 		}
-	case ssa.OpAMD64LoweredMove:
-		dstReg := v.Args[0].Reg()
-		srcReg := v.Args[1].Reg()
-		if dstReg == srcReg {
-			break
-		}
-		tmpReg := int16(x86.REG_X14)
-		n := v.AuxInt
-		if n < 16 {
-			v.Fatalf("Move too small %d", n)
-		}
-		// move 16 bytes from srcReg+off to dstReg+off.
-		move16 := func(off int64) {
-			move16(s, srcReg, dstReg, tmpReg, off)
-		}
-
-		// Generate copying instructions.
-		var off int64
-		for n >= 16 {
-			move16(off)
-			off += 16
-			n -= 16
-		}
-		if n != 0 {
-			// use partially overlapped read/write.
-			// TODO: use smaller operations when we can?
-			move16(off + n - 16)
-		}
-	case ssa.OpAMD64LoweredMoveLoop:
-		dstReg := v.Args[0].Reg()
-		srcReg := v.Args[1].Reg()
-		if dstReg == srcReg {
-			break
-		}
-		countReg := v.RegTmp()
-		tmpReg := int16(x86.REG_X14)
-		n := v.AuxInt
-		loopSize := int64(64)
-		if n < 3*loopSize {
-			// - a loop count of 0 won't work.
-			// - a loop count of 1 is useless.
-			// - a loop count of 2 is a code size ~tie
-			//     4 instructions to implement the loop
-			//     4 instructions in the loop body
-			//   vs
-			//     8 instructions in the straightline code
-			//   Might as well use straightline code.
-			v.Fatalf("ZeroLoop size too small %d", n)
-		}
-		// move 16 bytes from srcReg+off to dstReg+off.
-		move16 := func(off int64) {
-			move16(s, srcReg, dstReg, tmpReg, off)
-		}
-
-		// Put iteration count in a register.
-		//   MOVL $n, countReg
-		p := s.Prog(x86.AMOVL)
-		p.From.Type = obj.TYPE_CONST
-		p.From.Offset = n / loopSize
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = countReg
-		cntInit := p
-
-		// Copy loopSize bytes starting at srcReg to dstReg.
-		for i := range loopSize / 16 {
-			move16(i * 16)
-		}
-		// ADDQ $loopSize, srcReg
-		p = s.Prog(x86.AADDQ)
-		p.From.Type = obj.TYPE_CONST
-		p.From.Offset = loopSize
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = srcReg
-		// ADDQ $loopSize, dstReg
-		p = s.Prog(x86.AADDQ)
-		p.From.Type = obj.TYPE_CONST
-		p.From.Offset = loopSize
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = dstReg
-		// DECL countReg
-		p = s.Prog(x86.ADECL)
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = countReg
-		// Jump to loop header if we're not done yet.
-		//   JNE head
-		p = s.Prog(x86.AJNE)
-		p.To.Type = obj.TYPE_BRANCH
-		p.To.SetTarget(cntInit.Link)
-
-		// Multiples of the loop size are now done.
-		n %= loopSize
-
-		// Copy any fractional portion.
-		var off int64
-		for n >= 16 {
-			move16(off)
-			off += 16
-			n -= 16
-		}
-		if n != 0 {
-			// Use partially-overlapping copy.
-			move16(off + n - 16)
-		}
+	case ssa.OpAMD64DUFFCOPY:
+		p := s.Prog(obj.ADUFFCOPY)
+		p.To.Type = obj.TYPE_ADDR
+		p.To.Sym = ir.Syms.Duffcopy
+		if v.AuxInt%16 != 0 {
+			v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt)
+		}
+		p.To.Offset = 14 * (64 - v.AuxInt/16)
+		// 14 and 64 are magic constants. 14 is the number of bytes to encode:
+		//	MOVUPS	(SI), X0
+		//	ADDQ	$16, SI
+		//	MOVUPS	X0, (DI)
+		//	ADDQ	$16, DI
+		// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
 	case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
 		if v.Type.IsMemory() {
@@ -1760,21 +1709,3 @@ func zero16(s *ssagen.State, reg int16, off int64) {
 	p.To.Reg = reg
 	p.To.Offset = off
 }
-
-// move 16 bytes from src+off to dst+off using temporary register tmp.
-func move16(s *ssagen.State, src, dst, tmp int16, off int64) {
-	//   MOVUPS off(srcReg), tmpReg
-	//   MOVUPS tmpReg, off(dstReg)
-	p := s.Prog(x86.AMOVUPS)
-	p.From.Type = obj.TYPE_MEM
-	p.From.Reg = src
-	p.From.Offset = off
-	p.To.Type = obj.TYPE_REG
-	p.To.Reg = tmp
-	p = s.Prog(x86.AMOVUPS)
-	p.From.Type = obj.TYPE_REG
-	p.From.Reg = tmp
-	p.To.Type = obj.TYPE_MEM
-	p.To.Reg = dst
-	p.To.Offset = off
-}
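
To make the offset arithmetic in the restored code above concrete, here is a small standalone sketch, not part of this CL, that recomputes the duffzero jump offset/pointer adjust from the duff helper and the "14 and 64 are magic constants" duffcopy formula. The sample size 592 is an arbitrary choice for illustration.

// Standalone sketch (not compiler code): recomputes the jump offsets used by
// the restored DUFFZERO/DUFFCOPY paths, with constants taken from the diff above.
package main

import "fmt"

const (
	dzBlocks    = 16 // number of MOV/ADD blocks in runtime.duffzero
	dzBlockLen  = 4  // clears per block
	dzBlockSize = 23 // encoded bytes per block
	dzMovSize   = 5  // bytes per MOV with offset
	dzLeaqSize  = 4  // bytes per LEAQ
	dzClearStep = 16 // bytes cleared per MOV
)

// duff mirrors the duff helper re-added above: it returns the byte offset into
// runtime.duffzero to jump to, and the pointer adjustment to apply first.
func duff(size int64) (off, adj int64) {
	if size < 32 || size > 1024 || size%dzClearStep != 0 {
		panic("bad duffzero size")
	}
	steps := size / dzClearStep
	blocks := steps / dzBlockLen
	steps %= dzBlockLen
	off = dzBlockSize * (dzBlocks - blocks)
	if steps != 0 {
		off -= dzLeaqSize
		off -= dzMovSize * steps
		adj -= dzClearStep * (dzBlockLen - steps)
	}
	return off, adj
}

// duffcopyOffset applies the formula from the DUFFCOPY case: each 16-byte copy
// block of runtime.duffcopy encodes to 14 bytes, and there are 64 such blocks,
// so smaller copies enter the routine later.
func duffcopyOffset(size int64) int64 {
	return 14 * (64 - size/16)
}

func main() {
	off, adj := duff(592)
	fmt.Println(off, adj)            // 152 -48: jump 152 bytes in, with a pointer adjust of -48 bytes
	fmt.Println(duffcopyOffset(592)) // 378: skip the first 27 of the 64 copy blocks, leaving 37 to run
}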


@@ -264,6 +264,24 @@
 (Move [8] dst src mem) => (MOVQstore dst (MOVQload src mem) mem)
 (Move [16] dst src mem) => (MOVOstore dst (MOVOload src mem) mem)
+(Move [32] dst src mem) =>
+	(Move [16]
+		(OffPtr <dst.Type> dst [16])
+		(OffPtr <src.Type> src [16])
+		(Move [16] dst src mem))
+
+(Move [48] dst src mem) =>
+	(Move [32]
+		(OffPtr <dst.Type> dst [16])
+		(OffPtr <src.Type> src [16])
+		(Move [16] dst src mem))
+
+(Move [64] dst src mem) =>
+	(Move [32]
+		(OffPtr <dst.Type> dst [32])
+		(OffPtr <src.Type> src [32])
+		(Move [32] dst src mem))
+
 (Move [3] dst src mem) =>
 	(MOVBstore [2] dst (MOVBload [2] src mem)
 		(MOVWstore dst (MOVWload src mem) mem))
@@ -292,19 +310,28 @@
 	(MOVQstore [int32(s-8)] dst (MOVQload [int32(s-8)] src mem)
 		(MOVQstore dst (MOVQload src mem) mem))
 
-// Copying up to 192 bytes uses straightline code.
-(Move [s] dst src mem) && s > 16 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
-
-// Copying up to ~1KB uses a small loop.
-(Move [s] dst src mem) && s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)
+// Adjust moves to be a multiple of 16 bytes.
+(Move [s] dst src mem)
+	&& s > 16 && s%16 != 0 && s%16 <= 8 =>
+	(Move [s-s%16]
+		(OffPtr <dst.Type> dst [s%16])
+		(OffPtr <src.Type> src [s%16])
+		(MOVQstore dst (MOVQload src mem) mem))
+(Move [s] dst src mem)
+	&& s > 16 && s%16 != 0 && s%16 > 8 =>
+	(Move [s-s%16]
+		(OffPtr <dst.Type> dst [s%16])
+		(OffPtr <src.Type> src [s%16])
+		(MOVOstore dst (MOVOload src mem) mem))
+
+// Medium copying uses a duff device.
+(Move [s] dst src mem)
+	&& s > 64 && s <= 16*64 && s%16 == 0
+	&& logLargeCopy(v, s) =>
+	(DUFFCOPY [s] dst src mem)
 
 // Large copying uses REP MOVSQ.
-(Move [s] dst src mem) && s > repMoveThreshold && s%8 != 0 =>
-	(Move [s-s%8]
-		(OffPtr <dst.Type> dst [s%8])
-		(OffPtr <src.Type> src [s%8])
-		(MOVQstore dst (MOVQload src mem) mem))
-(Move [s] dst src mem) && s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s) =>
+(Move [s] dst src mem) && s > 16*64 && s%8 == 0 && logLargeCopy(v, s) =>
 	(REPMOVSQ dst src (MOVQconst [s/8]) mem)
 
 // Lowering Zero instructions
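
For orientation, the following is an illustrative sketch only: a plain Go restatement of the size thresholds the restored rules above use to pick a copy strategy. The real lowering works by term rewriting in the rule engine; the function and variable names here are invented for the example.

// Sketch (assumed names, not compiler code): trace which lowering the restored
// rules select for an s-byte Move, after the "adjust to a multiple of 16 bytes"
// rules have peeled off the s%16 remainder with a single MOVQ/MOVO pair.
package main

import "fmt"

func strategy(s int64) string {
	switch {
	case s <= 64:
		return "straightline loads/stores (small-size rules, incl. the Move [32]/[48]/[64] expansions)"
	case s <= 16*64: // up to 1024 bytes once rounded to a multiple of 16
		return "DUFFCOPY"
	default:
		return "REPMOVSQ (REP MOVSQ with count s/8)"
	}
}

func main() {
	for _, s := range []int64{48, 100, 592, 4096} {
		rem := s % 16
		fmt.Printf("Move [%d]: peel %d leading bytes, then %s\n", s, rem, strategy(s-rem))
	}
}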


@@ -939,38 +939,20 @@ func init() {
 		// arg0 = destination pointer
 		// arg1 = source pointer
 		// arg2 = mem
-		// auxint = # of bytes to copy
+		// auxint = # of bytes to copy, must be multiple of 16
 		// returns memory
 		{
-			name:      "LoweredMove",
+			name:      "DUFFCOPY",
 			aux:       "Int64",
 			argLength: 3,
 			reg: regInfo{
-				inputs:   []regMask{gp, gp},
-				clobbers: buildReg("X14"), // uses X14 as a temporary
-			},
-			faultOnNilArg0: true,
-			faultOnNilArg1: true,
-		},
-		// arg0 = destination pointer
-		// arg1 = source pointer
-		// arg2 = mem
-		// auxint = # of bytes to copy
-		// returns memory
-		{
-			name:      "LoweredMoveLoop",
-			aux:       "Int64",
-			argLength: 3,
-			reg: regInfo{
-				inputs:       []regMask{gp, gp},
-				clobbers:     buildReg("X14"), // uses X14 as a temporary
-				clobbersArg0: true,
-				clobbersArg1: true,
+				inputs:   []regMask{buildReg("DI"), buildReg("SI")},
+				clobbers: buildReg("DI SI X0"), // uses X0 as a temporary
 			},
-			clobberFlags:   true,
-			faultOnNilArg0: true,
-			faultOnNilArg1: true,
-			needIntTemp:    true,
+			clobberFlags: true,
+			//faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
+			//faultOnNilArg1: true,
+			unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
 		},
 		// arg0 = destination pointer


@@ -1058,8 +1058,7 @@ const (
 	OpAMD64CALLtail
 	OpAMD64CALLclosure
 	OpAMD64CALLinter
-	OpAMD64LoweredMove
-	OpAMD64LoweredMoveLoop
+	OpAMD64DUFFCOPY
 	OpAMD64REPMOVSQ
 	OpAMD64InvertFlags
 	OpAMD64LoweredGetG
@@ -13966,35 +13965,17 @@ var opcodeTable = [...]opInfo{
 		},
 	},
 	{
-		name:           "LoweredMove",
-		auxType:        auxInt64,
-		argLen:         3,
-		faultOnNilArg0: true,
-		faultOnNilArg1: true,
-		reg: regInfo{
-			inputs: []inputInfo{
-				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
-				{1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
-			},
-			clobbers: 1073741824, // X14
-		},
-	},
-	{
-		name:           "LoweredMoveLoop",
-		auxType:        auxInt64,
-		argLen:         3,
-		clobberFlags:   true,
-		needIntTemp:    true,
-		faultOnNilArg0: true,
-		faultOnNilArg1: true,
-		reg: regInfo{
-			inputs: []inputInfo{
-				{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
-				{1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
-			},
-			clobbers:     1073741824, // X14
-			clobbersArg0: true,
-			clobbersArg1: true,
+		name:         "DUFFCOPY",
+		auxType:      auxInt64,
+		argLen:       3,
+		clobberFlags: true,
+		unsafePoint:  true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 128}, // DI
+				{1, 64},  // SI
+			},
+			clobbers: 65728, // SI DI X0
 		},
 	},
 	{
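
The generated opcode table above encodes register sets as bitmasks; the decoded names are in the trailing comments. As a quick illustration, here is a standalone sketch, not compiler code, that expands those masks. The bit ordering is inferred from the mask comments in the diff (bit 0 = AX, bit 6 = SI, bit 7 = DI, bits 16 and up = X0..X15).

// Sketch: decode the regMask values appearing in the opGen table above.
package main

import "fmt"

var regNames = []string{
	"AX", "CX", "DX", "BX", "SP", "BP", "SI", "DI",
	"R8", "R9", "R10", "R11", "R12", "R13", "R14", "R15",
	"X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7",
	"X8", "X9", "X10", "X11", "X12", "X13", "X14", "X15",
}

// decode expands a register bitmask into the register names it covers.
func decode(mask uint64) []string {
	var out []string
	for i, name := range regNames {
		if mask&(1<<uint(i)) != 0 {
			out = append(out, name)
		}
	}
	return out
}

func main() {
	fmt.Println(decode(128))        // [DI]       — DUFFCOPY input 0
	fmt.Println(decode(64))         // [SI]       — DUFFCOPY input 1
	fmt.Println(decode(65728))      // [SI DI X0] — DUFFCOPY clobbers
	fmt.Println(decode(1073741824)) // [X14]      — the removed LoweredMove clobber
	fmt.Println(decode(49135))      // the general-purpose set minus SP and R14
}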


@@ -561,14 +561,7 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, pos
 	pos = pos.WithNotStmt()
 	// Check if v is already in a requested register.
 	if mask&vi.regs != 0 {
-		mask &= vi.regs
-		r := pickReg(mask)
-		if mask.contains(s.SPReg) {
-			// Prefer the stack pointer if it is allowed.
-			// (Needed because the op might have an Aux symbol
-			// that needs SP as its base.)
-			r = s.SPReg
-		}
+		r := pickReg(mask & vi.regs)
 		if !s.allocatable.contains(r) {
 			return v // v is in a fixed register
 		}


@@ -240,30 +240,6 @@ func TestClobbersArg0(t *testing.T) {
 	}
 }
 
-func TestClobbersArg1(t *testing.T) {
-	c := testConfig(t)
-	f := c.Fun("entry",
-		Bloc("entry",
-			Valu("mem", OpInitMem, types.TypeMem, 0, nil),
-			Valu("src", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())),
-			Valu("dst", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())),
-			Valu("use1", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())),
-			Valu("use2", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())),
-			Valu("move", OpAMD64LoweredMoveLoop, types.TypeMem, 256, nil, "dst", "src", "mem"),
-			Valu("store1", OpAMD64MOVQstore, types.TypeMem, 0, nil, "use1", "src", "move"),
-			Valu("store2", OpAMD64MOVQstore, types.TypeMem, 0, nil, "use2", "dst", "store1"),
-			Exit("store2")))
-	flagalloc(f.f)
-	regalloc(f.f)
-	checkFunc(f.f)
-	// LoweredMoveLoop clobbers its arguments, so there must be a copy of "src" and "dst" somewhere
-	// so we still have that value available at the stores.
-	if n := numCopies(f.blocks["entry"]); n != 2 {
-		fmt.Printf("%s\n", f.f.String())
-		t.Errorf("got %d copies, want 2", n)
-	}
-}
-
 func numSpills(b *Block) int {
 	return numOps(b, OpStoreReg)
 }


@@ -31,7 +31,6 @@ const (
 	removeDeadValues = true
 
 	repZeroThreshold = 1408 // size beyond which we use REP STOS for zeroing
-	repMoveThreshold = 1408 // size beyond which we use REP MOVS for copying
 )
 
 // deadcode indicates whether rewrite should try to remove any values that become dead.


@@ -27307,6 +27307,75 @@ func rewriteValueAMD64_OpMove(v *Value) bool {
 		v.AddArg3(dst, v0, mem)
 		return true
 	}
+	// match: (Move [32] dst src mem)
+	// result: (Move [16] (OffPtr <dst.Type> dst [16]) (OffPtr <src.Type> src [16]) (Move [16] dst src mem))
+	for {
+		if auxIntToInt64(v.AuxInt) != 32 {
+			break
+		}
+		dst := v_0
+		src := v_1
+		mem := v_2
+		v.reset(OpMove)
+		v.AuxInt = int64ToAuxInt(16)
+		v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
+		v0.AuxInt = int64ToAuxInt(16)
+		v0.AddArg(dst)
+		v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
+		v1.AuxInt = int64ToAuxInt(16)
+		v1.AddArg(src)
+		v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
+		v2.AuxInt = int64ToAuxInt(16)
+		v2.AddArg3(dst, src, mem)
+		v.AddArg3(v0, v1, v2)
+		return true
+	}
+	// match: (Move [48] dst src mem)
+	// result: (Move [32] (OffPtr <dst.Type> dst [16]) (OffPtr <src.Type> src [16]) (Move [16] dst src mem))
+	for {
+		if auxIntToInt64(v.AuxInt) != 48 {
+			break
+		}
+		dst := v_0
+		src := v_1
+		mem := v_2
+		v.reset(OpMove)
+		v.AuxInt = int64ToAuxInt(32)
+		v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
+		v0.AuxInt = int64ToAuxInt(16)
+		v0.AddArg(dst)
+		v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
+		v1.AuxInt = int64ToAuxInt(16)
+		v1.AddArg(src)
+		v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
+		v2.AuxInt = int64ToAuxInt(16)
+		v2.AddArg3(dst, src, mem)
+		v.AddArg3(v0, v1, v2)
+		return true
+	}
+	// match: (Move [64] dst src mem)
+	// result: (Move [32] (OffPtr <dst.Type> dst [32]) (OffPtr <src.Type> src [32]) (Move [32] dst src mem))
+	for {
+		if auxIntToInt64(v.AuxInt) != 64 {
+			break
+		}
+		dst := v_0
+		src := v_1
+		mem := v_2
+		v.reset(OpMove)
+		v.AuxInt = int64ToAuxInt(32)
+		v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
+		v0.AuxInt = int64ToAuxInt(32)
+		v0.AddArg(dst)
+		v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
+		v1.AuxInt = int64ToAuxInt(32)
+		v1.AddArg(src)
+		v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
+		v2.AuxInt = int64ToAuxInt(32)
+		v2.AddArg3(dst, src, mem)
+		v.AddArg3(v0, v1, v2)
+		return true
+	}
 	// match: (Move [3] dst src mem)
 	// result: (MOVBstore [2] dst (MOVBload [2] src mem) (MOVWstore dst (MOVWload src mem) mem))
 	for {
@@ -27499,55 +27568,23 @@ func rewriteValueAMD64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > 16 && s < 192 && logLargeCopy(v, s)
-	// result: (LoweredMove [s] dst src mem)
+	// cond: s > 16 && s%16 != 0 && s%16 <= 8
+	// result: (Move [s-s%16] (OffPtr <dst.Type> dst [s%16]) (OffPtr <src.Type> src [s%16]) (MOVQstore dst (MOVQload src mem) mem))
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 16 && s < 192 && logLargeCopy(v, s)) {
-			break
-		}
-		v.reset(OpAMD64LoweredMove)
-		v.AuxInt = int64ToAuxInt(s)
-		v.AddArg3(dst, src, mem)
-		return true
-	}
-	// match: (Move [s] dst src mem)
-	// cond: s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s)
-	// result: (LoweredMoveLoop [s] dst src mem)
-	for {
-		s := auxIntToInt64(v.AuxInt)
-		dst := v_0
-		src := v_1
-		mem := v_2
-		if !(s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s)) {
-			break
-		}
-		v.reset(OpAMD64LoweredMoveLoop)
-		v.AuxInt = int64ToAuxInt(s)
-		v.AddArg3(dst, src, mem)
-		return true
-	}
-	// match: (Move [s] dst src mem)
-	// cond: s > repMoveThreshold && s%8 != 0
-	// result: (Move [s-s%8] (OffPtr <dst.Type> dst [s%8]) (OffPtr <src.Type> src [s%8]) (MOVQstore dst (MOVQload src mem) mem))
-	for {
-		s := auxIntToInt64(v.AuxInt)
-		dst := v_0
-		src := v_1
-		mem := v_2
-		if !(s > repMoveThreshold && s%8 != 0) {
+		if !(s > 16 && s%16 != 0 && s%16 <= 8) {
 			break
 		}
 		v.reset(OpMove)
-		v.AuxInt = int64ToAuxInt(s - s%8)
+		v.AuxInt = int64ToAuxInt(s - s%16)
 		v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
-		v0.AuxInt = int64ToAuxInt(s % 8)
+		v0.AuxInt = int64ToAuxInt(s % 16)
 		v0.AddArg(dst)
 		v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
-		v1.AuxInt = int64ToAuxInt(s % 8)
+		v1.AuxInt = int64ToAuxInt(s % 16)
 		v1.AddArg(src)
 		v2 := b.NewValue0(v.Pos, OpAMD64MOVQstore, types.TypeMem)
 		v3 := b.NewValue0(v.Pos, OpAMD64MOVQload, typ.UInt64)
@@ -27557,14 +27594,56 @@ func rewriteValueAMD64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s)
+	// cond: s > 16 && s%16 != 0 && s%16 > 8
+	// result: (Move [s-s%16] (OffPtr <dst.Type> dst [s%16]) (OffPtr <src.Type> src [s%16]) (MOVOstore dst (MOVOload src mem) mem))
+	for {
+		s := auxIntToInt64(v.AuxInt)
+		dst := v_0
+		src := v_1
+		mem := v_2
+		if !(s > 16 && s%16 != 0 && s%16 > 8) {
+			break
+		}
+		v.reset(OpMove)
+		v.AuxInt = int64ToAuxInt(s - s%16)
+		v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
+		v0.AuxInt = int64ToAuxInt(s % 16)
+		v0.AddArg(dst)
+		v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
+		v1.AuxInt = int64ToAuxInt(s % 16)
+		v1.AddArg(src)
+		v2 := b.NewValue0(v.Pos, OpAMD64MOVOstore, types.TypeMem)
+		v3 := b.NewValue0(v.Pos, OpAMD64MOVOload, types.TypeInt128)
+		v3.AddArg2(src, mem)
+		v2.AddArg3(dst, v3, mem)
+		v.AddArg3(v0, v1, v2)
+		return true
+	}
+	// match: (Move [s] dst src mem)
+	// cond: s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)
+	// result: (DUFFCOPY [s] dst src mem)
+	for {
+		s := auxIntToInt64(v.AuxInt)
+		dst := v_0
+		src := v_1
+		mem := v_2
+		if !(s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)) {
+			break
+		}
+		v.reset(OpAMD64DUFFCOPY)
+		v.AuxInt = int64ToAuxInt(s)
+		v.AddArg3(dst, src, mem)
+		return true
+	}
+	// match: (Move [s] dst src mem)
+	// cond: s > 16*64 && s%8 == 0 && logLargeCopy(v, s)
 	// result: (REPMOVSQ dst src (MOVQconst [s/8]) mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s)) {
+		if !(s > 16*64 && s%8 == 0 && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpAMD64REPMOVSQ)