cmd/compile: use generated loops instead of DUFFCOPY on amd64
goarch: amd64
cpu: 12th Gen Intel(R) Core(TM) i7-12700
                         │    base     │                 exp                 │
                         │   sec/op    │   sec/op     vs base                │
MemmoveKnownSize112-20     1.764n ± 0%   1.247n ± 0%  -29.31% (p=0.000 n=10)
MemmoveKnownSize128-20     1.891n ± 0%   1.405n ± 1%  -25.72% (p=0.000 n=10)
MemmoveKnownSize192-20     2.521n ± 0%   2.114n ± 3%  -16.16% (p=0.000 n=10)
MemmoveKnownSize248-20     4.028n ± 0%   3.877n ± 1%   -3.75% (p=0.000 n=10)
MemmoveKnownSize256-20     3.272n ± 0%   2.961n ± 2%   -9.53% (p=0.000 n=10)
MemmoveKnownSize512-20     6.733n ± 3%   5.936n ± 4%  -11.83% (p=0.000 n=10)
MemmoveKnownSize1024-20   13.905n ± 5%   9.798n ± 9%  -29.54% (p=0.000 n=10)

Change-Id: Icc01cec0d8b072300d749a5ce76f53b3725b5c65
Reviewed-on: https://go-review.googlesource.com/c/go/+/678620
Reviewed-by: Jorropo <jorropo.pgm@gmail.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Jakub Ciolek <jakub@ciolek.dev>
This commit is contained in:
parent d0a64f7969
commit ec9e1176c3

8 changed files with 258 additions and 226 deletions
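The MemmoveKnownSize benchmarks above measure copies whose length is a compile-time constant, which is exactly the case this change lowers differently: instead of jumping into runtime's duffcopy, the compiler now emits either straightline MOVUPS pairs or a short counted loop. A minimal, illustrative benchmark of the same flavor (not the actual runtime benchmark source):

package copybench

import "testing"

var sink [256]byte

// A fixed-size copy like this compiles to an SSA Move with a constant
// size, so it exercises the new LoweredMove/LoweredMoveLoop lowering.
func BenchmarkKnownSizeCopy256(b *testing.B) {
    var src [256]byte
    for i := 0; i < b.N; i++ {
        sink = src // 256 bytes, size known at compile time
    }
}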
@@ -142,45 +142,6 @@ func memIdx(a *obj.Addr, v *ssa.Value) {
    a.Index = i
}

// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
// See runtime/mkduff.go.
const (
    dzBlocks    = 16 // number of MOV/ADD blocks
    dzBlockLen  = 4  // number of clears per block
    dzBlockSize = 23 // size of instructions in a single block
    dzMovSize   = 5  // size of single MOV instruction w/ offset
    dzLeaqSize  = 4  // size of single LEAQ instruction
    dzClearStep = 16 // number of bytes cleared by each MOV instruction
)

func duffStart(size int64) int64 {
    x, _ := duff(size)
    return x
}
func duffAdj(size int64) int64 {
    _, x := duff(size)
    return x
}

// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
// required to use the duffzero mechanism for a block of the given size.
func duff(size int64) (int64, int64) {
    if size < 32 || size > 1024 || size%dzClearStep != 0 {
        panic("bad duffzero size")
    }
    steps := size / dzClearStep
    blocks := steps / dzBlockLen
    steps %= dzBlockLen
    off := dzBlockSize * (dzBlocks - blocks)
    var adj int64
    if steps != 0 {
        off -= dzLeaqSize
        off -= dzMovSize * steps
        adj -= dzClearStep * (dzBlockLen - steps)
    }
    return off, adj
}

func getgFromTLS(s *ssagen.State, r int16) {
    // See the comments in cmd/internal/obj/x86/obj6.go
    // near CanUse1InsnTLS for a detailed explanation of these instructions.
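For reference, the deleted duff helper computed how far into the pre-generated duffzero instruction stream to jump, plus a pointer pre-adjustment. A worked example with the constants above (dzBlocks=16, dzBlockLen=4, dzBlockSize=23, dzMovSize=5, dzLeaqSize=4, dzClearStep=16), for a 96-byte block:

    steps  = 96/16     = 6
    blocks = 6/4       = 1     (then steps %= 4 -> 2)
    off    = 23*(16-1) = 345
             steps != 0, so off -= 4 + 5*2 -> 331
    adj    = -16*(4-2) = -32
    duff(96) == (331, -32)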
@@ -1104,20 +1065,110 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
        zero16(off + n - 16)
    }

case ssa.OpAMD64DUFFCOPY:
    p := s.Prog(obj.ADUFFCOPY)
    p.To.Type = obj.TYPE_ADDR
    p.To.Sym = ir.Syms.Duffcopy
    if v.AuxInt%16 != 0 {
        v.Fatalf("bad DUFFCOPY AuxInt %v", v.AuxInt)
case ssa.OpAMD64LoweredMove:
    dstReg := v.Args[0].Reg()
    srcReg := v.Args[1].Reg()
    if dstReg == srcReg {
        break
    }
    tmpReg := int16(x86.REG_X14)
    n := v.AuxInt
    if n < 16 {
        v.Fatalf("Move too small %d", n)
    }
    // move 16 bytes from srcReg+off to dstReg+off.
    move16 := func(off int64) {
        move16(s, srcReg, dstReg, tmpReg, off)
    }

    // Generate copying instructions.
    var off int64
    for n >= 16 {
        move16(off)
        off += 16
        n -= 16
    }
    if n != 0 {
        // use partially overlapped read/write.
        // TODO: use smaller operations when we can?
        move16(off + n - 16)
    }

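To make the straightline case above concrete: the size is covered by 16-byte chunks from offset 0, and any remainder is handled by one final chunk slid back so it ends exactly at the end of the buffer, overlapping the previous chunk. A tiny helper mirroring that offset schedule (illustrative only, not part of the compiler):

// moveOffsets returns the offsets at which the straightline LoweredMove
// lowering issues its 16-byte MOVUPS copies. Assumes n >= 16, matching
// the Fatalf check above.
func moveOffsets(n int64) []int64 {
    var offs []int64
    var off int64
    for n >= 16 {
        offs = append(offs, off)
        off += 16
        n -= 16
    }
    if n != 0 {
        // Final chunk partially overlaps the previous one.
        offs = append(offs, off+n-16)
    }
    return offs
}

For n = 112 this yields offsets 0, 16, ..., 96; for n = 120 it yields the same seven offsets plus a final overlapping copy at offset 104.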
case ssa.OpAMD64LoweredMoveLoop:
    dstReg := v.Args[0].Reg()
    srcReg := v.Args[1].Reg()
    if dstReg == srcReg {
        break
    }
    countReg := v.RegTmp()
    tmpReg := int16(x86.REG_X14)
    n := v.AuxInt
    loopSize := int64(64)
    if n < 3*loopSize {
        // - a loop count of 0 won't work.
        // - a loop count of 1 is useless.
        // - a loop count of 2 is a code size ~tie
        //     4 instructions to implement the loop
        //     4 instructions in the loop body
        //   vs
        //     8 instructions in the straightline code
        //   Might as well use straightline code.
        v.Fatalf("ZeroLoop size too small %d", n)
    }
    // move 16 bytes from srcReg+off to dstReg+off.
    move16 := func(off int64) {
        move16(s, srcReg, dstReg, tmpReg, off)
    }

    // Put iteration count in a register.
    //   MOVL $n, countReg
    p := s.Prog(x86.AMOVL)
    p.From.Type = obj.TYPE_CONST
    p.From.Offset = n / loopSize
    p.To.Type = obj.TYPE_REG
    p.To.Reg = countReg
    cntInit := p

    // Copy loopSize bytes starting at srcReg to dstReg.
    for i := range loopSize / 16 {
        move16(i * 16)
    }
    // ADDQ $loopSize, srcReg
    p = s.Prog(x86.AADDQ)
    p.From.Type = obj.TYPE_CONST
    p.From.Offset = loopSize
    p.To.Type = obj.TYPE_REG
    p.To.Reg = srcReg
    // ADDQ $loopSize, dstReg
    p = s.Prog(x86.AADDQ)
    p.From.Type = obj.TYPE_CONST
    p.From.Offset = loopSize
    p.To.Type = obj.TYPE_REG
    p.To.Reg = dstReg
    // DECL countReg
    p = s.Prog(x86.ADECL)
    p.To.Type = obj.TYPE_REG
    p.To.Reg = countReg
    // Jump to loop header if we're not done yet.
    //   JNE head
    p = s.Prog(x86.AJNE)
    p.To.Type = obj.TYPE_BRANCH
    p.To.SetTarget(cntInit.Link)

    // Multiples of the loop size are now done.
    n %= loopSize

    // Copy any fractional portion.
    var off int64
    for n >= 16 {
        move16(off)
        off += 16
        n -= 16
    }
    if n != 0 {
        // Use partially-overlapping copy.
        move16(off + n - 16)
    }
    p.To.Offset = 14 * (64 - v.AuxInt/16)
    // 14 and 64 are magic constants. 14 is the number of bytes to encode:
    //   MOVUPS (SI), X0
    //   ADDQ   $16, SI
    //   MOVUPS X0, (DI)
    //   ADDQ   $16, DI
    // and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.

case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
    if v.Type.IsMemory() {
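The LoweredMoveLoop case above splits the copy into full 64-byte iterations, counted down in an integer register, followed by leftover 16-byte copies and at most one overlapping tail copy relative to the advanced pointers. A small sketch of that plan (illustrative only, not compiler code):

// moveLoopPlan mirrors the LoweredMoveLoop lowering: n/64 loop
// iterations, then 16-byte tail copies at the returned offsets
// (relative to the pointers after the loop has advanced them).
// Assumes n >= 3*64, matching the Fatalf check above.
func moveLoopPlan(n int64) (iterations int64, tailOffsets []int64) {
    const loopSize = 64
    iterations = n / loopSize // becomes "MOVL $iterations, countReg"
    n %= loopSize
    var off int64
    for n >= 16 {
        tailOffsets = append(tailOffsets, off)
        off += 16
        n -= 16
    }
    if n != 0 {
        tailOffsets = append(tailOffsets, off+n-16) // overlapping final copy
    }
    return iterations, tailOffsets
}

For n = 448 this gives 7 iterations and no tail; for n = 300 it gives 4 iterations and tail copies at offsets 0, 16 and 28.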
@@ -1709,3 +1760,21 @@ func zero16(s *ssagen.State, reg int16, off int64) {
    p.To.Reg = reg
    p.To.Offset = off
}

// move 16 bytes from src+off to dst+off using temporary register tmp.
func move16(s *ssagen.State, src, dst, tmp int16, off int64) {
    // MOVUPS off(srcReg), tmpReg
    // MOVUPS tmpReg, off(dstReg)
    p := s.Prog(x86.AMOVUPS)
    p.From.Type = obj.TYPE_MEM
    p.From.Reg = src
    p.From.Offset = off
    p.To.Type = obj.TYPE_REG
    p.To.Reg = tmp
    p = s.Prog(x86.AMOVUPS)
    p.From.Type = obj.TYPE_REG
    p.From.Reg = tmp
    p.To.Type = obj.TYPE_MEM
    p.To.Reg = dst
    p.To.Offset = off
}
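Each move16 call above expands to a MOVUPS load into X14 followed by a MOVUPS store. One way to see the overall result of the new lowering is to compile a small fixed-size copy with the assembler output flag; a sketch (exact registers and scheduling vary by Go version):

// file: copy192.go
package p

type buf struct{ b [192]byte }

// With this change, "go tool compile -S copy192.go" should show a
// counted loop of MOVUPS pairs for the 192-byte copy rather than a
// jump into runtime duffcopy.
func cp(d, s *buf) {
    *d = *s
}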
@@ -264,24 +264,6 @@
(Move [8] dst src mem) => (MOVQstore dst (MOVQload src mem) mem)
(Move [16] dst src mem) => (MOVOstore dst (MOVOload src mem) mem)

(Move [32] dst src mem) =>
    (Move [16]
        (OffPtr <dst.Type> dst [16])
        (OffPtr <src.Type> src [16])
        (Move [16] dst src mem))

(Move [48] dst src mem) =>
    (Move [32]
        (OffPtr <dst.Type> dst [16])
        (OffPtr <src.Type> src [16])
        (Move [16] dst src mem))

(Move [64] dst src mem) =>
    (Move [32]
        (OffPtr <dst.Type> dst [32])
        (OffPtr <src.Type> src [32])
        (Move [32] dst src mem))

(Move [3] dst src mem) =>
    (MOVBstore [2] dst (MOVBload [2] src mem)
        (MOVWstore dst (MOVWload src mem) mem))

@@ -310,28 +292,19 @@
    (MOVQstore [int32(s-8)] dst (MOVQload [int32(s-8)] src mem)
        (MOVQstore dst (MOVQload src mem) mem))

// Adjust moves to be a multiple of 16 bytes.
(Move [s] dst src mem)
    && s > 16 && s%16 != 0 && s%16 <= 8 =>
    (Move [s-s%16]
        (OffPtr <dst.Type> dst [s%16])
        (OffPtr <src.Type> src [s%16])
        (MOVQstore dst (MOVQload src mem) mem))
(Move [s] dst src mem)
    && s > 16 && s%16 != 0 && s%16 > 8 =>
    (Move [s-s%16]
        (OffPtr <dst.Type> dst [s%16])
        (OffPtr <src.Type> src [s%16])
        (MOVOstore dst (MOVOload src mem) mem))
// Copying up to 192 bytes uses straightline code.
(Move [s] dst src mem) && s > 16 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)

// Medium copying uses a duff device.
(Move [s] dst src mem)
    && s > 64 && s <= 16*64 && s%16 == 0
    && logLargeCopy(v, s) =>
    (DUFFCOPY [s] dst src mem)
// Copying up to ~1KB uses a small loop.
(Move [s] dst src mem) && s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)

// Large copying uses REP MOVSQ.
(Move [s] dst src mem) && s > 16*64 && s%8 == 0 && logLargeCopy(v, s) =>
(Move [s] dst src mem) && s > repMoveThreshold && s%8 != 0 =>
    (Move [s-s%8]
        (OffPtr <dst.Type> dst [s%8])
        (OffPtr <src.Type> src [s%8])
        (MOVQstore dst (MOVQload src mem) mem))
(Move [s] dst src mem) && s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s) =>
    (REPMOVSQ dst src (MOVQconst [s/8]) mem)

// Lowering Zero instructions
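Put together, the rules above define a simple size-based strategy for constant-size copies. A helper restating the thresholds (names are illustrative; the real decisions live in these rules and in the repMoveThreshold constant defined in the rewrite support code below):

// moveStrategy restates the dispatch encoded by the Move rules above.
func moveStrategy(s int64) string {
    const repMoveThreshold = 1408
    switch {
    case s <= 16:
        return "fully unrolled small-move rules"
    case s < 192:
        return "LoweredMove: straightline MOVUPS pairs"
    case s <= repMoveThreshold:
        return "LoweredMoveLoop: 64-byte loop plus tail"
    case s%8 != 0:
        return "peel s%8 bytes, then re-lower the multiple-of-8 remainder"
    default:
        return "REPMOVSQ: REP MOVSQ with count s/8"
    }
}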
@@ -939,20 +939,38 @@ func init() {
// arg0 = destination pointer
// arg1 = source pointer
// arg2 = mem
// auxint = # of bytes to copy, must be multiple of 16
// auxint = # of bytes to copy
// returns memory
{
    name: "DUFFCOPY",
    name: "LoweredMove",
    aux:       "Int64",
    argLength: 3,
    reg: regInfo{
        inputs:   []regMask{buildReg("DI"), buildReg("SI")},
        clobbers: buildReg("DI SI X0"), // uses X0 as a temporary
        inputs:   []regMask{gp, gp},
        clobbers: buildReg("X14"), // uses X14 as a temporary
    },
    clobberFlags: true,
    //faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
    //faultOnNilArg1: true,
    unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
    faultOnNilArg0: true,
    faultOnNilArg1: true,
},
// arg0 = destination pointer
// arg1 = source pointer
// arg2 = mem
// auxint = # of bytes to copy
// returns memory
{
    name:      "LoweredMoveLoop",
    aux:       "Int64",
    argLength: 3,
    reg: regInfo{
        inputs:       []regMask{gp, gp},
        clobbers:     buildReg("X14"), // uses X14 as a temporary
        clobbersArg0: true,
        clobbersArg1: true,
    },
    clobberFlags:   true,
    faultOnNilArg0: true,
    faultOnNilArg1: true,
    needIntTemp:    true,
},

// arg0 = destination pointer
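Because the generated loop advances both pointer registers, LoweredMoveLoop is declared with clobbersArg0/clobbersArg1: after the op runs, the registers that held dst and src no longer do. If the program still needs those pointers afterwards, the register allocator must keep copies, which is what the TestClobbersArg1 test added further down verifies. A hypothetical user-level shape of that situation:

// Both pointers stay live after the 256-byte copy (which is lowered to
// LoweredMoveLoop), so the allocator has to preserve copies of them
// across the op. Names here are purely illustrative.
func copyAndPublish(dst, src *[256]byte, pd, ps **[256]byte) {
    *dst = *src // LoweredMoveLoop clobbers the registers holding dst/src
    *ps = src   // src still needed afterwards
    *pd = dst   // dst still needed afterwards
}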
@@ -1058,7 +1058,8 @@ const (
    OpAMD64CALLtail
    OpAMD64CALLclosure
    OpAMD64CALLinter
    OpAMD64DUFFCOPY
    OpAMD64LoweredMove
    OpAMD64LoweredMoveLoop
    OpAMD64REPMOVSQ
    OpAMD64InvertFlags
    OpAMD64LoweredGetG

@@ -13965,17 +13966,35 @@ var opcodeTable = [...]opInfo{
        },
    },
    {
        name:         "DUFFCOPY",
        auxType:      auxInt64,
        argLen:       3,
        clobberFlags: true,
        unsafePoint:  true,
        name:           "LoweredMove",
        auxType:        auxInt64,
        argLen:         3,
        faultOnNilArg0: true,
        faultOnNilArg1: true,
        reg: regInfo{
            inputs: []inputInfo{
                {0, 128}, // DI
                {1, 64},  // SI
                {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
                {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
            },
            clobbers: 65728,      // SI DI X0
            clobbers: 1073741824, // X14
        },
    },
    {
        name:           "LoweredMoveLoop",
        auxType:        auxInt64,
        argLen:         3,
        clobberFlags:   true,
        needIntTemp:    true,
        faultOnNilArg0: true,
        faultOnNilArg1: true,
        reg: regInfo{
            inputs: []inputInfo{
                {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
                {1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
            },
            clobbers:     1073741824, // X14
            clobbersArg0: true,
            clobbersArg1: true,
        },
    },
    {
@@ -561,7 +561,14 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, pos
    pos = pos.WithNotStmt()
    // Check if v is already in a requested register.
    if mask&vi.regs != 0 {
        r := pickReg(mask & vi.regs)
        mask &= vi.regs
        r := pickReg(mask)
        if mask.contains(s.SPReg) {
            // Prefer the stack pointer if it is allowed.
            // (Needed because the op might have an Aux symbol
            // that needs SP as its base.)
            r = s.SPReg
        }
        if !s.allocatable.contains(r) {
            return v // v is in a fixed register
        }
@@ -240,6 +240,30 @@ func TestClobbersArg0(t *testing.T) {
    }
}

func TestClobbersArg1(t *testing.T) {
    c := testConfig(t)
    f := c.Fun("entry",
        Bloc("entry",
            Valu("mem", OpInitMem, types.TypeMem, 0, nil),
            Valu("src", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())),
            Valu("dst", OpArg, c.config.Types.Int64.PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo())),
            Valu("use1", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())),
            Valu("use2", OpArg, c.config.Types.Int64.PtrTo().PtrTo(), 0, c.Temp(c.config.Types.Int64.PtrTo().PtrTo())),
            Valu("move", OpAMD64LoweredMoveLoop, types.TypeMem, 256, nil, "dst", "src", "mem"),
            Valu("store1", OpAMD64MOVQstore, types.TypeMem, 0, nil, "use1", "src", "move"),
            Valu("store2", OpAMD64MOVQstore, types.TypeMem, 0, nil, "use2", "dst", "store1"),
            Exit("store2")))
    flagalloc(f.f)
    regalloc(f.f)
    checkFunc(f.f)
    // LoweredMoveLoop clobbers its arguments, so there must be a copy of "src" and "dst" somewhere
    // so we still have that value available at the stores.
    if n := numCopies(f.blocks["entry"]); n != 2 {
        fmt.Printf("%s\n", f.f.String())
        t.Errorf("got %d copies, want 2", n)
    }
}

func numSpills(b *Block) int {
    return numOps(b, OpStoreReg)
}
@@ -31,6 +31,7 @@ const (
    removeDeadValues = true

    repZeroThreshold = 1408 // size beyond which we use REP STOS for zeroing
    repMoveThreshold = 1408 // size beyond which we use REP MOVS for copying
)

// deadcode indicates whether rewrite should try to remove any values that become dead.
@@ -27307,75 +27307,6 @@ func rewriteValueAMD64_OpMove(v *Value) bool {
        v.AddArg3(dst, v0, mem)
        return true
    }
    // match: (Move [32] dst src mem)
    // result: (Move [16] (OffPtr <dst.Type> dst [16]) (OffPtr <src.Type> src [16]) (Move [16] dst src mem))
    for {
        if auxIntToInt64(v.AuxInt) != 32 {
            break
        }
        dst := v_0
        src := v_1
        mem := v_2
        v.reset(OpMove)
        v.AuxInt = int64ToAuxInt(16)
        v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
        v0.AuxInt = int64ToAuxInt(16)
        v0.AddArg(dst)
        v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
        v1.AuxInt = int64ToAuxInt(16)
        v1.AddArg(src)
        v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
        v2.AuxInt = int64ToAuxInt(16)
        v2.AddArg3(dst, src, mem)
        v.AddArg3(v0, v1, v2)
        return true
    }
    // match: (Move [48] dst src mem)
    // result: (Move [32] (OffPtr <dst.Type> dst [16]) (OffPtr <src.Type> src [16]) (Move [16] dst src mem))
    for {
        if auxIntToInt64(v.AuxInt) != 48 {
            break
        }
        dst := v_0
        src := v_1
        mem := v_2
        v.reset(OpMove)
        v.AuxInt = int64ToAuxInt(32)
        v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
        v0.AuxInt = int64ToAuxInt(16)
        v0.AddArg(dst)
        v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
        v1.AuxInt = int64ToAuxInt(16)
        v1.AddArg(src)
        v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
        v2.AuxInt = int64ToAuxInt(16)
        v2.AddArg3(dst, src, mem)
        v.AddArg3(v0, v1, v2)
        return true
    }
    // match: (Move [64] dst src mem)
    // result: (Move [32] (OffPtr <dst.Type> dst [32]) (OffPtr <src.Type> src [32]) (Move [32] dst src mem))
    for {
        if auxIntToInt64(v.AuxInt) != 64 {
            break
        }
        dst := v_0
        src := v_1
        mem := v_2
        v.reset(OpMove)
        v.AuxInt = int64ToAuxInt(32)
        v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
        v0.AuxInt = int64ToAuxInt(32)
        v0.AddArg(dst)
        v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
        v1.AuxInt = int64ToAuxInt(32)
        v1.AddArg(src)
        v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
        v2.AuxInt = int64ToAuxInt(32)
        v2.AddArg3(dst, src, mem)
        v.AddArg3(v0, v1, v2)
        return true
    }
    // match: (Move [3] dst src mem)
    // result: (MOVBstore [2] dst (MOVBload [2] src mem) (MOVWstore dst (MOVWload src mem) mem))
    for {

@@ -27568,23 +27499,55 @@ func rewriteValueAMD64_OpMove(v *Value) bool {
        return true
    }
    // match: (Move [s] dst src mem)
    // cond: s > 16 && s%16 != 0 && s%16 <= 8
    // result: (Move [s-s%16] (OffPtr <dst.Type> dst [s%16]) (OffPtr <src.Type> src [s%16]) (MOVQstore dst (MOVQload src mem) mem))
    // cond: s > 16 && s < 192 && logLargeCopy(v, s)
    // result: (LoweredMove [s] dst src mem)
    for {
        s := auxIntToInt64(v.AuxInt)
        dst := v_0
        src := v_1
        mem := v_2
        if !(s > 16 && s%16 != 0 && s%16 <= 8) {
        if !(s > 16 && s < 192 && logLargeCopy(v, s)) {
            break
        }
        v.reset(OpAMD64LoweredMove)
        v.AuxInt = int64ToAuxInt(s)
        v.AddArg3(dst, src, mem)
        return true
    }
    // match: (Move [s] dst src mem)
    // cond: s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s)
    // result: (LoweredMoveLoop [s] dst src mem)
    for {
        s := auxIntToInt64(v.AuxInt)
        dst := v_0
        src := v_1
        mem := v_2
        if !(s >= 192 && s <= repMoveThreshold && logLargeCopy(v, s)) {
            break
        }
        v.reset(OpAMD64LoweredMoveLoop)
        v.AuxInt = int64ToAuxInt(s)
        v.AddArg3(dst, src, mem)
        return true
    }
    // match: (Move [s] dst src mem)
    // cond: s > repMoveThreshold && s%8 != 0
    // result: (Move [s-s%8] (OffPtr <dst.Type> dst [s%8]) (OffPtr <src.Type> src [s%8]) (MOVQstore dst (MOVQload src mem) mem))
    for {
        s := auxIntToInt64(v.AuxInt)
        dst := v_0
        src := v_1
        mem := v_2
        if !(s > repMoveThreshold && s%8 != 0) {
            break
        }
        v.reset(OpMove)
        v.AuxInt = int64ToAuxInt(s - s%16)
        v.AuxInt = int64ToAuxInt(s - s%8)
        v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
        v0.AuxInt = int64ToAuxInt(s % 16)
        v0.AuxInt = int64ToAuxInt(s % 8)
        v0.AddArg(dst)
        v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
        v1.AuxInt = int64ToAuxInt(s % 16)
        v1.AuxInt = int64ToAuxInt(s % 8)
        v1.AddArg(src)
        v2 := b.NewValue0(v.Pos, OpAMD64MOVQstore, types.TypeMem)
        v3 := b.NewValue0(v.Pos, OpAMD64MOVQload, typ.UInt64)

@@ -27594,56 +27557,14 @@ func rewriteValueAMD64_OpMove(v *Value) bool {
        return true
    }
    // match: (Move [s] dst src mem)
    // cond: s > 16 && s%16 != 0 && s%16 > 8
    // result: (Move [s-s%16] (OffPtr <dst.Type> dst [s%16]) (OffPtr <src.Type> src [s%16]) (MOVOstore dst (MOVOload src mem) mem))
    for {
        s := auxIntToInt64(v.AuxInt)
        dst := v_0
        src := v_1
        mem := v_2
        if !(s > 16 && s%16 != 0 && s%16 > 8) {
            break
        }
        v.reset(OpMove)
        v.AuxInt = int64ToAuxInt(s - s%16)
        v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
        v0.AuxInt = int64ToAuxInt(s % 16)
        v0.AddArg(dst)
        v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
        v1.AuxInt = int64ToAuxInt(s % 16)
        v1.AddArg(src)
        v2 := b.NewValue0(v.Pos, OpAMD64MOVOstore, types.TypeMem)
        v3 := b.NewValue0(v.Pos, OpAMD64MOVOload, types.TypeInt128)
        v3.AddArg2(src, mem)
        v2.AddArg3(dst, v3, mem)
        v.AddArg3(v0, v1, v2)
        return true
    }
    // match: (Move [s] dst src mem)
    // cond: s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)
    // result: (DUFFCOPY [s] dst src mem)
    for {
        s := auxIntToInt64(v.AuxInt)
        dst := v_0
        src := v_1
        mem := v_2
        if !(s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)) {
            break
        }
        v.reset(OpAMD64DUFFCOPY)
        v.AuxInt = int64ToAuxInt(s)
        v.AddArg3(dst, src, mem)
        return true
    }
    // match: (Move [s] dst src mem)
    // cond: s > 16*64 && s%8 == 0 && logLargeCopy(v, s)
    // cond: s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s)
    // result: (REPMOVSQ dst src (MOVQconst [s/8]) mem)
    for {
        s := auxIntToInt64(v.AuxInt)
        dst := v_0
        src := v_1
        mem := v_2
        if !(s > 16*64 && s%8 == 0 && logLargeCopy(v, s)) {
        if !(s > repMoveThreshold && s%8 == 0 && logLargeCopy(v, s)) {
            break
        }
        v.reset(OpAMD64REPMOVSQ)