Mirror of https://github.com/golang/go.git (synced 2025-12-08 06:10:04 +00:00)
cmd/compile: use generated loops instead of DUFFCOPY on loong64
Change-Id: If9da2b5681e5d05d7c3d51f003f1fe662d3feaec
Reviewed-on: https://go-review.googlesource.com/c/go/+/699855
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Michael Pratt <mpratt@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Parent: c552ad913f
Commit: 91e76a513b
5 changed files with 188 additions and 117 deletions
cmd/compile/internal/loong64/ssa.go
@@ -659,42 +659,119 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.To.Sym = ir.Syms.Duffcopy
 		p.To.Offset = v.AuxInt
 	case ssa.OpLOONG64LoweredMove:
-		// MOVx	(Rarg1), Rtmp
-		// MOVx	Rtmp, (Rarg0)
-		// ADDV	$sz, Rarg1
-		// ADDV	$sz, Rarg0
-		// BGEU	Rarg2, Rarg0, -4(PC)
-		mov, sz := largestMove(v.AuxInt)
-		p := s.Prog(mov)
-		p.From.Type = obj.TYPE_MEM
-		p.From.Reg = v.Args[1].Reg()
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = loong64.REGTMP
-		p2 := s.Prog(mov)
-		p2.From.Type = obj.TYPE_REG
-		p2.From.Reg = loong64.REGTMP
-		p2.To.Type = obj.TYPE_MEM
-		p2.To.Reg = v.Args[0].Reg()
-		p3 := s.Prog(loong64.AADDVU)
-		p3.From.Type = obj.TYPE_CONST
-		p3.From.Offset = sz
-		p3.To.Type = obj.TYPE_REG
-		p3.To.Reg = v.Args[1].Reg()
-		p4 := s.Prog(loong64.AADDVU)
-		p4.From.Type = obj.TYPE_CONST
-		p4.From.Offset = sz
-		p4.To.Type = obj.TYPE_REG
-		p4.To.Reg = v.Args[0].Reg()
-		p5 := s.Prog(loong64.ABGEU)
-		p5.From.Type = obj.TYPE_REG
-		p5.From.Reg = v.Args[2].Reg()
-		p5.Reg = v.Args[1].Reg()
-		p5.To.Type = obj.TYPE_BRANCH
-		p5.To.SetTarget(p)
+		dstReg := v.Args[0].Reg()
+		srcReg := v.Args[1].Reg()
+		if dstReg == srcReg {
+			break
+		}
+		tmpReg := int16(loong64.REG_R20)
+		n := v.AuxInt
+		if n < 16 {
+			v.Fatalf("Move too small %d", n)
+		}
+
+		var off int64
+		for n >= 8 {
+			// MOVV off(srcReg), tmpReg
+			// MOVV tmpReg, off(dstReg)
+			move8(s, srcReg, dstReg, tmpReg, off)
+			off += 8
+			n -= 8
+		}
+
+		if n != 0 {
+			// MOVV off+n-8(srcReg), tmpReg
+			// MOVV tmpReg, off+n-8(srcReg)
+			move8(s, srcReg, dstReg, tmpReg, off+n-8)
+		}
+	case ssa.OpLOONG64LoweredMoveLoop:
+		dstReg := v.Args[0].Reg()
+		srcReg := v.Args[1].Reg()
+		if dstReg == srcReg {
+			break
+		}
+		countReg := int16(loong64.REG_R20)
+		tmpReg := int16(loong64.REG_R21)
+		var off int64
+		n := v.AuxInt
+		loopSize := int64(64)
+		if n < 3*loopSize {
+			// - a loop count of 0 won't work.
+			// - a loop count of 1 is useless.
+			// - a loop count of 2 is a code size ~tie
+			//     4 instructions to implement the loop
+			//     8 instructions in the loop body
+			//   vs
+			//     16 instructions in the straightline code
+			//   Might as well use straightline code.
+			v.Fatalf("ZeroLoop size too small %d", n)
+		}
+
+		// Put iteration count in a register.
+		//   MOVV $n/loopSize, countReg
+		p := s.Prog(loong64.AMOVV)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = n / loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+		cntInit := p
+
+		// Move loopSize bytes starting at srcReg to dstReg.
+		for range loopSize / 8 {
+			// MOVV off(srcReg), tmpReg
+			// MOVV tmpReg, off(dstReg)
+			move8(s, srcReg, dstReg, tmpReg, off)
+			off += 8
+		}
+
+		// Increment srcReg and destReg by loopSize.
+		//   ADDV $loopSize, srcReg
+		p = s.Prog(loong64.AADDV)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = srcReg
+		//   ADDV $loopSize, dstReg
+		p = s.Prog(loong64.AADDV)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = loopSize
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = dstReg
+
+		// Decrement loop count.
+		//   SUBV $1, countReg
+		p = s.Prog(loong64.ASUBV)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 1
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = countReg
+
+		// Jump to loop header if we're not done yet.
+		//   BNE countReg, loop header
+		p = s.Prog(loong64.ABNE)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = countReg
+		p.To.Type = obj.TYPE_BRANCH
+		p.To.SetTarget(cntInit.Link)
+
+		// Multiples of the loop size are now done.
+		n %= loopSize
+
+		off = 0
+		// Copy any fractional portion.
+		for n >= 8 {
+			// MOVV off(srcReg), tmpReg
+			// MOVV tmpReg, off(dstReg)
+			move8(s, srcReg, dstReg, tmpReg, off)
+			off += 8
+			n -= 8
+		}
+
+		if n != 0 {
+			// MOVV off+n-8(srcReg), tmpReg
+			// MOVV tmpReg, off+n-8(srcReg)
+			move8(s, srcReg, dstReg, tmpReg, off+n-8)
+		}
 
 	case ssa.OpLOONG64CALLstatic, ssa.OpLOONG64CALLclosure, ssa.OpLOONG64CALLinter:
 		s.Call(v)
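Both new cases are built out of the move8 helper added later in this file, one 8-byte load/store pair per call. As a rough illustration of what the LoweredMoveLoop case produces, the following standalone Go program (not part of the CL; "src" and "dst" stand for the registers holding arg1/arg0, with R20 = countReg and R21 = tmpReg as in the code above) walks the same arithmetic for a 200-byte copy and prints the pseudo-instruction schedule:

	package main

	import "fmt"

	func main() {
		n := int64(200)       // AuxInt: total bytes to copy (>= 192, so the loop path applies)
		loopSize := int64(64) // bytes moved per loop iteration, as in the case above

		fmt.Printf("MOVV $%d, R20\n", n/loopSize) // loop count = 3
		fmt.Println("loop:")
		for off := int64(0); off < loopSize; off += 8 {
			fmt.Printf("  MOVV %d(src), R21; MOVV R21, %d(dst)\n", off, off)
		}
		fmt.Println("  ADDV $64, src")
		fmt.Println("  ADDV $64, dst")
		fmt.Println("  SUBV $1, R20")
		fmt.Println("  BNE  R20, loop")

		n %= loopSize // 8 bytes of tail remain after the loop
		off := int64(0)
		for n >= 8 {
			fmt.Printf("MOVV %d(src), R21; MOVV R21, %d(dst)\n", off, off)
			off += 8
			n -= 8
		}
		if n != 0 {
			// a non-multiple-of-8 tail would end with one overlapping 8-byte copy
			fmt.Printf("MOVV %d(src), R21; MOVV R21, %d(dst)\n", off+n-8, off+n-8)
		}
	}

For s = 200 this yields a 3-iteration loop covering 192 bytes followed by a single 8-byte tail copy.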
cmd/compile/internal/loong64/ssa.go
@@ -1225,6 +1302,24 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
 	return p
 }
 
+// move8 copies 8 bytes at src+off to dst+off.
+func move8(s *ssagen.State, src, dst, tmp int16, off int64) {
+	// MOVV	off(src), tmp
+	ld := s.Prog(loong64.AMOVV)
+	ld.From.Type = obj.TYPE_MEM
+	ld.From.Reg = src
+	ld.From.Offset = off
+	ld.To.Type = obj.TYPE_REG
+	ld.To.Reg = tmp
+	// MOVV	tmp, off(dst)
+	st := s.Prog(loong64.AMOVV)
+	st.From.Type = obj.TYPE_REG
+	st.From.Reg = tmp
+	st.To.Type = obj.TYPE_MEM
+	st.To.Reg = dst
+	st.To.Offset = off
+}
+
 // zero8 zeroes 8 bytes at reg+off.
 func zero8(s *ssagen.State, reg int16, off int64) {
 	// MOVV	ZR, off(reg)
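move8 always transfers a full 8 bytes, so neither lowering issues sub-word loads or stores: a size that is not a multiple of 8 simply ends with one extra 8-byte copy that overlaps the previous one (the off+n-8 calls above). A small sketch, assuming a 20-byte LoweredMove, of the offsets that result:

	package main

	import "fmt"

	func main() {
		// Offsets chosen by the unrolled LoweredMove path for a 20-byte copy.
		n, off := int64(20), int64(0)
		for n >= 8 {
			fmt.Println("move8 at offset", off) // 0, then 8
			off += 8
			n -= 8
		}
		if n != 0 {
			// 4 bytes remain: one more 8-byte copy at offset 12 (= off+n-8),
			// re-copying bytes 12..15 and finishing bytes 16..19.
			fmt.Println("move8 at offset", off+n-8)
		}
	}

Bytes 12..15 are written twice, which is harmless for non-overlapping src/dst, and it is what lets the rules change that follows drop the old s%8 != 0 splitting rule.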
cmd/compile/internal/ssa/_gen/LOONG64.rules
@@ -419,34 +419,8 @@
 	(MOVVstore [8] dst (MOVVload [8] src mem)
 		(MOVVstore dst (MOVVload src mem) mem))
 
-// strip off fractional word move
-(Move [s] dst src mem) && s%8 != 0 && s > 16 =>
-	(Move [s%8]
-		(OffPtr <dst.Type> dst [s-s%8])
-		(OffPtr <src.Type> src [s-s%8])
-		(Move [s-s%8] dst src mem))
-
-// medium move uses a duff device
-(Move [s] dst src mem)
-	&& s%8 == 0 && s > 16 && s <= 8*128
-	&& logLargeCopy(v, s) =>
-	(DUFFCOPY [16 * (128 - s/8)] dst src mem)
-// 16 and 128 are magic constants. 16 is the number of bytes to encode:
-//	MOVV	(R20), R30
-//	ADDV	$8, R20
-//	MOVV	R30, (R21)
-//	ADDV	$8, R21
-// and 128 is the number of such blocks. See runtime/duff_loong64.s:duffcopy.
-
-// large move uses a loop
-(Move [s] dst src mem)
-	&& s%8 == 0 && s > 1024 && logLargeCopy(v, s) =>
-	(LoweredMove
-		dst
-		src
-		(ADDVconst <src.Type> src [s-8])
-		mem)
+(Move [s] dst src mem) && s > 16 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
+(Move [s] dst src mem) && s >= 192 && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)
 
 // float <=> int register moves, with no conversion.
 // These come up when compiling math.{Float64bits, Float64frombits, Float32bits, Float32frombits}.
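With alignment no longer a factor, the replacement rules select purely on size (logLargeCopy only logs large copies). A sketch of the resulting selection, restating the conditions above in plain Go (the moveKind function is illustrative, not compiler code):

	package main

	import "fmt"

	// moveKind restates the conditions of the two rules above.
	func moveKind(s int64) string {
		switch {
		case s <= 16:
			return "small: expanded by the earlier Move rules into direct loads/stores"
		case s < 192:
			return "LoweredMove: fully unrolled 8-byte copies"
		default: // 192 == 3*64, the smallest size worth a 64-byte loop
			return "LoweredMoveLoop: 64-byte loop plus unrolled tail"
		}
	}

	func main() {
		for _, s := range []int64{16, 24, 100, 191, 192, 1024} {
			fmt.Println(s, "bytes:", moveKind(s))
		}
	}

The 192-byte boundary matches the n < 3*loopSize check in the ssaGenValue case above: below three loop iterations, straight-line code is at worst a code-size tie.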
cmd/compile/internal/ssa/_gen/LOONG64Ops.go
@@ -429,27 +429,40 @@ func init() {
 			needIntTemp: true,
 		},
 
-		// large or unaligned move
-		// arg0 = address of dst memory (in R21, changed as side effect)
-		// arg1 = address of src memory (in R20, changed as side effect)
-		// arg2 = address of the last element of src
-		// arg3 = mem
-		// auxint = alignment
+		// medium copying
+		// arg0 = address of dst memory
+		// arg1 = address of src memory
+		// arg2 = mem
+		// auxint = number of bytes to copy
 		// returns mem
-		//	MOVx	(R20), Rtmp
-		//	MOVx	Rtmp, (R21)
-		//	ADDV	$sz, R20
-		//	ADDV	$sz, R21
-		//	BGEU	Rarg2, R20, -4(PC)
 		{
 			name:      "LoweredMove",
 			aux:       "Int64",
-			argLength: 4,
+			argLength: 3,
 			reg: regInfo{
-				inputs:   []regMask{buildReg("R21"), buildReg("R20"), gp},
-				clobbers: buildReg("R20 R21"),
+				inputs:   []regMask{gp &^ buildReg("R20"), gp &^ buildReg("R20")},
+				clobbers: buildReg("R20"),
+			},
+			faultOnNilArg0: true,
+			faultOnNilArg1: true,
+		},
+
+		// large copying
+		// arg0 = address of dst memory
+		// arg1 = address of src memory
+		// arg2 = mem
+		// auxint = number of bytes to copy
+		// returns mem
+		{
+			name:      "LoweredMoveLoop",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:       []regMask{gp &^ buildReg("R20 R21"), gp &^ buildReg("R20 R21")},
+				clobbers:     buildReg("R20 R21"),
+				clobbersArg0: true,
+				clobbersArg1: true,
 			},
-			typ:            "Mem",
 			faultOnNilArg0: true,
 			faultOnNilArg1: true,
 		},
cmd/compile/internal/ssa/opGen.go
@@ -1929,6 +1929,7 @@ const (
 	OpLOONG64DUFFCOPY
 	OpLOONG64LoweredZeroLoop
 	OpLOONG64LoweredMove
+	OpLOONG64LoweredMoveLoop
 	OpLOONG64LoweredAtomicLoad8
 	OpLOONG64LoweredAtomicLoad32
 	OpLOONG64LoweredAtomicLoad64
@@ -25986,16 +25987,31 @@ var opcodeTable = [...]opInfo{
 	{
 		name:           "LoweredMove",
 		auxType:        auxInt64,
-		argLen:         4,
+		argLen:         3,
 		faultOnNilArg0: true,
 		faultOnNilArg1: true,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 1048576},    // R21
-				{1, 524288},     // R20
-				{2, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
+				{0, 1071120376}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R21 R23 R24 R25 R26 R27 R28 R29 R31
+				{1, 1071120376}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R21 R23 R24 R25 R26 R27 R28 R29 R31
+			},
+			clobbers: 524288, // R20
+		},
+	},
+	{
+		name:           "LoweredMoveLoop",
+		auxType:        auxInt64,
+		argLen:         3,
+		faultOnNilArg0: true,
+		faultOnNilArg1: true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1070071800}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R23 R24 R25 R26 R27 R28 R29 R31
+				{1, 1070071800}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R23 R24 R25 R26 R27 R28 R29 R31
 			},
 			clobbers: 1572864, // R20 R21
+			clobbersArg0: true,
+			clobbersArg1: true,
 		},
 	},
 	{
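The mask values in this generated table are plain bitmasks over the general registers, and the comments beside them give the decoding. A quick arithmetic check of the values that change here, using the R20/R21 masks that already appear in the table (524288 and 1048576):

	package main

	import "fmt"

	func main() {
		// R20 and R21 masks as given by the table's own comments.
		const r20 = 524288  // 1 << 19
		const r21 = 1048576 // 1 << 20

		fmt.Println(r20|r21 == 1572864)            // true: the "R20 R21" clobber mask
		fmt.Println(1071644664&^r20 == 1071120376) // true: old arg2 mask minus R20 = LoweredMove's new input mask
		fmt.Println(1071120376&^r21 == 1070071800) // true: minus R21 as well = LoweredMoveLoop's input mask
	}

So LoweredMove only reserves R20 (its tmpReg), while LoweredMoveLoop reserves both R20 (countReg) and R21 (tmpReg) and additionally marks, via clobbersArg0/clobbersArg1, that it modifies the registers holding dst and src, consistent with the ADDV $loopSize updates in the loop body.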
cmd/compile/internal/ssa/rewriteLOONG64.go
@@ -9133,62 +9133,35 @@ func rewriteValueLOONG64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s%8 != 0 && s > 16
-	// result: (Move [s%8] (OffPtr <dst.Type> dst [s-s%8]) (OffPtr <src.Type> src [s-s%8]) (Move [s-s%8] dst src mem))
+	// cond: s > 16 && s < 192 && logLargeCopy(v, s)
+	// result: (LoweredMove [s] dst src mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s%8 != 0 && s > 16) {
+		if !(s > 16 && s < 192 && logLargeCopy(v, s)) {
 			break
 		}
-		v.reset(OpMove)
-		v.AuxInt = int64ToAuxInt(s % 8)
-		v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
-		v0.AuxInt = int64ToAuxInt(s - s%8)
-		v0.AddArg(dst)
-		v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
-		v1.AuxInt = int64ToAuxInt(s - s%8)
-		v1.AddArg(src)
-		v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
-		v2.AuxInt = int64ToAuxInt(s - s%8)
-		v2.AddArg3(dst, src, mem)
-		v.AddArg3(v0, v1, v2)
-		return true
-	}
-	// match: (Move [s] dst src mem)
-	// cond: s%8 == 0 && s > 16 && s <= 8*128 && logLargeCopy(v, s)
-	// result: (DUFFCOPY [16 * (128 - s/8)] dst src mem)
-	for {
-		s := auxIntToInt64(v.AuxInt)
-		dst := v_0
-		src := v_1
-		mem := v_2
-		if !(s%8 == 0 && s > 16 && s <= 8*128 && logLargeCopy(v, s)) {
-			break
-		}
-		v.reset(OpLOONG64DUFFCOPY)
-		v.AuxInt = int64ToAuxInt(16 * (128 - s/8))
+		v.reset(OpLOONG64LoweredMove)
+		v.AuxInt = int64ToAuxInt(s)
 		v.AddArg3(dst, src, mem)
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s%8 == 0 && s > 1024 && logLargeCopy(v, s)
-	// result: (LoweredMove dst src (ADDVconst <src.Type> src [s-8]) mem)
+	// cond: s >= 192 && logLargeCopy(v, s)
+	// result: (LoweredMoveLoop [s] dst src mem)
 	for {
 		s := auxIntToInt64(v.AuxInt)
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s%8 == 0 && s > 1024 && logLargeCopy(v, s)) {
+		if !(s >= 192 && logLargeCopy(v, s)) {
 			break
 		}
-		v.reset(OpLOONG64LoweredMove)
-		v0 := b.NewValue0(v.Pos, OpLOONG64ADDVconst, src.Type)
-		v0.AuxInt = int64ToAuxInt(s - 8)
-		v0.AddArg(src)
-		v.AddArg4(dst, src, v0, mem)
+		v.reset(OpLOONG64LoweredMoveLoop)
+		v.AuxInt = int64ToAuxInt(s)
+		v.AddArg3(dst, src, mem)
 		return true
 	}
 	return false