cmd/compile: use generated loops instead of DUFFZERO on loong64

Change-Id: Id43ee4353d4bac96627f8b0f54545cdd3d2a1d1b
Reviewed-on: https://go-review.googlesource.com/c/go/+/699695
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
This commit is contained in:
limeidan 2025-08-28 19:22:51 +08:00 committed by abner chenc
parent 882335e2cb
commit 7bba745820
5 changed files with 154 additions and 90 deletions

View file

@ -560,28 +560,97 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p.To.Sym = ir.Syms.Duffzero
p.To.Offset = v.AuxInt
case ssa.OpLOONG64LoweredZero:
// MOVx R0, (Rarg0)
// ADDV $sz, Rarg0
// BGEU Rarg1, Rarg0, -2(PC)
mov, sz := largestMove(v.AuxInt)
p := s.Prog(mov)
ptrReg := v.Args[0].Reg()
n := v.AuxInt
if n < 16 {
v.Fatalf("Zero too small %d", n)
}
// Generate Zeroing instructions.
var off int64
for n >= 8 {
// MOVV ZR, off(ptrReg)
zero8(s, ptrReg, off)
off += 8
n -= 8
}
if n != 0 {
// MOVV ZR, off+n-8(ptrReg)
zero8(s, ptrReg, off+n-8)
}
case ssa.OpLOONG64LoweredZeroLoop:
ptrReg := v.Args[0].Reg()
countReg := v.RegTmp()
var off int64
n := v.AuxInt
loopSize := int64(64)
if n < 3*loopSize {
// - a loop count of 0 won't work.
// - a loop count of 1 is useless.
// - a loop count of 2 is a code size ~tie
// 4 instructions to implement the loop
// 8 instructions in the loop body
// vs
// 16 instructions in the straightline code
// Might as well use straightline code.
v.Fatalf("ZeroLoop size tool small %d", n)
}
// Put iteration count in a register.
// MOVV $n/loopSize, countReg
p := s.Prog(loong64.AMOVV)
p.From.Type = obj.TYPE_CONST
p.From.Offset = n / loopSize
p.To.Type = obj.TYPE_REG
p.To.Reg = countReg
cntInit := p
// Zero loopSize bytes starting at ptrReg.
for range loopSize / 8 {
// MOVV ZR, off(ptrReg)
zero8(s, ptrReg, off)
off += 8
}
// Increment ptrReg by loopSize.
// ADDV $loopSize, ptrReg
p = s.Prog(loong64.AADDV)
p.From.Type = obj.TYPE_CONST
p.From.Offset = loopSize
p.To.Type = obj.TYPE_REG
p.To.Reg = ptrReg
// Decrement loop count.
// SUBV $1, countReg
p = s.Prog(loong64.ASUBV)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 1
p.To.Type = obj.TYPE_REG
p.To.Reg = countReg
// Jump to loop header if we're not done yet.
// BNE countReg, loop header
p = s.Prog(loong64.ABNE)
p.From.Type = obj.TYPE_REG
p.From.Reg = loong64.REGZERO
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.From.Reg = countReg
p.To.Type = obj.TYPE_BRANCH
p.To.SetTarget(cntInit.Link)
p2 := s.Prog(loong64.AADDVU)
p2.From.Type = obj.TYPE_CONST
p2.From.Offset = sz
p2.To.Type = obj.TYPE_REG
p2.To.Reg = v.Args[0].Reg()
// Multiples of the loop size are now done.
n %= loopSize
p3 := s.Prog(loong64.ABGEU)
p3.From.Type = obj.TYPE_REG
p3.From.Reg = v.Args[1].Reg()
p3.Reg = v.Args[0].Reg()
p3.To.Type = obj.TYPE_BRANCH
p3.To.SetTarget(p)
off = 0
// Write any fractional portion.
for n >= 8 {
// MOVV ZR, off(ptrReg)
zero8(s, ptrReg, off)
off += 8
n -= 8
}
if n != 0 {
zero8(s, ptrReg, off+n-8)
}
case ssa.OpLOONG64DUFFCOPY:
p := s.Prog(obj.ADUFFCOPY)
@ -1155,3 +1224,14 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
p.Pos = p.Pos.WithNotStmt()
return p
}
// zero8 emits one instruction that clears the 8 bytes located at reg+off.
//
// s is the code-generation state the new instruction is appended to,
// reg is the register holding the base address, and off is the byte
// offset from that base at which the 8-byte store lands.
func zero8(s *ssagen.State, reg int16, off int64) {
	// Emitted instruction: MOVV ZR, off(reg)
	store := s.Prog(loong64.AMOVV)

	// Destination operand: the memory word at off(reg).
	store.To.Type = obj.TYPE_MEM
	store.To.Reg = reg
	store.To.Offset = off

	// Source operand: the zero register.
	store.From.Type = obj.TYPE_REG
	store.From.Reg = loong64.REGZERO
}

View file

@ -373,24 +373,8 @@
(MOVVstore [8] ptr (MOVVconst [0])
(MOVVstore ptr (MOVVconst [0]) mem))
// strip off fractional word zeroing
(Zero [s] ptr mem) && s%8 != 0 && s > 16 =>
(Zero [s%8]
(OffPtr <ptr.Type> ptr [s-s%8])
(Zero [s-s%8] ptr mem))
// medium zeroing uses a duff device
(Zero [s] ptr mem)
&& s%8 == 0 && s > 16 && s <= 8*128 =>
(DUFFZERO [8 * (128 - s/8)] ptr mem)
// large zeroing uses a loop
(Zero [s] ptr mem)
&& s%8 == 0 && s > 8*128 =>
(LoweredZero
ptr
(ADDVconst <ptr.Type> ptr [s-8])
mem)
(Zero [s] ptr mem) && s > 16 && s < 192 => (LoweredZero [s] ptr mem)
(Zero [s] ptr mem) && s >= 192 => (LoweredZeroLoop [s] ptr mem)
// moves
(Move [0] _ _ mem) => mem

View file

@ -376,6 +376,21 @@ func init() {
faultOnNilArg0: true,
},
// medium zeroing
// arg0 = address of memory to zero
// arg1 = mem
// auxint = number of bytes to zero
// returns mem
{
name: "LoweredZero",
aux: "Int64",
argLength: 2,
reg: regInfo{
inputs: []regMask{gp},
},
faultOnNilArg0: true,
},
// duffcopy
// arg0 = address of dst memory (in R21, changed as side effect)
// arg1 = address of src memory (in R20, changed as side effect)
@ -395,25 +410,21 @@ func init() {
faultOnNilArg1: true,
},
// large or unaligned zeroing
// arg0 = address of memory to zero (in R20, changed as side effect)
// arg1 = address of the last element to zero
// arg2 = mem
// auxint = alignment
// large zeroing
// arg0 = address of memory to zero
// arg1 = mem
// auxint = number of bytes to zero
// returns mem
// MOVx R0, (R20)
// ADDV $sz, R20
// BGEU Rarg1, R20, -2(PC)
{
name: "LoweredZero",
name: "LoweredZeroLoop",
aux: "Int64",
argLength: 3,
argLength: 2,
reg: regInfo{
inputs: []regMask{buildReg("R20"), gp},
clobbers: buildReg("R20"),
inputs: []regMask{gp},
clobbersArg0: true,
},
typ: "Mem",
faultOnNilArg0: true,
needIntTemp: true,
},
// large or unaligned move

View file

@ -1923,8 +1923,9 @@ const (
OpLOONG64CALLclosure
OpLOONG64CALLinter
OpLOONG64DUFFZERO
OpLOONG64DUFFCOPY
OpLOONG64LoweredZero
OpLOONG64DUFFCOPY
OpLOONG64LoweredZeroLoop
OpLOONG64LoweredMove
OpLOONG64LoweredAtomicLoad8
OpLOONG64LoweredAtomicLoad32
@ -25912,6 +25913,17 @@ var opcodeTable = [...]opInfo{
clobbers: 524290, // R1 R20
},
},
{
name: "LoweredZero",
auxType: auxInt64,
argLen: 2,
faultOnNilArg0: true,
reg: regInfo{
inputs: []inputInfo{
{0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
},
},
},
{
name: "DUFFCOPY",
auxType: auxInt64,
@ -25927,16 +25939,16 @@ var opcodeTable = [...]opInfo{
},
},
{
name: "LoweredZero",
name: "LoweredZeroLoop",
auxType: auxInt64,
argLen: 3,
argLen: 2,
needIntTemp: true,
faultOnNilArg0: true,
reg: regInfo{
inputs: []inputInfo{
{0, 524288}, // R20
{1, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
{0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
},
clobbers: 524288, // R20
clobbersArg0: true,
},
},
{

View file

@ -11497,56 +11497,33 @@ func rewriteValueLOONG64_OpZero(v *Value) bool {
return true
}
// match: (Zero [s] ptr mem)
// cond: s%8 != 0 && s > 16
// result: (Zero [s%8] (OffPtr <ptr.Type> ptr [s-s%8]) (Zero [s-s%8] ptr mem))
// cond: s > 16 && s < 192
// result: (LoweredZero [s] ptr mem)
for {
s := auxIntToInt64(v.AuxInt)
ptr := v_0
mem := v_1
if !(s%8 != 0 && s > 16) {
if !(s > 16 && s < 192) {
break
}
v.reset(OpZero)
v.AuxInt = int64ToAuxInt(s % 8)
v0 := b.NewValue0(v.Pos, OpOffPtr, ptr.Type)
v0.AuxInt = int64ToAuxInt(s - s%8)
v0.AddArg(ptr)
v1 := b.NewValue0(v.Pos, OpZero, types.TypeMem)
v1.AuxInt = int64ToAuxInt(s - s%8)
v1.AddArg2(ptr, mem)
v.AddArg2(v0, v1)
return true
}
// match: (Zero [s] ptr mem)
// cond: s%8 == 0 && s > 16 && s <= 8*128
// result: (DUFFZERO [8 * (128 - s/8)] ptr mem)
for {
s := auxIntToInt64(v.AuxInt)
ptr := v_0
mem := v_1
if !(s%8 == 0 && s > 16 && s <= 8*128) {
break
}
v.reset(OpLOONG64DUFFZERO)
v.AuxInt = int64ToAuxInt(8 * (128 - s/8))
v.reset(OpLOONG64LoweredZero)
v.AuxInt = int64ToAuxInt(s)
v.AddArg2(ptr, mem)
return true
}
// match: (Zero [s] ptr mem)
// cond: s%8 == 0 && s > 8*128
// result: (LoweredZero ptr (ADDVconst <ptr.Type> ptr [s-8]) mem)
// cond: s >= 192
// result: (LoweredZeroLoop [s] ptr mem)
for {
s := auxIntToInt64(v.AuxInt)
ptr := v_0
mem := v_1
if !(s%8 == 0 && s > 8*128) {
if !(s >= 192) {
break
}
v.reset(OpLOONG64LoweredZero)
v0 := b.NewValue0(v.Pos, OpLOONG64ADDVconst, ptr.Type)
v0.AuxInt = int64ToAuxInt(s - 8)
v0.AddArg(ptr)
v.AddArg3(ptr, v0, mem)
v.reset(OpLOONG64LoweredZeroLoop)
v.AuxInt = int64ToAuxInt(s)
v.AddArg2(ptr, mem)
return true
}
return false