cmd/compile: Implement LoweredZeroLoop with LSX instructions on loong64

goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3A6000 @ 2500.00MHz
                   |   old.txt    |               new.txt               |
                   |    sec/op    |   sec/op     vs base                |
ClearFat256           6.406n ± 0%   3.329n ± 1%  -48.03% (p=0.000 n=10)
ClearFat512          12.810n ± 0%   7.607n ± 0%  -40.62% (p=0.000 n=10)
ClearFat1024          25.62n ± 0%   14.01n ± 0%  -45.32% (p=0.000 n=10)
ClearFat1032          26.02n ± 0%   14.28n ± 0%  -45.14% (p=0.000 n=10)
ClearFat1040          26.02n ± 0%   14.41n ± 0%  -44.62% (p=0.000 n=10)
MemclrKnownSize192    4.804n ± 0%   2.827n ± 0%  -41.15% (p=0.000 n=10)
MemclrKnownSize248    6.561n ± 0%   4.371n ± 0%  -33.38% (p=0.000 n=10)
MemclrKnownSize256    6.406n ± 0%   3.335n ± 0%  -47.94% (p=0.000 n=10)
geomean               11.41n        6.453n       -43.45%

goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3C5000 @ 2200.00MHz
                   |   old.txt    |               new.txt               |
                   |    sec/op    |   sec/op     vs base                |
ClearFat256          14.570n ± 0%   7.284n ± 0%  -50.01% (p=0.000 n=10)
ClearFat512           29.13n ± 0%   14.57n ± 0%  -49.98% (p=0.000 n=10)
ClearFat1024          58.26n ± 0%   29.15n ± 0%  -49.97% (p=0.000 n=10)
ClearFat1032          58.73n ± 0%   29.15n ± 0%  -50.36% (p=0.000 n=10)
ClearFat1040          59.18n ± 0%   29.26n ± 0%  -50.56% (p=0.000 n=10)
MemclrKnownSize192   10.930n ± 0%   5.466n ± 0%  -49.99% (p=0.000 n=10)
MemclrKnownSize248   14.110n ± 0%   6.772n ± 0%  -52.01% (p=0.000 n=10)
MemclrKnownSize256   14.570n ± 0%   7.285n ± 0%  -50.00% (p=0.000 n=10)
geomean               25.75n        12.78n       -50.36%

Change-Id: I88d7b6ae2f6fc3f095979f24fb83ff42a9d2d42e
Reviewed-on: https://go-review.googlesource.com/c/go/+/720940
Reviewed-by: Meidan Li <limeidan@loongson.cn>
Reviewed-by: Mark Freeman <markfreeman@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
This commit is contained in:
Guoqi Chen 2025-11-17 11:33:04 +08:00 committed by abner chenc
parent 7f2ae21fb4
commit c4bb9653ba
3 changed files with 115 additions and 40 deletions

View file

@ -575,6 +575,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
case ssa.OpLOONG64LoweredZeroLoop: case ssa.OpLOONG64LoweredZeroLoop:
ptrReg := v.Args[0].Reg() ptrReg := v.Args[0].Reg()
countReg := v.RegTmp() countReg := v.RegTmp()
flagReg := int16(loong64.REGTMP)
var off int64 var off int64
n := v.AuxInt n := v.AuxInt
loopSize := int64(64) loopSize := int64(64)
@ -587,58 +588,119 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
// vs // vs
// 16 instructions in the straightline code // 16 instructions in the straightline code
// Might as well use straightline code. // Might as well use straightline code.
v.Fatalf("ZeroLoop size tool small %d", n) v.Fatalf("ZeroLoop size too small %d", n)
} }
// Put iteration count in a register. // MOVV $n/loopSize, countReg
// MOVV $n/loopSize, countReg // MOVBU ir.Syms.Loong64HasLSX, flagReg
p := s.Prog(loong64.AMOVV) // BNE flagReg, lsxInit
p.From.Type = obj.TYPE_CONST // genericInit:
p.From.Offset = n / loopSize // for off = 0; off < loopSize; off += 8 {
p.To.Type = obj.TYPE_REG // zero8(s, ptrReg, off)
p.To.Reg = countReg // }
cntInit := p // ADDV $loopSize, ptrReg
// SUBV $1, countReg
// BNE countReg, genericInit
// JMP tail
// lsxInit:
// VXORV V31, V31, V31 // V31 = 0
// for off = 0; off < loopSize; off += 16 {
// zero16(s, V31, ptrReg, off)
// }
// ADDV $loopSize, ptrReg
// SUBV $1, countReg
// BNE countReg, lsxInit
// tail:
// n %= loopSize
// for off = 0; n >= 8; off += 8, n -= 8 {
// zero8(s, ptrReg, off)
// }
//
// if n != 0 {
// zero8(s, ptrReg, off+n-8)
// }
// Zero loopSize bytes starting at ptrReg. p1 := s.Prog(loong64.AMOVV)
for range loopSize / 8 { p1.From.Type = obj.TYPE_CONST
// MOVV ZR, off(ptrReg) p1.From.Offset = n / loopSize
p1.To.Type = obj.TYPE_REG
p1.To.Reg = countReg
p2 := s.Prog(loong64.AMOVBU)
p2.From.Type = obj.TYPE_MEM
p2.From.Name = obj.NAME_EXTERN
p2.From.Sym = ir.Syms.Loong64HasLSX
p2.To.Type = obj.TYPE_REG
p2.To.Reg = flagReg
p3 := s.Prog(loong64.ABNE)
p3.From.Type = obj.TYPE_REG
p3.From.Reg = flagReg
p3.To.Type = obj.TYPE_BRANCH
for off = 0; off < loopSize; off += 8 {
zero8(s, ptrReg, off) zero8(s, ptrReg, off)
off += 8
} }
// Increment ptrReg by loopSize. p4 := s.Prog(loong64.AADDV)
// ADDV $loopSize, ptrReg p4.From.Type = obj.TYPE_CONST
p = s.Prog(loong64.AADDV) p4.From.Offset = loopSize
p.From.Type = obj.TYPE_CONST p4.To.Type = obj.TYPE_REG
p.From.Offset = loopSize p4.To.Reg = ptrReg
p.To.Type = obj.TYPE_REG
p.To.Reg = ptrReg
// Decrement loop count. p5 := s.Prog(loong64.ASUBV)
// SUBV $1, countReg p5.From.Type = obj.TYPE_CONST
p = s.Prog(loong64.ASUBV) p5.From.Offset = 1
p.From.Type = obj.TYPE_CONST p5.To.Type = obj.TYPE_REG
p.From.Offset = 1 p5.To.Reg = countReg
p.To.Type = obj.TYPE_REG
p.To.Reg = countReg
// Jump to loop header if we're not done yet. p6 := s.Prog(loong64.ABNE)
// BNE countReg, loop header p6.From.Type = obj.TYPE_REG
p = s.Prog(loong64.ABNE) p6.From.Reg = countReg
p.From.Type = obj.TYPE_REG p6.To.Type = obj.TYPE_BRANCH
p.From.Reg = countReg p6.To.SetTarget(p3.Link)
p.To.Type = obj.TYPE_BRANCH
p.To.SetTarget(cntInit.Link) p7 := s.Prog(obj.AJMP)
p7.To.Type = obj.TYPE_BRANCH
p8 := s.Prog(loong64.AVXORV)
p8.From.Type = obj.TYPE_REG
p8.From.Reg = loong64.REG_V31
p8.To.Type = obj.TYPE_REG
p8.To.Reg = loong64.REG_V31
p3.To.SetTarget(p8)
for off = 0; off < loopSize; off += 16 {
zero16(s, loong64.REG_V31, ptrReg, off)
}
p9 := s.Prog(loong64.AADDV)
p9.From.Type = obj.TYPE_CONST
p9.From.Offset = loopSize
p9.To.Type = obj.TYPE_REG
p9.To.Reg = ptrReg
p10 := s.Prog(loong64.ASUBV)
p10.From.Type = obj.TYPE_CONST
p10.From.Offset = 1
p10.To.Type = obj.TYPE_REG
p10.To.Reg = countReg
p11 := s.Prog(loong64.ABNE)
p11.From.Type = obj.TYPE_REG
p11.From.Reg = countReg
p11.To.Type = obj.TYPE_BRANCH
p11.To.SetTarget(p8.Link)
p12 := s.Prog(obj.ANOP)
p7.To.SetTarget(p12)
// Multiples of the loop size are now done. // Multiples of the loop size are now done.
n %= loopSize n %= loopSize
off = 0
// Write any fractional portion. // Write any fractional portion.
for n >= 8 { for off = 0; n >= 8; off += 8 {
// MOVV ZR, off(ptrReg) // MOVV ZR, off(ptrReg)
zero8(s, ptrReg, off) zero8(s, ptrReg, off)
off += 8
n -= 8 n -= 8
} }
@ -1333,7 +1395,7 @@ func move8(s *ssagen.State, src, dst, tmp int16, off int64) {
// zero8 zeroes 8 bytes at reg+off. // zero8 zeroes 8 bytes at reg+off.
func zero8(s *ssagen.State, reg int16, off int64) { func zero8(s *ssagen.State, reg int16, off int64) {
// MOVV ZR, off(reg) // MOVV ZR, off(reg)
p := s.Prog(loong64.AMOVV) p := s.Prog(loong64.AMOVV)
p.From.Type = obj.TYPE_REG p.From.Type = obj.TYPE_REG
p.From.Reg = loong64.REGZERO p.From.Reg = loong64.REGZERO
@ -1341,3 +1403,14 @@ func zero8(s *ssagen.State, reg int16, off int64) {
p.To.Reg = reg p.To.Reg = reg
p.To.Offset = off p.To.Offset = off
} }
// zero16 emits a single VMOVQ that stores the vector register regZero to
// the 16 bytes of memory at regBase+off. The store only zeroes memory if
// regZero already holds zero; clearing it is the caller's responsibility.
func zero16(s *ssagen.State, regZero, regBase int16, off int64) {
	// VMOVQ regZero, off(regBase)
	store := s.Prog(loong64.AVMOVQ)

	// Destination operand: memory at off(regBase).
	store.To.Type = obj.TYPE_MEM
	store.To.Reg = regBase
	store.To.Offset = off

	// Source operand: the vector register supplying the stored value.
	store.From.Type = obj.TYPE_REG
	store.From.Reg = regZero
}

View file

@ -388,6 +388,7 @@ func init() {
argLength: 2, argLength: 2,
reg: regInfo{ reg: regInfo{
inputs: []regMask{gp}, inputs: []regMask{gp},
clobbers: buildReg("F31"),
clobbersArg0: true, clobbersArg0: true,
}, },
faultOnNilArg0: true, faultOnNilArg0: true,

View file

@ -26107,6 +26107,7 @@ var opcodeTable = [...]opInfo{
inputs: []inputInfo{ inputs: []inputInfo{
{0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
}, },
clobbers: 2305843009213693952, // F31
clobbersArg0: true, clobbersArg0: true,
}, },
}, },