mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
cmd/compile: Implement LoweredZeroLoop with LSX Instruction on loong64
goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3A6000 @ 2500.00MHz
| old.txt | new.txt |
| sec/op | sec/op vs base |
ClearFat256 6.406n ± 0% 3.329n ± 1% -48.03% (p=0.000 n=10)
ClearFat512 12.810n ± 0% 7.607n ± 0% -40.62% (p=0.000 n=10)
ClearFat1024 25.62n ± 0% 14.01n ± 0% -45.32% (p=0.000 n=10)
ClearFat1032 26.02n ± 0% 14.28n ± 0% -45.14% (p=0.000 n=10)
ClearFat1040 26.02n ± 0% 14.41n ± 0% -44.62% (p=0.000 n=10)
MemclrKnownSize192 4.804n ± 0% 2.827n ± 0% -41.15% (p=0.000 n=10)
MemclrKnownSize248 6.561n ± 0% 4.371n ± 0% -33.38% (p=0.000 n=10)
MemclrKnownSize256 6.406n ± 0% 3.335n ± 0% -47.94% (p=0.000 n=10)
geomean 11.41n 6.453n -43.45%
goos: linux
goarch: loong64
pkg: runtime
cpu: Loongson-3C5000 @ 2200.00MHz
| old.txt | new.txt |
| sec/op | sec/op vs base |
ClearFat256 14.570n ± 0% 7.284n ± 0% -50.01% (p=0.000 n=10)
ClearFat512 29.13n ± 0% 14.57n ± 0% -49.98% (p=0.000 n=10)
ClearFat1024 58.26n ± 0% 29.15n ± 0% -49.97% (p=0.000 n=10)
ClearFat1032 58.73n ± 0% 29.15n ± 0% -50.36% (p=0.000 n=10)
ClearFat1040 59.18n ± 0% 29.26n ± 0% -50.56% (p=0.000 n=10)
MemclrKnownSize192 10.930n ± 0% 5.466n ± 0% -49.99% (p=0.000 n=10)
MemclrKnownSize248 14.110n ± 0% 6.772n ± 0% -52.01% (p=0.000 n=10)
MemclrKnownSize256 14.570n ± 0% 7.285n ± 0% -50.00% (p=0.000 n=10)
geomean 25.75n 12.78n -50.36%
Change-Id: I88d7b6ae2f6fc3f095979f24fb83ff42a9d2d42e
Reviewed-on: https://go-review.googlesource.com/c/go/+/720940
Reviewed-by: Meidan Li <limeidan@loongson.cn>
Reviewed-by: Mark Freeman <markfreeman@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
This commit is contained in:
parent
7f2ae21fb4
commit
c4bb9653ba
3 changed files with 115 additions and 40 deletions
|
|
@ -575,6 +575,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
||||||
case ssa.OpLOONG64LoweredZeroLoop:
|
case ssa.OpLOONG64LoweredZeroLoop:
|
||||||
ptrReg := v.Args[0].Reg()
|
ptrReg := v.Args[0].Reg()
|
||||||
countReg := v.RegTmp()
|
countReg := v.RegTmp()
|
||||||
|
flagReg := int16(loong64.REGTMP)
|
||||||
var off int64
|
var off int64
|
||||||
n := v.AuxInt
|
n := v.AuxInt
|
||||||
loopSize := int64(64)
|
loopSize := int64(64)
|
||||||
|
|
@ -587,58 +588,119 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
||||||
// vs
|
// vs
|
||||||
// 16 instructions in the straightline code
|
// 16 instructions in the straightline code
|
||||||
// Might as well use straightline code.
|
// Might as well use straightline code.
|
||||||
v.Fatalf("ZeroLoop size tool small %d", n)
|
v.Fatalf("ZeroLoop size too small %d", n)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Put iteration count in a register.
|
// MOVV $n/loopSize, countReg
|
||||||
// MOVV $n/loopSize, countReg
|
// MOVBU ir.Syms.Loong64HasLSX, flagReg
|
||||||
p := s.Prog(loong64.AMOVV)
|
// BNE flagReg, lsxInit
|
||||||
p.From.Type = obj.TYPE_CONST
|
// genericInit:
|
||||||
p.From.Offset = n / loopSize
|
// for off = 0; off < loopSize; off += 8 {
|
||||||
p.To.Type = obj.TYPE_REG
|
// zero8(s, ptrReg, off)
|
||||||
p.To.Reg = countReg
|
// }
|
||||||
cntInit := p
|
// ADDV $loopSize, ptrReg
|
||||||
|
// SUBV $1, countReg
|
||||||
|
// BNE countReg, genericInit
|
||||||
|
// JMP tail
|
||||||
|
// lsxInit:
|
||||||
|
// VXORV V31, V31, V31 // V31 = 0, executed once before the loop
|
||||||
|
// lsxLoop:
|
||||||
|
// for off = 0; off < loopSize; off += 16 {
|
||||||
|
// zero16(s, V31, ptrReg, off)
|
||||||
|
// }
|
||||||
|
// ADDV $loopSize, ptrReg
|
||||||
|
// SUBV $1, countReg
|
||||||
|
// BNE countReg, lsxLoop
|
||||||
|
// tail:
|
||||||
|
// n %= loopSize
|
||||||
|
// for off = 0; n >= 8; off += 8, n -= 8 {
|
||||||
|
// zero8(s, ptrReg, off)
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// if n != 0 {
|
||||||
|
// zero8(s, ptrReg, off+n-8)
|
||||||
|
// }
|
||||||
|
|
||||||
// Zero loopSize bytes starting at ptrReg.
|
p1 := s.Prog(loong64.AMOVV)
|
||||||
for range loopSize / 8 {
|
p1.From.Type = obj.TYPE_CONST
|
||||||
// MOVV ZR, off(ptrReg)
|
p1.From.Offset = n / loopSize
|
||||||
|
p1.To.Type = obj.TYPE_REG
|
||||||
|
p1.To.Reg = countReg
|
||||||
|
|
||||||
|
p2 := s.Prog(loong64.AMOVBU)
|
||||||
|
p2.From.Type = obj.TYPE_MEM
|
||||||
|
p2.From.Name = obj.NAME_EXTERN
|
||||||
|
p2.From.Sym = ir.Syms.Loong64HasLSX
|
||||||
|
p2.To.Type = obj.TYPE_REG
|
||||||
|
p2.To.Reg = flagReg
|
||||||
|
|
||||||
|
p3 := s.Prog(loong64.ABNE)
|
||||||
|
p3.From.Type = obj.TYPE_REG
|
||||||
|
p3.From.Reg = flagReg
|
||||||
|
p3.To.Type = obj.TYPE_BRANCH
|
||||||
|
|
||||||
|
for off = 0; off < loopSize; off += 8 {
|
||||||
zero8(s, ptrReg, off)
|
zero8(s, ptrReg, off)
|
||||||
off += 8
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Increment ptrReg by loopSize.
|
p4 := s.Prog(loong64.AADDV)
|
||||||
// ADDV $loopSize, ptrReg
|
p4.From.Type = obj.TYPE_CONST
|
||||||
p = s.Prog(loong64.AADDV)
|
p4.From.Offset = loopSize
|
||||||
p.From.Type = obj.TYPE_CONST
|
p4.To.Type = obj.TYPE_REG
|
||||||
p.From.Offset = loopSize
|
p4.To.Reg = ptrReg
|
||||||
p.To.Type = obj.TYPE_REG
|
|
||||||
p.To.Reg = ptrReg
|
|
||||||
|
|
||||||
// Decrement loop count.
|
p5 := s.Prog(loong64.ASUBV)
|
||||||
// SUBV $1, countReg
|
p5.From.Type = obj.TYPE_CONST
|
||||||
p = s.Prog(loong64.ASUBV)
|
p5.From.Offset = 1
|
||||||
p.From.Type = obj.TYPE_CONST
|
p5.To.Type = obj.TYPE_REG
|
||||||
p.From.Offset = 1
|
p5.To.Reg = countReg
|
||||||
p.To.Type = obj.TYPE_REG
|
|
||||||
p.To.Reg = countReg
|
|
||||||
|
|
||||||
// Jump to loop header if we're not done yet.
|
p6 := s.Prog(loong64.ABNE)
|
||||||
// BNE countReg, loop header
|
p6.From.Type = obj.TYPE_REG
|
||||||
p = s.Prog(loong64.ABNE)
|
p6.From.Reg = countReg
|
||||||
p.From.Type = obj.TYPE_REG
|
p6.To.Type = obj.TYPE_BRANCH
|
||||||
p.From.Reg = countReg
|
p6.To.SetTarget(p3.Link)
|
||||||
p.To.Type = obj.TYPE_BRANCH
|
|
||||||
p.To.SetTarget(cntInit.Link)
|
p7 := s.Prog(obj.AJMP)
|
||||||
|
p7.To.Type = obj.TYPE_BRANCH
|
||||||
|
|
||||||
|
p8 := s.Prog(loong64.AVXORV)
|
||||||
|
p8.From.Type = obj.TYPE_REG
|
||||||
|
p8.From.Reg = loong64.REG_V31
|
||||||
|
p8.To.Type = obj.TYPE_REG
|
||||||
|
p8.To.Reg = loong64.REG_V31
|
||||||
|
p3.To.SetTarget(p8)
|
||||||
|
|
||||||
|
for off = 0; off < loopSize; off += 16 {
|
||||||
|
zero16(s, loong64.REG_V31, ptrReg, off)
|
||||||
|
}
|
||||||
|
|
||||||
|
p9 := s.Prog(loong64.AADDV)
|
||||||
|
p9.From.Type = obj.TYPE_CONST
|
||||||
|
p9.From.Offset = loopSize
|
||||||
|
p9.To.Type = obj.TYPE_REG
|
||||||
|
p9.To.Reg = ptrReg
|
||||||
|
|
||||||
|
p10 := s.Prog(loong64.ASUBV)
|
||||||
|
p10.From.Type = obj.TYPE_CONST
|
||||||
|
p10.From.Offset = 1
|
||||||
|
p10.To.Type = obj.TYPE_REG
|
||||||
|
p10.To.Reg = countReg
|
||||||
|
|
||||||
|
p11 := s.Prog(loong64.ABNE)
|
||||||
|
p11.From.Type = obj.TYPE_REG
|
||||||
|
p11.From.Reg = countReg
|
||||||
|
p11.To.Type = obj.TYPE_BRANCH
|
||||||
|
p11.To.SetTarget(p8.Link)
|
||||||
|
|
||||||
|
p12 := s.Prog(obj.ANOP)
|
||||||
|
p7.To.SetTarget(p12)
|
||||||
|
|
||||||
// Multiples of the loop size are now done.
|
// Multiples of the loop size are now done.
|
||||||
n %= loopSize
|
n %= loopSize
|
||||||
|
|
||||||
off = 0
|
|
||||||
// Write any fractional portion.
|
// Write any fractional portion.
|
||||||
for n >= 8 {
|
for off = 0; n >= 8; off += 8 {
|
||||||
// MOVV ZR, off(ptrReg)
|
// MOVV ZR, off(ptrReg)
|
||||||
zero8(s, ptrReg, off)
|
zero8(s, ptrReg, off)
|
||||||
off += 8
|
|
||||||
n -= 8
|
n -= 8
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1333,7 +1395,7 @@ func move8(s *ssagen.State, src, dst, tmp int16, off int64) {
|
||||||
|
|
||||||
// zero8 zeroes 8 bytes at reg+off.
|
// zero8 zeroes 8 bytes at reg+off.
|
||||||
func zero8(s *ssagen.State, reg int16, off int64) {
|
func zero8(s *ssagen.State, reg int16, off int64) {
|
||||||
// MOVV ZR, off(reg)
|
// MOVV ZR, off(reg)
|
||||||
p := s.Prog(loong64.AMOVV)
|
p := s.Prog(loong64.AMOVV)
|
||||||
p.From.Type = obj.TYPE_REG
|
p.From.Type = obj.TYPE_REG
|
||||||
p.From.Reg = loong64.REGZERO
|
p.From.Reg = loong64.REGZERO
|
||||||
|
|
@ -1341,3 +1403,14 @@ func zero8(s *ssagen.State, reg int16, off int64) {
|
||||||
p.To.Reg = reg
|
p.To.Reg = reg
|
||||||
p.To.Offset = off
|
p.To.Offset = off
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// zero16 zeroes 16 bytes at regBase+off by storing the vector register regZero, which the caller must already have cleared.
|
||||||
|
func zero16(s *ssagen.State, regZero, regBase int16, off int64) {
|
||||||
|
// VMOVQ regZero, off(regBase)
|
||||||
|
p := s.Prog(loong64.AVMOVQ)
|
||||||
|
p.From.Type = obj.TYPE_REG
|
||||||
|
p.From.Reg = regZero
|
||||||
|
p.To.Type = obj.TYPE_MEM
|
||||||
|
p.To.Reg = regBase
|
||||||
|
p.To.Offset = off
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -388,6 +388,7 @@ func init() {
|
||||||
argLength: 2,
|
argLength: 2,
|
||||||
reg: regInfo{
|
reg: regInfo{
|
||||||
inputs: []regMask{gp},
|
inputs: []regMask{gp},
|
||||||
|
clobbers: buildReg("F31"),
|
||||||
clobbersArg0: true,
|
clobbersArg0: true,
|
||||||
},
|
},
|
||||||
faultOnNilArg0: true,
|
faultOnNilArg0: true,
|
||||||
|
|
|
||||||
|
|
@ -26107,6 +26107,7 @@ var opcodeTable = [...]opInfo{
|
||||||
inputs: []inputInfo{
|
inputs: []inputInfo{
|
||||||
{0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
|
{0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31
|
||||||
},
|
},
|
||||||
|
clobbers: 2305843009213693952, // F31
|
||||||
clobbersArg0: true,
|
clobbersArg0: true,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue