cmd/compile: use generated loops instead of DUFFZERO on riscv64

MemclrKnownSize112-4          5.602Gi ± 0%    5.601Gi ± 0%         ~ (p=0.363 n=10)
MemclrKnownSize128-4          6.933Gi ± 1%    6.545Gi ± 1%    -5.59% (p=0.000 n=10)
MemclrKnownSize192-4          8.055Gi ± 1%    7.804Gi ± 0%    -3.12% (p=0.000 n=10)
MemclrKnownSize248-4          8.489Gi ± 0%    8.718Gi ± 0%    +2.69% (p=0.000 n=10)
MemclrKnownSize256-4          8.762Gi ± 0%    8.763Gi ± 0%         ~ (p=0.494 n=10)
MemclrKnownSize512-4          9.514Gi ± 1%    9.514Gi ± 0%         ~ (p=0.529 n=10)
MemclrKnownSize1024-4         9.940Gi ± 0%    9.939Gi ± 1%         ~ (p=0.989 n=10)
ClearFat3-4                   1.300Gi ± 0%    1.301Gi ±  0%         ~ (p=0.447 n=10)
ClearFat4-4                   3.902Gi ± 0%    3.902Gi ±  0%         ~ (p=0.971 n=10)
ClearFat5-4                   665.8Mi ± 0%   1331.5Mi ±  0%  +100.01% (p=0.000 n=10)
ClearFat6-4                   665.8Mi ± 0%   1330.5Mi ±  0%   +99.82% (p=0.000 n=10)
ClearFat7-4                   490.7Mi ± 0%   1331.9Mi ±  0%  +171.45% (p=0.000 n=10)
ClearFat8-4                   5.201Gi ± 0%    5.202Gi ±  0%         ~ (p=0.123 n=10)
ClearFat9-4                   856.1Mi ± 0%   1331.6Mi ±  0%   +55.54% (p=0.000 n=10)
ClearFat10-4                  887.8Mi ± 0%   1331.9Mi ±  0%   +50.03% (p=0.000 n=10)
ClearFat11-4                  915.3Mi ± 0%   1331.1Mi ±  0%   +45.42% (p=0.000 n=10)
ClearFat12-4                  5.202Gi ± 0%    5.202Gi ±  0%         ~ (p=0.481 n=10)
ClearFat13-4                  961.5Mi ± 0%   1331.8Mi ±  0%   +38.50% (p=0.000 n=10)
ClearFat14-4                  981.0Mi ± 0%   1331.8Mi ±  0%   +35.76% (p=0.000 n=10)
ClearFat15-4                  951.3Mi ± 0%   1331.4Mi ±  0%   +39.96% (p=0.000 n=10)
ClearFat16-4                  1.600Gi ± 0%    5.202Gi ±  0%  +225.10% (p=0.000 n=10)
ClearFat18-4                  1.018Gi ± 0%    1.300Gi ±  0%   +27.77% (p=0.000 n=10)
ClearFat20-4                  2.601Gi ± 0%    4.938Gi ± 12%   +89.87% (p=0.000 n=10)
ClearFat24-4                  2.601Gi ± 0%    5.201Gi ±  0%   +99.96% (p=0.000 n=10)
ClearFat32-4                  1.982Gi ± 0%    5.203Gi ±  0%  +162.55% (p=0.000 n=10)
ClearFat40-4                  3.467Gi ± 0%    4.338Gi ±  0%   +25.11% (p=0.000 n=10)
ClearFat48-4                  3.671Gi ± 0%    5.201Gi ±  0%   +41.69% (p=0.000 n=10)
ClearFat56-4                  3.640Gi ± 0%    5.201Gi ±  0%   +42.88% (p=0.000 n=10)
ClearFat64-4                  2.250Gi ± 0%    5.202Gi ±  0%  +131.25% (p=0.000 n=10)
ClearFat72-4                  4.064Gi ± 0%    5.201Gi ±  0%   +27.97% (p=0.000 n=10)
ClearFat128-4                 4.496Gi ± 0%    5.203Gi ±  0%   +15.71% (p=0.000 n=10)
ClearFat256-4                 4.756Gi ± 0%    5.201Gi ±  0%    +9.36% (p=0.000 n=10)
ClearFat512-4                 2.512Gi ± 0%    5.201Gi ±  0%  +107.03% (p=0.000 n=10)
ClearFat1024-4                4.255Gi ± 0%    5.202Gi ±  0%   +22.26% (p=0.000 n=10)
ClearFat1032-4                4.260Gi ± 0%    5.201Gi ±  0%   +22.09% (p=0.000 n=10)
ClearFat1040-4                4.285Gi ± 1%    5.203Gi ±  0%   +21.41% (p=0.000 n=10)
geomean                       2.005Gi         3.020Gi         +50.58%

Change-Id: Iea1da734ff8eaf1b5a2822ae2bdb7f4fd9b65651
Reviewed-on: https://go-review.googlesource.com/c/go/+/699635
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Mark Freeman <markfreeman@google.com>
This commit is contained in:
Meng Zhuo 2025-08-28 07:05:27 +00:00
parent 77643dc63f
commit 879ff736d3
5 changed files with 152 additions and 176 deletions

View file

@ -181,6 +181,8 @@ func largestMove(alignment int64) (obj.As, int64) {
} }
} }
// fracMovOps lists the store instructions used to zero a tail that is
// shorter than the main store width. Entry i is used for a store of
// 1<<i bytes.
var fracMovOps = []obj.As{riscv.AMOVB, riscv.AMOVH, riscv.AMOVW, riscv.AMOV}
// ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags. // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
// RISC-V has no flags, so this is a no-op. // RISC-V has no flags, so this is a no-op.
func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {} func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {}
@ -738,30 +740,86 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p.RegTo2 = riscv.REG_ZERO p.RegTo2 = riscv.REG_ZERO
case ssa.OpRISCV64LoweredZero: case ssa.OpRISCV64LoweredZero:
mov, sz := largestMove(v.AuxInt) ptr := v.Args[0].Reg()
sc := v.AuxValAndOff()
n := sc.Val64()
// mov ZERO, (Rarg0) mov, sz := largestMove(sc.Off64())
// ADD $sz, Rarg0
// BGEU Rarg1, Rarg0, -2(PC)
p := s.Prog(mov) // mov ZERO, (offset)(Rarg0)
p.From.Type = obj.TYPE_REG var off int64
p.From.Reg = riscv.REG_ZERO for n >= sz {
p.To.Type = obj.TYPE_MEM zeroOp(s, mov, ptr, off)
p.To.Reg = v.Args[0].Reg() off += sz
n -= sz
}
for i := len(fracMovOps) - 1; i >= 0; i-- {
tsz := int64(1 << i)
if n < tsz {
continue
}
zeroOp(s, fracMovOps[i], ptr, off)
off += tsz
n -= tsz
}
case ssa.OpRISCV64LoweredZeroLoop:
ptr := v.Args[0].Reg()
sc := v.AuxValAndOff()
n := sc.Val64()
mov, sz := largestMove(sc.Off64())
chunk := 8 * sz
if n <= 3*chunk {
v.Fatalf("ZeroLoop too small:%d, expect:%d", n, 3*chunk)
}
tmp := v.RegTmp()
p := s.Prog(riscv.AADD)
p.From.Type = obj.TYPE_CONST
p.From.Offset = n - n%chunk
p.Reg = ptr
p.To.Type = obj.TYPE_REG
p.To.Reg = tmp
for i := int64(0); i < 8; i++ {
zeroOp(s, mov, ptr, sz*i)
}
p2 := s.Prog(riscv.AADD) p2 := s.Prog(riscv.AADD)
p2.From.Type = obj.TYPE_CONST p2.From.Type = obj.TYPE_CONST
p2.From.Offset = sz p2.From.Offset = chunk
p2.To.Type = obj.TYPE_REG p2.To.Type = obj.TYPE_REG
p2.To.Reg = v.Args[0].Reg() p2.To.Reg = ptr
p3 := s.Prog(riscv.ABGEU) p3 := s.Prog(riscv.ABNE)
p3.To.Type = obj.TYPE_BRANCH p3.From.Reg = tmp
p3.Reg = v.Args[0].Reg()
p3.From.Type = obj.TYPE_REG p3.From.Type = obj.TYPE_REG
p3.From.Reg = v.Args[1].Reg() p3.Reg = ptr
p3.To.SetTarget(p) p3.To.Type = obj.TYPE_BRANCH
p3.To.SetTarget(p.Link)
n %= chunk
// mov ZERO, (offset)(Rarg0)
var off int64
for n >= sz {
zeroOp(s, mov, ptr, off)
off += sz
n -= sz
}
for i := len(fracMovOps) - 1; i >= 0; i-- {
tsz := int64(1 << i)
if n < tsz {
continue
}
zeroOp(s, fracMovOps[i], ptr, off)
off += tsz
n -= tsz
}
case ssa.OpRISCV64LoweredMove: case ssa.OpRISCV64LoweredMove:
mov, sz := largestMove(v.AuxInt) mov, sz := largestMove(v.AuxInt)
@ -955,3 +1013,13 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
p.Pos = p.Pos.WithNotStmt() p.Pos = p.Pos.WithNotStmt()
return p return p
} }
// zeroOp emits a single store of the zero register to memory.
//
// mov is the store instruction to use, reg is the base address register and
// off is the byte offset from reg at which the store is performed:
//
//	mov ZERO, off(reg)
func zeroOp(s *ssagen.State, mov obj.As, reg int16, off int64) {
	p := s.Prog(mov)
	p.From.Type = obj.TYPE_REG
	p.From.Reg = riscv.REG_ZERO
	p.To.Type = obj.TYPE_MEM
	p.To.Reg = reg
	p.To.Offset = off
}

View file

@ -373,36 +373,14 @@
(MOVHstore [4] ptr (MOVDconst [0]) (MOVHstore [4] ptr (MOVDconst [0])
(MOVHstore [2] ptr (MOVDconst [0]) (MOVHstore [2] ptr (MOVDconst [0])
(MOVHstore ptr (MOVDconst [0]) mem))) (MOVHstore ptr (MOVDconst [0]) mem)))
(Zero [12] {t} ptr mem) && t.Alignment()%4 == 0 =>
(MOVWstore [8] ptr (MOVDconst [0])
(MOVWstore [4] ptr (MOVDconst [0])
(MOVWstore ptr (MOVDconst [0]) mem)))
(Zero [16] {t} ptr mem) && t.Alignment()%8 == 0 =>
(MOVDstore [8] ptr (MOVDconst [0])
(MOVDstore ptr (MOVDconst [0]) mem))
(Zero [24] {t} ptr mem) && t.Alignment()%8 == 0 =>
(MOVDstore [16] ptr (MOVDconst [0])
(MOVDstore [8] ptr (MOVDconst [0])
(MOVDstore ptr (MOVDconst [0]) mem)))
(Zero [32] {t} ptr mem) && t.Alignment()%8 == 0 =>
(MOVDstore [24] ptr (MOVDconst [0])
(MOVDstore [16] ptr (MOVDconst [0])
(MOVDstore [8] ptr (MOVDconst [0])
(MOVDstore ptr (MOVDconst [0]) mem))))
// Medium 8-aligned zeroing uses a Duff's device // Unroll zeroing in medium size (at most 192 bytes i.e. 3 cachelines)
// 8 and 128 are magic constants, see runtime/mkduff.go (Zero [s] {t} ptr mem) && s <= 24*moveSize(t.Alignment(), config) =>
(Zero [s] {t} ptr mem) (LoweredZero [makeValAndOff(int32(s),int32(t.Alignment()))] ptr mem)
&& s%8 == 0 && s <= 8*128
&& t.Alignment()%8 == 0 =>
(DUFFZERO [8 * (128 - s/8)] ptr mem)
// Generic zeroing uses a loop // Generic zeroing uses a loop
(Zero [s] {t} ptr mem) => (Zero [s] {t} ptr mem) && s > 24*moveSize(t.Alignment(), config) =>
(LoweredZero [t.Alignment()] (LoweredZeroLoop [makeValAndOff(int32(s),int32(t.Alignment()))] ptr mem)
ptr
(ADD <ptr.Type> ptr (MOVDconst [s-moveSize(t.Alignment(), config)]))
mem)
// Checks // Checks
(IsNonNil ...) => (SNEZ ...) (IsNonNil ...) => (SNEZ ...)

View file

@ -317,25 +317,40 @@ func init() {
// Generic moves and zeros // Generic moves and zeros
// general unaligned zeroing // general unrolled zeroing
// arg0 = address of memory to zero (in X5, changed as side effect) // arg0 = address of memory to zero
// arg1 = address of the last element to zero (inclusive) // arg1 = mem
// arg2 = mem // auxint = element size and type alignment
// auxint = element size
// returns mem // returns mem
// mov ZERO, (X5) // mov ZERO, (OFFSET)(Rarg0)
// ADD $sz, X5
// BGEU Rarg1, X5, -2(PC)
{ {
name: "LoweredZero", name: "LoweredZero",
aux: "Int64", aux: "SymValAndOff",
argLength: 3,
reg: regInfo{
inputs: []regMask{regNamed["X5"], gpMask},
clobbers: regNamed["X5"],
},
typ: "Mem", typ: "Mem",
argLength: 2,
symEffect: "Write",
faultOnNilArg0: true, faultOnNilArg0: true,
reg: regInfo{
inputs: []regMask{gpMask},
},
},
// general zeroing using a loop
// arg0 = address of memory to zero (clobbered)
// arg1 = mem
// auxint = number of bytes to zero (Val) and type alignment (Off)
// returns mem
{
name: "LoweredZeroLoop",
aux: "SymValAndOff",
typ: "Mem",
argLength: 2,
symEffect: "Write",
needIntTemp: true,
faultOnNilArg0: true,
reg: regInfo{
inputs: []regMask{gpMask},
clobbersArg0: true,
},
}, },
// general unaligned move // general unaligned move

View file

@ -2569,6 +2569,7 @@ const (
OpRISCV64DUFFZERO OpRISCV64DUFFZERO
OpRISCV64DUFFCOPY OpRISCV64DUFFCOPY
OpRISCV64LoweredZero OpRISCV64LoweredZero
OpRISCV64LoweredZeroLoop
OpRISCV64LoweredMove OpRISCV64LoweredMove
OpRISCV64LoweredAtomicLoad8 OpRISCV64LoweredAtomicLoad8
OpRISCV64LoweredAtomicLoad32 OpRISCV64LoweredAtomicLoad32
@ -34558,15 +34559,28 @@ var opcodeTable = [...]opInfo{
}, },
{ {
name: "LoweredZero", name: "LoweredZero",
auxType: auxInt64, auxType: auxSymValAndOff,
argLen: 3, argLen: 2,
faultOnNilArg0: true, faultOnNilArg0: true,
symEffect: SymWrite,
reg: regInfo{ reg: regInfo{
inputs: []inputInfo{ inputs: []inputInfo{
{0, 16}, // X5 {0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
{1, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
}, },
clobbers: 16, // X5 },
},
{
name: "LoweredZeroLoop",
auxType: auxSymValAndOff,
argLen: 2,
needIntTemp: true,
faultOnNilArg0: true,
symEffect: SymWrite,
reg: regInfo{
inputs: []inputInfo{
{0, 1006632944}, // X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X28 X29 X30
},
clobbersArg0: true,
}, },
}, },
{ {

View file

@ -9925,138 +9925,39 @@ func rewriteValueRISCV64_OpZero(v *Value) bool {
v.AddArg3(ptr, v0, v1) v.AddArg3(ptr, v0, v1)
return true return true
} }
// match: (Zero [12] {t} ptr mem)
// cond: t.Alignment()%4 == 0
// result: (MOVWstore [8] ptr (MOVDconst [0]) (MOVWstore [4] ptr (MOVDconst [0]) (MOVWstore ptr (MOVDconst [0]) mem)))
for {
if auxIntToInt64(v.AuxInt) != 12 {
break
}
t := auxToType(v.Aux)
ptr := v_0
mem := v_1
if !(t.Alignment()%4 == 0) {
break
}
v.reset(OpRISCV64MOVWstore)
v.AuxInt = int32ToAuxInt(8)
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
v0.AuxInt = int64ToAuxInt(0)
v1 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
v1.AuxInt = int32ToAuxInt(4)
v2 := b.NewValue0(v.Pos, OpRISCV64MOVWstore, types.TypeMem)
v2.AddArg3(ptr, v0, mem)
v1.AddArg3(ptr, v0, v2)
v.AddArg3(ptr, v0, v1)
return true
}
// match: (Zero [16] {t} ptr mem)
// cond: t.Alignment()%8 == 0
// result: (MOVDstore [8] ptr (MOVDconst [0]) (MOVDstore ptr (MOVDconst [0]) mem))
for {
if auxIntToInt64(v.AuxInt) != 16 {
break
}
t := auxToType(v.Aux)
ptr := v_0
mem := v_1
if !(t.Alignment()%8 == 0) {
break
}
v.reset(OpRISCV64MOVDstore)
v.AuxInt = int32ToAuxInt(8)
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
v0.AuxInt = int64ToAuxInt(0)
v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
v1.AddArg3(ptr, v0, mem)
v.AddArg3(ptr, v0, v1)
return true
}
// match: (Zero [24] {t} ptr mem)
// cond: t.Alignment()%8 == 0
// result: (MOVDstore [16] ptr (MOVDconst [0]) (MOVDstore [8] ptr (MOVDconst [0]) (MOVDstore ptr (MOVDconst [0]) mem)))
for {
if auxIntToInt64(v.AuxInt) != 24 {
break
}
t := auxToType(v.Aux)
ptr := v_0
mem := v_1
if !(t.Alignment()%8 == 0) {
break
}
v.reset(OpRISCV64MOVDstore)
v.AuxInt = int32ToAuxInt(16)
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
v0.AuxInt = int64ToAuxInt(0)
v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
v1.AuxInt = int32ToAuxInt(8)
v2 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
v2.AddArg3(ptr, v0, mem)
v1.AddArg3(ptr, v0, v2)
v.AddArg3(ptr, v0, v1)
return true
}
// match: (Zero [32] {t} ptr mem)
// cond: t.Alignment()%8 == 0
// result: (MOVDstore [24] ptr (MOVDconst [0]) (MOVDstore [16] ptr (MOVDconst [0]) (MOVDstore [8] ptr (MOVDconst [0]) (MOVDstore ptr (MOVDconst [0]) mem))))
for {
if auxIntToInt64(v.AuxInt) != 32 {
break
}
t := auxToType(v.Aux)
ptr := v_0
mem := v_1
if !(t.Alignment()%8 == 0) {
break
}
v.reset(OpRISCV64MOVDstore)
v.AuxInt = int32ToAuxInt(24)
v0 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64)
v0.AuxInt = int64ToAuxInt(0)
v1 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
v1.AuxInt = int32ToAuxInt(16)
v2 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
v2.AuxInt = int32ToAuxInt(8)
v3 := b.NewValue0(v.Pos, OpRISCV64MOVDstore, types.TypeMem)
v3.AddArg3(ptr, v0, mem)
v2.AddArg3(ptr, v0, v3)
v1.AddArg3(ptr, v0, v2)
v.AddArg3(ptr, v0, v1)
return true
}
// match: (Zero [s] {t} ptr mem) // match: (Zero [s] {t} ptr mem)
// cond: s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0 // cond: s <= 24*moveSize(t.Alignment(), config)
// result: (DUFFZERO [8 * (128 - s/8)] ptr mem) // result: (LoweredZero [makeValAndOff(int32(s),int32(t.Alignment()))] ptr mem)
for { for {
s := auxIntToInt64(v.AuxInt) s := auxIntToInt64(v.AuxInt)
t := auxToType(v.Aux) t := auxToType(v.Aux)
ptr := v_0 ptr := v_0
mem := v_1 mem := v_1
if !(s%8 == 0 && s <= 8*128 && t.Alignment()%8 == 0) { if !(s <= 24*moveSize(t.Alignment(), config)) {
break break
} }
v.reset(OpRISCV64DUFFZERO) v.reset(OpRISCV64LoweredZero)
v.AuxInt = int64ToAuxInt(8 * (128 - s/8)) v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(s), int32(t.Alignment())))
v.AddArg2(ptr, mem) v.AddArg2(ptr, mem)
return true return true
} }
// match: (Zero [s] {t} ptr mem) // match: (Zero [s] {t} ptr mem)
// result: (LoweredZero [t.Alignment()] ptr (ADD <ptr.Type> ptr (MOVDconst [s-moveSize(t.Alignment(), config)])) mem) // cond: s > 24*moveSize(t.Alignment(), config)
// result: (LoweredZeroLoop [makeValAndOff(int32(s),int32(t.Alignment()))] ptr mem)
for { for {
s := auxIntToInt64(v.AuxInt) s := auxIntToInt64(v.AuxInt)
t := auxToType(v.Aux) t := auxToType(v.Aux)
ptr := v_0 ptr := v_0
mem := v_1 mem := v_1
v.reset(OpRISCV64LoweredZero) if !(s > 24*moveSize(t.Alignment(), config)) {
v.AuxInt = int64ToAuxInt(t.Alignment()) break
v0 := b.NewValue0(v.Pos, OpRISCV64ADD, ptr.Type) }
v1 := b.NewValue0(v.Pos, OpRISCV64MOVDconst, typ.UInt64) v.reset(OpRISCV64LoweredZeroLoop)
v1.AuxInt = int64ToAuxInt(s - moveSize(t.Alignment(), config)) v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(s), int32(t.Alignment())))
v0.AddArg2(ptr, v1) v.AddArg2(ptr, mem)
v.AddArg3(ptr, v0, mem)
return true return true
} }
return false
} }
func rewriteBlockRISCV64(b *Block) bool { func rewriteBlockRISCV64(b *Block) bool {
typ := &b.Func.Config.Types typ := &b.Func.Config.Types