cmd/compile: use arm64 neon in LoweredMemmove/LoweredMemmoveLoop

Benchmark results on Raspberry Pi 5 (Cortex-A76):

                     │   base.log   │               opt.log               │
                     │    sec/op    │   sec/op     vs base                │
MemmoveKnownSize112     3.549n ± 0%   3.652n ± 0%   +2.92% (p=0.000 n=10)
MemmoveKnownSize128     3.979n ± 0%   3.617n ± 0%   -9.09% (p=0.000 n=10)
MemmoveKnownSize192     7.566n ± 0%   5.074n ± 0%  -32.94% (p=0.000 n=10)
MemmoveKnownSize248     8.549n ± 0%   7.184n ± 1%  -15.97% (p=0.000 n=10)
MemmoveKnownSize256    10.010n ± 0%   6.827n ± 0%  -31.80% (p=0.000 n=10)
MemmoveKnownSize512     19.81n ± 0%   13.59n ± 0%  -31.40% (p=0.000 n=10)
MemmoveKnownSize1024    39.66n ± 0%   27.00n ± 0%  -31.93% (p=0.000 n=10)
geomean                 9.538n        7.392n       -22.50%

Change-Id: I7b17408cd0a500ceaa80bc93ffe2f19ddeea9c0d
Reviewed-on: https://go-review.googlesource.com/c/go/+/692315
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Vasily Leonenko 2025-07-25 23:06:33 +03:00 committed by Keith Randall
parent 61d1ff61ad
commit 5c9a26c7f8
3 changed files with 79 additions and 33 deletions

View file

@ -1189,8 +1189,9 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
if dstReg == srcReg { if dstReg == srcReg {
break break
} }
tmpReg1 := int16(arm64.REG_R24) tmpReg1 := int16(arm64.REG_R25)
tmpReg2 := int16(arm64.REG_R25) tmpFReg1 := int16(arm64.REG_F16)
tmpFReg2 := int16(arm64.REG_F17)
n := v.AuxInt n := v.AuxInt
if n < 16 { if n < 16 {
v.Fatalf("Move too small %d", n) v.Fatalf("Move too small %d", n)
@ -1198,10 +1199,17 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
// Generate copying instructions. // Generate copying instructions.
var off int64 var off int64
for n >= 32 {
// FLDPQ off(srcReg), (tmpFReg1, tmpFReg2)
// FSTPQ (tmpFReg1, tmpFReg2), off(dstReg)
move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, off, false)
off += 32
n -= 32
}
for n >= 16 { for n >= 16 {
// LDP off(srcReg), (tmpReg1, tmpReg2) // FMOVQ off(src), tmpFReg1
// STP (tmpReg1, tmpReg2), off(dstReg) // FMOVQ tmpFReg1, off(dst)
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false) move16(s, srcReg, dstReg, tmpFReg1, off, false)
off += 16 off += 16
n -= 16 n -= 16
} }
@ -1223,9 +1231,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
if dstReg == srcReg { if dstReg == srcReg {
break break
} }
countReg := int16(arm64.REG_R23) countReg := int16(arm64.REG_R24)
tmpReg1 := int16(arm64.REG_R24) tmpReg1 := int16(arm64.REG_R25)
tmpReg2 := int16(arm64.REG_R25) tmpFReg1 := int16(arm64.REG_F16)
tmpFReg2 := int16(arm64.REG_F17)
n := v.AuxInt n := v.AuxInt
loopSize := int64(64) loopSize := int64(64)
if n < 3*loopSize { if n < 3*loopSize {
@ -1251,10 +1260,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
// Move loopSize bytes starting at srcReg to dstReg. // Move loopSize bytes starting at srcReg to dstReg.
// Increment srcReg and destReg by loopSize as a side effect. // Increment srcReg and destReg by loopSize as a side effect.
for range loopSize / 16 { for range loopSize / 32 {
// LDP.P 16(srcReg), (tmpReg1, tmpReg2) // FLDPQ.P 32(srcReg), (tmpFReg1, tmpFReg2)
// STP.P (tmpReg1, tmpReg2), 16(dstReg) // FSTPQ.P (tmpFReg1, tmpFReg2), 32(dstReg)
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, 0, true) move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, 0, true)
} }
// Decrement loop count. // Decrement loop count.
// SUB $1, countReg // SUB $1, countReg
@ -1276,10 +1285,17 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
// Copy any fractional portion. // Copy any fractional portion.
var off int64 var off int64
for n >= 32 {
// FLDPQ off(srcReg), (tmpFReg1, tmpFReg2)
// FSTPQ (tmpFReg1, tmpFReg2), off(dstReg)
move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, off, false)
off += 32
n -= 32
}
for n >= 16 { for n >= 16 {
// LDP off(srcReg), (tmpReg1, tmpReg2) // FMOVQ off(src), tmpFReg1
// STP (tmpReg1, tmpReg2), off(dstReg) // FMOVQ tmpFReg1, off(dst)
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false) move16(s, srcReg, dstReg, tmpFReg1, off, false)
off += 16 off += 16
n -= 16 n -= 16
} }
@ -1699,26 +1715,55 @@ func zero8(s *ssagen.State, reg int16, off int64) {
p.To.Offset = off p.To.Offset = off
} }
// move16 copies 16 bytes at src+off to dst+off. // move32 copies 32 bytes at src+off to dst+off.
// Uses registers tmp1 and tmp2. // Uses registers tmp1 and tmp2.
// If postInc is true, increment src and dst by 16. // If postInc is true, increment src and dst by 32.
func move16(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) { func move32(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
// LDP off(src), (tmp1, tmp2) // FLDPQ off(src), (tmp1, tmp2)
ld := s.Prog(arm64.ALDP) ld := s.Prog(arm64.AFLDPQ)
ld.From.Type = obj.TYPE_MEM ld.From.Type = obj.TYPE_MEM
ld.From.Reg = src ld.From.Reg = src
ld.From.Offset = off ld.From.Offset = off
ld.To.Type = obj.TYPE_REGREG ld.To.Type = obj.TYPE_REGREG
ld.To.Reg = tmp1 ld.To.Reg = tmp1
ld.To.Offset = int64(tmp2) ld.To.Offset = int64(tmp2)
// STP (tmp1, tmp2), off(dst) // FSTPQ (tmp1, tmp2), off(dst)
st := s.Prog(arm64.ASTP) st := s.Prog(arm64.AFSTPQ)
st.From.Type = obj.TYPE_REGREG st.From.Type = obj.TYPE_REGREG
st.From.Reg = tmp1 st.From.Reg = tmp1
st.From.Offset = int64(tmp2) st.From.Offset = int64(tmp2)
st.To.Type = obj.TYPE_MEM st.To.Type = obj.TYPE_MEM
st.To.Reg = dst st.To.Reg = dst
st.To.Offset = off st.To.Offset = off
if postInc {
if off != 0 {
panic("can't postinc with non-zero offset")
}
ld.Scond = arm64.C_XPOST
st.Scond = arm64.C_XPOST
ld.From.Offset = 32
st.To.Offset = 32
}
}
// move16 copies 16 bytes at src+off to dst+off.
// Uses register tmp1
// If postInc is true, increment src and dst by 16.
func move16(s *ssagen.State, src, dst, tmp1 int16, off int64, postInc bool) {
// FMOVQ off(src), tmp1
ld := s.Prog(arm64.AFMOVQ)
ld.From.Type = obj.TYPE_MEM
ld.From.Reg = src
ld.From.Offset = off
ld.To.Type = obj.TYPE_REG
ld.To.Reg = tmp1
// FMOVQ tmp1, off(dst)
st := s.Prog(arm64.AFMOVQ)
st.From.Type = obj.TYPE_REG
st.From.Reg = tmp1
st.To.Type = obj.TYPE_MEM
st.To.Reg = dst
st.To.Offset = off
if postInc { if postInc {
if off != 0 { if off != 0 {
panic("can't postinc with non-zero offset") panic("can't postinc with non-zero offset")

View file

@ -144,8 +144,9 @@ func init() {
gpspsbg = gpspg | buildReg("SB") gpspsbg = gpspg | buildReg("SB")
fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31") fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
r25 = buildReg("R25")
r24to25 = buildReg("R24 R25") r24to25 = buildReg("R24 R25")
r23to25 = buildReg("R23 R24 R25") f16to17 = buildReg("F16 F17")
rz = buildReg("ZERO") rz = buildReg("ZERO")
first16 = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15") first16 = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
) )
@ -599,8 +600,8 @@ func init() {
aux: "Int64", aux: "Int64",
argLength: 3, argLength: 3,
reg: regInfo{ reg: regInfo{
inputs: []regMask{gp &^ r24to25, gp &^ r24to25}, inputs: []regMask{gp &^ r25, gp &^ r25},
clobbers: r24to25, // TODO: figure out needIntTemp x2 clobbers: r25 | f16to17, // TODO: figure out needIntTemp + x2 for floats
}, },
faultOnNilArg0: true, faultOnNilArg0: true,
faultOnNilArg1: true, faultOnNilArg1: true,
@ -617,8 +618,8 @@ func init() {
aux: "Int64", aux: "Int64",
argLength: 3, argLength: 3,
reg: regInfo{ reg: regInfo{
inputs: []regMask{gp &^ r23to25, gp &^ r23to25}, inputs: []regMask{gp &^ r24to25, gp &^ r24to25},
clobbers: r23to25, // TODO: figure out needIntTemp x3 clobbers: r24to25 | f16to17, // TODO: figure out needIntTemp x2 + x2 for floats
clobbersArg0: true, clobbersArg0: true,
clobbersArg1: true, clobbersArg1: true,
}, },

View file

@ -23199,10 +23199,10 @@ var opcodeTable = [...]opInfo{
faultOnNilArg1: true, faultOnNilArg1: true,
reg: regInfo{ reg: regInfo{
inputs: []inputInfo{ inputs: []inputInfo{
{0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30 {0, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
{1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30 {1, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
}, },
clobbers: 25165824, // R24 R25 clobbers: 422212481843200, // R25 F16 F17
}, },
}, },
{ {
@ -23213,10 +23213,10 @@ var opcodeTable = [...]opInfo{
faultOnNilArg1: true, faultOnNilArg1: true,
reg: regInfo{ reg: regInfo{
inputs: []inputInfo{ inputs: []inputInfo{
{0, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30 {0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
{1, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30 {1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
}, },
clobbers: 29360128, // R23 R24 R25 clobbers: 422212490231808, // R24 R25 F16 F17
clobbersArg0: true, clobbersArg0: true,
clobbersArg1: true, clobbersArg1: true,
}, },