mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
cmd/compile: use arm64 neon in LoweredMemmove/LoweredMemmoveLoop
Raspberry Pi 5 (Cortex-A76)
│ base.log │ opt.log │
│ sec/op │ sec/op vs base │
MemmoveKnownSize112 3.549n ± 0% 3.652n ± 0% +2.92% (p=0.000 n=10)
MemmoveKnownSize128 3.979n ± 0% 3.617n ± 0% -9.09% (p=0.000 n=10)
MemmoveKnownSize192 7.566n ± 0% 5.074n ± 0% -32.94% (p=0.000 n=10)
MemmoveKnownSize248 8.549n ± 0% 7.184n ± 1% -15.97% (p=0.000 n=10)
MemmoveKnownSize256 10.010n ± 0% 6.827n ± 0% -31.80% (p=0.000 n=10)
MemmoveKnownSize512 19.81n ± 0% 13.59n ± 0% -31.40% (p=0.000 n=10)
MemmoveKnownSize1024 39.66n ± 0% 27.00n ± 0% -31.93% (p=0.000 n=10)
geomean 9.538n 7.392n -22.50%
Change-Id: I7b17408cd0a500ceaa80bc93ffe2f19ddeea9c0d
Reviewed-on: https://go-review.googlesource.com/c/go/+/692315
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
parent
61d1ff61ad
commit
5c9a26c7f8
3 changed files with 79 additions and 33 deletions
|
|
@ -1189,8 +1189,9 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
|||
if dstReg == srcReg {
|
||||
break
|
||||
}
|
||||
tmpReg1 := int16(arm64.REG_R24)
|
||||
tmpReg2 := int16(arm64.REG_R25)
|
||||
tmpReg1 := int16(arm64.REG_R25)
|
||||
tmpFReg1 := int16(arm64.REG_F16)
|
||||
tmpFReg2 := int16(arm64.REG_F17)
|
||||
n := v.AuxInt
|
||||
if n < 16 {
|
||||
v.Fatalf("Move too small %d", n)
|
||||
|
|
@ -1198,10 +1199,17 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
|||
|
||||
// Generate copying instructions.
|
||||
var off int64
|
||||
for n >= 32 {
|
||||
// FLDPQ off(srcReg), (tmpFReg1, tmpFReg2)
|
||||
// FSTPQ (tmpFReg1, tmpFReg2), off(dstReg)
|
||||
move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, off, false)
|
||||
off += 32
|
||||
n -= 32
|
||||
}
|
||||
for n >= 16 {
|
||||
// LDP off(srcReg), (tmpReg1, tmpReg2)
|
||||
// STP (tmpReg1, tmpReg2), off(dstReg)
|
||||
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
|
||||
// FMOVQ off(src), tmpFReg1
|
||||
// FMOVQ tmpFReg1, off(dst)
|
||||
move16(s, srcReg, dstReg, tmpFReg1, off, false)
|
||||
off += 16
|
||||
n -= 16
|
||||
}
|
||||
|
|
@ -1223,9 +1231,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
|||
if dstReg == srcReg {
|
||||
break
|
||||
}
|
||||
countReg := int16(arm64.REG_R23)
|
||||
tmpReg1 := int16(arm64.REG_R24)
|
||||
tmpReg2 := int16(arm64.REG_R25)
|
||||
countReg := int16(arm64.REG_R24)
|
||||
tmpReg1 := int16(arm64.REG_R25)
|
||||
tmpFReg1 := int16(arm64.REG_F16)
|
||||
tmpFReg2 := int16(arm64.REG_F17)
|
||||
n := v.AuxInt
|
||||
loopSize := int64(64)
|
||||
if n < 3*loopSize {
|
||||
|
|
@ -1251,10 +1260,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
|||
|
||||
// Move loopSize bytes starting at srcReg to dstReg.
|
||||
// Increment srcReg and dstReg by loopSize as a side effect.
|
||||
for range loopSize / 16 {
|
||||
// LDP.P 16(srcReg), (tmpReg1, tmpReg2)
|
||||
// STP.P (tmpReg1, tmpReg2), 16(dstReg)
|
||||
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, 0, true)
|
||||
for range loopSize / 32 {
|
||||
// FLDPQ.P 32(srcReg), (tmpFReg1, tmpFReg2)
|
||||
// FSTPQ.P (tmpFReg1, tmpFReg2), 32(dstReg)
|
||||
move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, 0, true)
|
||||
}
|
||||
// Decrement loop count.
|
||||
// SUB $1, countReg
|
||||
|
|
@ -1276,10 +1285,17 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
|||
|
||||
// Copy any fractional portion.
|
||||
var off int64
|
||||
for n >= 32 {
|
||||
// FLDPQ off(srcReg), (tmpFReg1, tmpFReg2)
|
||||
// FSTPQ (tmpFReg1, tmpFReg2), off(dstReg)
|
||||
move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, off, false)
|
||||
off += 32
|
||||
n -= 32
|
||||
}
|
||||
for n >= 16 {
|
||||
// LDP off(srcReg), (tmpReg1, tmpReg2)
|
||||
// STP (tmpReg1, tmpReg2), off(dstReg)
|
||||
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
|
||||
// FMOVQ off(src), tmpFReg1
|
||||
// FMOVQ tmpFReg1, off(dst)
|
||||
move16(s, srcReg, dstReg, tmpFReg1, off, false)
|
||||
off += 16
|
||||
n -= 16
|
||||
}
|
||||
|
|
@ -1699,26 +1715,55 @@ func zero8(s *ssagen.State, reg int16, off int64) {
|
|||
p.To.Offset = off
|
||||
}
|
||||
|
||||
// move16 copies 16 bytes at src+off to dst+off.
|
||||
// move32 copies 32 bytes at src+off to dst+off.
|
||||
// Uses registers tmp1 and tmp2.
|
||||
// If postInc is true, increment src and dst by 16.
|
||||
func move16(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
|
||||
// LDP off(src), (tmp1, tmp2)
|
||||
ld := s.Prog(arm64.ALDP)
|
||||
// If postInc is true, increment src and dst by 32.
|
||||
func move32(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
|
||||
// FLDPQ off(src), (tmp1, tmp2)
|
||||
ld := s.Prog(arm64.AFLDPQ)
|
||||
ld.From.Type = obj.TYPE_MEM
|
||||
ld.From.Reg = src
|
||||
ld.From.Offset = off
|
||||
ld.To.Type = obj.TYPE_REGREG
|
||||
ld.To.Reg = tmp1
|
||||
ld.To.Offset = int64(tmp2)
|
||||
// STP (tmp1, tmp2), off(dst)
|
||||
st := s.Prog(arm64.ASTP)
|
||||
// FSTPQ (tmp1, tmp2), off(dst)
|
||||
st := s.Prog(arm64.AFSTPQ)
|
||||
st.From.Type = obj.TYPE_REGREG
|
||||
st.From.Reg = tmp1
|
||||
st.From.Offset = int64(tmp2)
|
||||
st.To.Type = obj.TYPE_MEM
|
||||
st.To.Reg = dst
|
||||
st.To.Offset = off
|
||||
if postInc {
|
||||
if off != 0 {
|
||||
panic("can't postinc with non-zero offset")
|
||||
}
|
||||
ld.Scond = arm64.C_XPOST
|
||||
st.Scond = arm64.C_XPOST
|
||||
ld.From.Offset = 32
|
||||
st.To.Offset = 32
|
||||
}
|
||||
}
|
||||
|
||||
// move16 copies 16 bytes at src+off to dst+off.
|
||||
// Uses register tmp1.
|
||||
// If postInc is true, increment src and dst by 16.
|
||||
func move16(s *ssagen.State, src, dst, tmp1 int16, off int64, postInc bool) {
|
||||
// FMOVQ off(src), tmp1
|
||||
ld := s.Prog(arm64.AFMOVQ)
|
||||
ld.From.Type = obj.TYPE_MEM
|
||||
ld.From.Reg = src
|
||||
ld.From.Offset = off
|
||||
ld.To.Type = obj.TYPE_REG
|
||||
ld.To.Reg = tmp1
|
||||
// FMOVQ tmp1, off(dst)
|
||||
st := s.Prog(arm64.AFMOVQ)
|
||||
st.From.Type = obj.TYPE_REG
|
||||
st.From.Reg = tmp1
|
||||
st.To.Type = obj.TYPE_MEM
|
||||
st.To.Reg = dst
|
||||
st.To.Offset = off
|
||||
if postInc {
|
||||
if off != 0 {
|
||||
panic("can't postinc with non-zero offset")
|
||||
|
|
|
|||
|
|
@ -144,8 +144,9 @@ func init() {
|
|||
gpspsbg = gpspg | buildReg("SB")
|
||||
fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
|
||||
callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
|
||||
r25 = buildReg("R25")
|
||||
r24to25 = buildReg("R24 R25")
|
||||
r23to25 = buildReg("R23 R24 R25")
|
||||
f16to17 = buildReg("F16 F17")
|
||||
rz = buildReg("ZERO")
|
||||
first16 = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
|
||||
)
|
||||
|
|
@ -599,8 +600,8 @@ func init() {
|
|||
aux: "Int64",
|
||||
argLength: 3,
|
||||
reg: regInfo{
|
||||
inputs: []regMask{gp &^ r24to25, gp &^ r24to25},
|
||||
clobbers: r24to25, // TODO: figure out needIntTemp x2
|
||||
inputs: []regMask{gp &^ r25, gp &^ r25},
|
||||
clobbers: r25 | f16to17, // TODO: figure out needIntTemp + x2 for floats
|
||||
},
|
||||
faultOnNilArg0: true,
|
||||
faultOnNilArg1: true,
|
||||
|
|
@ -617,8 +618,8 @@ func init() {
|
|||
aux: "Int64",
|
||||
argLength: 3,
|
||||
reg: regInfo{
|
||||
inputs: []regMask{gp &^ r23to25, gp &^ r23to25},
|
||||
clobbers: r23to25, // TODO: figure out needIntTemp x3
|
||||
inputs: []regMask{gp &^ r24to25, gp &^ r24to25},
|
||||
clobbers: r24to25 | f16to17, // TODO: figure out needIntTemp x2 + x2 for floats
|
||||
clobbersArg0: true,
|
||||
clobbersArg1: true,
|
||||
},
|
||||
|
|
|
|||
|
|
@ -23199,10 +23199,10 @@ var opcodeTable = [...]opInfo{
|
|||
faultOnNilArg1: true,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
|
||||
{1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
|
||||
{0, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
|
||||
{1, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
|
||||
},
|
||||
clobbers: 25165824, // R24 R25
|
||||
clobbers: 422212481843200, // R25 F16 F17
|
||||
},
|
||||
},
|
||||
{
|
||||
|
|
@ -23213,10 +23213,10 @@ var opcodeTable = [...]opInfo{
|
|||
faultOnNilArg1: true,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
|
||||
{1, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
|
||||
{0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
|
||||
{1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
|
||||
},
|
||||
clobbers: 29360128, // R23 R24 R25
|
||||
clobbers: 422212490231808, // R24 R25 F16 F17
|
||||
clobbersArg0: true,
|
||||
clobbersArg1: true,
|
||||
},
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue