mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
cmd/compile: use arm64 neon in LoweredMemmove/LoweredMemmoveLoop
Raspberry Pi 5 (Cortex-A76)
                     │   base.log   │               opt.log               │
                     │    sec/op    │    sec/op     vs base               │
MemmoveKnownSize112     3.549n ± 0%    3.652n ± 0%   +2.92% (p=0.000 n=10)
MemmoveKnownSize128     3.979n ± 0%    3.617n ± 0%   -9.09% (p=0.000 n=10)
MemmoveKnownSize192     7.566n ± 0%    5.074n ± 0%  -32.94% (p=0.000 n=10)
MemmoveKnownSize248     8.549n ± 0%    7.184n ± 1%  -15.97% (p=0.000 n=10)
MemmoveKnownSize256    10.010n ± 0%    6.827n ± 0%  -31.80% (p=0.000 n=10)
MemmoveKnownSize512     19.81n ± 0%    13.59n ± 0%  -31.40% (p=0.000 n=10)
MemmoveKnownSize1024    39.66n ± 0%    27.00n ± 0%  -31.93% (p=0.000 n=10)
geomean                 9.538n         7.392n       -22.50%
Change-Id: I7b17408cd0a500ceaa80bc93ffe2f19ddeea9c0d
Reviewed-on: https://go-review.googlesource.com/c/go/+/692315
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
parent
61d1ff61ad
commit
5c9a26c7f8
3 changed files with 79 additions and 33 deletions
|
|
@ -1189,8 +1189,9 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
||||||
if dstReg == srcReg {
|
if dstReg == srcReg {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
tmpReg1 := int16(arm64.REG_R24)
|
tmpReg1 := int16(arm64.REG_R25)
|
||||||
tmpReg2 := int16(arm64.REG_R25)
|
tmpFReg1 := int16(arm64.REG_F16)
|
||||||
|
tmpFReg2 := int16(arm64.REG_F17)
|
||||||
n := v.AuxInt
|
n := v.AuxInt
|
||||||
if n < 16 {
|
if n < 16 {
|
||||||
v.Fatalf("Move too small %d", n)
|
v.Fatalf("Move too small %d", n)
|
||||||
|
|
@ -1198,10 +1199,17 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
||||||
|
|
||||||
// Generate copying instructions.
|
// Generate copying instructions.
|
||||||
var off int64
|
var off int64
|
||||||
|
for n >= 32 {
|
||||||
|
// FLDPQ off(srcReg), (tmpFReg1, tmpFReg2)
|
||||||
|
// FSTPQ (tmpFReg1, tmpFReg2), off(dstReg)
|
||||||
|
move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, off, false)
|
||||||
|
off += 32
|
||||||
|
n -= 32
|
||||||
|
}
|
||||||
for n >= 16 {
|
for n >= 16 {
|
||||||
// LDP off(srcReg), (tmpReg1, tmpReg2)
|
// FMOVQ off(src), tmpFReg1
|
||||||
// STP (tmpReg1, tmpReg2), off(dstReg)
|
// FMOVQ tmpFReg1, off(dst)
|
||||||
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
|
move16(s, srcReg, dstReg, tmpFReg1, off, false)
|
||||||
off += 16
|
off += 16
|
||||||
n -= 16
|
n -= 16
|
||||||
}
|
}
|
||||||
|
|
@ -1223,9 +1231,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
||||||
if dstReg == srcReg {
|
if dstReg == srcReg {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
countReg := int16(arm64.REG_R23)
|
countReg := int16(arm64.REG_R24)
|
||||||
tmpReg1 := int16(arm64.REG_R24)
|
tmpReg1 := int16(arm64.REG_R25)
|
||||||
tmpReg2 := int16(arm64.REG_R25)
|
tmpFReg1 := int16(arm64.REG_F16)
|
||||||
|
tmpFReg2 := int16(arm64.REG_F17)
|
||||||
n := v.AuxInt
|
n := v.AuxInt
|
||||||
loopSize := int64(64)
|
loopSize := int64(64)
|
||||||
if n < 3*loopSize {
|
if n < 3*loopSize {
|
||||||
|
|
@ -1251,10 +1260,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
||||||
|
|
||||||
// Move loopSize bytes starting at srcReg to dstReg.
|
// Move loopSize bytes starting at srcReg to dstReg.
|
||||||
// Increment srcReg and destReg by loopSize as a side effect.
|
// Increment srcReg and destReg by loopSize as a side effect.
|
||||||
for range loopSize / 16 {
|
for range loopSize / 32 {
|
||||||
// LDP.P 16(srcReg), (tmpReg1, tmpReg2)
|
// FLDPQ.P 32(srcReg), (tmpFReg1, tmpFReg2)
|
||||||
// STP.P (tmpReg1, tmpReg2), 16(dstReg)
|
// FSTPQ.P (tmpFReg1, tmpFReg2), 32(dstReg)
|
||||||
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, 0, true)
|
move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, 0, true)
|
||||||
}
|
}
|
||||||
// Decrement loop count.
|
// Decrement loop count.
|
||||||
// SUB $1, countReg
|
// SUB $1, countReg
|
||||||
|
|
@ -1276,10 +1285,17 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
||||||
|
|
||||||
// Copy any fractional portion.
|
// Copy any fractional portion.
|
||||||
var off int64
|
var off int64
|
||||||
|
for n >= 32 {
|
||||||
|
// FLDPQ off(srcReg), (tmpFReg1, tmpFReg2)
|
||||||
|
// FSTPQ (tmpFReg1, tmpFReg2), off(dstReg)
|
||||||
|
move32(s, srcReg, dstReg, tmpFReg1, tmpFReg2, off, false)
|
||||||
|
off += 32
|
||||||
|
n -= 32
|
||||||
|
}
|
||||||
for n >= 16 {
|
for n >= 16 {
|
||||||
// LDP off(srcReg), (tmpReg1, tmpReg2)
|
// FMOVQ off(src), tmpFReg1
|
||||||
// STP (tmpReg1, tmpReg2), off(dstReg)
|
// FMOVQ tmpFReg1, off(dst)
|
||||||
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
|
move16(s, srcReg, dstReg, tmpFReg1, off, false)
|
||||||
off += 16
|
off += 16
|
||||||
n -= 16
|
n -= 16
|
||||||
}
|
}
|
||||||
|
|
@ -1699,26 +1715,55 @@ func zero8(s *ssagen.State, reg int16, off int64) {
|
||||||
p.To.Offset = off
|
p.To.Offset = off
|
||||||
}
|
}
|
||||||
|
|
||||||
// move16 copies 16 bytes at src+off to dst+off.
|
// move32 copies 32 bytes at src+off to dst+off.
|
||||||
// Uses registers tmp1 and tmp2.
|
// Uses registers tmp1 and tmp2.
|
||||||
// If postInc is true, increment src and dst by 16.
|
// If postInc is true, increment src and dst by 32.
|
||||||
func move16(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
|
func move32(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
|
||||||
// LDP off(src), (tmp1, tmp2)
|
// FLDPQ off(src), (tmp1, tmp2)
|
||||||
ld := s.Prog(arm64.ALDP)
|
ld := s.Prog(arm64.AFLDPQ)
|
||||||
ld.From.Type = obj.TYPE_MEM
|
ld.From.Type = obj.TYPE_MEM
|
||||||
ld.From.Reg = src
|
ld.From.Reg = src
|
||||||
ld.From.Offset = off
|
ld.From.Offset = off
|
||||||
ld.To.Type = obj.TYPE_REGREG
|
ld.To.Type = obj.TYPE_REGREG
|
||||||
ld.To.Reg = tmp1
|
ld.To.Reg = tmp1
|
||||||
ld.To.Offset = int64(tmp2)
|
ld.To.Offset = int64(tmp2)
|
||||||
// STP (tmp1, tmp2), off(dst)
|
// FSTPQ (tmp1, tmp2), off(dst)
|
||||||
st := s.Prog(arm64.ASTP)
|
st := s.Prog(arm64.AFSTPQ)
|
||||||
st.From.Type = obj.TYPE_REGREG
|
st.From.Type = obj.TYPE_REGREG
|
||||||
st.From.Reg = tmp1
|
st.From.Reg = tmp1
|
||||||
st.From.Offset = int64(tmp2)
|
st.From.Offset = int64(tmp2)
|
||||||
st.To.Type = obj.TYPE_MEM
|
st.To.Type = obj.TYPE_MEM
|
||||||
st.To.Reg = dst
|
st.To.Reg = dst
|
||||||
st.To.Offset = off
|
st.To.Offset = off
|
||||||
|
if postInc {
|
||||||
|
if off != 0 {
|
||||||
|
panic("can't postinc with non-zero offset")
|
||||||
|
}
|
||||||
|
ld.Scond = arm64.C_XPOST
|
||||||
|
st.Scond = arm64.C_XPOST
|
||||||
|
ld.From.Offset = 32
|
||||||
|
st.To.Offset = 32
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// move16 copies 16 bytes at src+off to dst+off.
|
||||||
|
// Uses register tmp1
|
||||||
|
// If postInc is true, increment src and dst by 16.
|
||||||
|
func move16(s *ssagen.State, src, dst, tmp1 int16, off int64, postInc bool) {
|
||||||
|
// FMOVQ off(src), tmp1
|
||||||
|
ld := s.Prog(arm64.AFMOVQ)
|
||||||
|
ld.From.Type = obj.TYPE_MEM
|
||||||
|
ld.From.Reg = src
|
||||||
|
ld.From.Offset = off
|
||||||
|
ld.To.Type = obj.TYPE_REG
|
||||||
|
ld.To.Reg = tmp1
|
||||||
|
// FMOVQ tmp1, off(dst)
|
||||||
|
st := s.Prog(arm64.AFMOVQ)
|
||||||
|
st.From.Type = obj.TYPE_REG
|
||||||
|
st.From.Reg = tmp1
|
||||||
|
st.To.Type = obj.TYPE_MEM
|
||||||
|
st.To.Reg = dst
|
||||||
|
st.To.Offset = off
|
||||||
if postInc {
|
if postInc {
|
||||||
if off != 0 {
|
if off != 0 {
|
||||||
panic("can't postinc with non-zero offset")
|
panic("can't postinc with non-zero offset")
|
||||||
|
|
|
||||||
|
|
@ -144,8 +144,9 @@ func init() {
|
||||||
gpspsbg = gpspg | buildReg("SB")
|
gpspsbg = gpspg | buildReg("SB")
|
||||||
fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
|
fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
|
||||||
callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
|
callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
|
||||||
|
r25 = buildReg("R25")
|
||||||
r24to25 = buildReg("R24 R25")
|
r24to25 = buildReg("R24 R25")
|
||||||
r23to25 = buildReg("R23 R24 R25")
|
f16to17 = buildReg("F16 F17")
|
||||||
rz = buildReg("ZERO")
|
rz = buildReg("ZERO")
|
||||||
first16 = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
|
first16 = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
|
||||||
)
|
)
|
||||||
|
|
@ -599,8 +600,8 @@ func init() {
|
||||||
aux: "Int64",
|
aux: "Int64",
|
||||||
argLength: 3,
|
argLength: 3,
|
||||||
reg: regInfo{
|
reg: regInfo{
|
||||||
inputs: []regMask{gp &^ r24to25, gp &^ r24to25},
|
inputs: []regMask{gp &^ r25, gp &^ r25},
|
||||||
clobbers: r24to25, // TODO: figure out needIntTemp x2
|
clobbers: r25 | f16to17, // TODO: figure out needIntTemp + x2 for floats
|
||||||
},
|
},
|
||||||
faultOnNilArg0: true,
|
faultOnNilArg0: true,
|
||||||
faultOnNilArg1: true,
|
faultOnNilArg1: true,
|
||||||
|
|
@ -617,8 +618,8 @@ func init() {
|
||||||
aux: "Int64",
|
aux: "Int64",
|
||||||
argLength: 3,
|
argLength: 3,
|
||||||
reg: regInfo{
|
reg: regInfo{
|
||||||
inputs: []regMask{gp &^ r23to25, gp &^ r23to25},
|
inputs: []regMask{gp &^ r24to25, gp &^ r24to25},
|
||||||
clobbers: r23to25, // TODO: figure out needIntTemp x3
|
clobbers: r24to25 | f16to17, // TODO: figure out needIntTemp x2 + x2 for floats
|
||||||
clobbersArg0: true,
|
clobbersArg0: true,
|
||||||
clobbersArg1: true,
|
clobbersArg1: true,
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -23199,10 +23199,10 @@ var opcodeTable = [...]opInfo{
|
||||||
faultOnNilArg1: true,
|
faultOnNilArg1: true,
|
||||||
reg: regInfo{
|
reg: regInfo{
|
||||||
inputs: []inputInfo{
|
inputs: []inputInfo{
|
||||||
{0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
|
{0, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
|
||||||
{1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
|
{1, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
|
||||||
},
|
},
|
||||||
clobbers: 25165824, // R24 R25
|
clobbers: 422212481843200, // R25 F16 F17
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -23213,10 +23213,10 @@ var opcodeTable = [...]opInfo{
|
||||||
faultOnNilArg1: true,
|
faultOnNilArg1: true,
|
||||||
reg: regInfo{
|
reg: regInfo{
|
||||||
inputs: []inputInfo{
|
inputs: []inputInfo{
|
||||||
{0, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
|
{0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
|
||||||
{1, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
|
{1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
|
||||||
},
|
},
|
||||||
clobbers: 29360128, // R23 R24 R25
|
clobbers: 422212490231808, // R24 R25 F16 F17
|
||||||
clobbersArg0: true,
|
clobbersArg0: true,
|
||||||
clobbersArg1: true,
|
clobbersArg1: true,
|
||||||
},
|
},
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue