[dev.simd] cmd/compile: use X15 for zero value in AVX context

With the previous CL, the X15 (aliased with Y15, Z15) register
holds the zero value for the whole register width. Use that in
AVX context when a zero value is needed.

Change-Id: If49b7059bce50c5e86f90bace0eaa830a91fa0fc
Reviewed-on: https://go-review.googlesource.com/c/go/+/698238
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
TryBot-Bypass: Cherry Mui <cherryyz@google.com>
This commit is contained in:
Cherry Mui 2025-08-21 15:22:57 -04:00
parent 4c311aa38f
commit 8d874834f1
4 changed files with 998 additions and 988 deletions

View file

@ -1713,12 +1713,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
case ssa.OpAMD64VZEROUPPER, ssa.OpAMD64VZEROALL: case ssa.OpAMD64VZEROUPPER, ssa.OpAMD64VZEROALL:
s.Prog(v.Op.Asm()) s.Prog(v.Op.Asm())
case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512: case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512:
p := s.Prog(v.Op.Asm()) // zero-width, no instruction generated
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v)
p.AddRestSourceReg(simdReg(v))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VPADDD4: case ssa.OpAMD64VPADDD4:
p := s.Prog(v.Op.Asm()) p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG p.From.Type = obj.TYPE_REG

View file

@ -132,6 +132,9 @@ func init() {
gpspsb = gpsp | buildReg("SB") gpspsb = gpsp | buildReg("SB")
gpspsbg = gpspsb | g gpspsbg = gpspsb | g
callerSave = gp | fp | g // runtime.setg (and anything calling it) may clobber g callerSave = gp | fp | g // runtime.setg (and anything calling it) may clobber g
vz = v | x15
wz = w | x15
) )
// Common slices of register masks // Common slices of register masks
var ( var (
@ -140,6 +143,8 @@ func init() {
vonly = []regMask{v} vonly = []regMask{v}
wonly = []regMask{w} wonly = []regMask{w}
maskonly = []regMask{mask} maskonly = []regMask{mask}
vzonly = []regMask{vz}
wzonly = []regMask{wz}
) )
// Common regInfo // Common regInfo
@ -207,26 +212,24 @@ func init() {
vloadk = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: vonly} vloadk = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: vonly}
vstorek = regInfo{inputs: []regMask{gpspsb, mask, v, 0}} vstorek = regInfo{inputs: []regMask{gpspsb, mask, v, 0}}
v01 = regInfo{inputs: nil, outputs: vonly} v11 = regInfo{inputs: vzonly, outputs: vonly}
v11 = regInfo{inputs: vonly, outputs: vonly} v21 = regInfo{inputs: []regMask{vz, vz}, outputs: vonly}
v21 = regInfo{inputs: []regMask{v, v}, outputs: vonly} vk = regInfo{inputs: vzonly, outputs: maskonly}
vk = regInfo{inputs: vonly, outputs: maskonly}
kv = regInfo{inputs: maskonly, outputs: vonly} kv = regInfo{inputs: maskonly, outputs: vonly}
v2k = regInfo{inputs: []regMask{v, v}, outputs: maskonly} v2k = regInfo{inputs: []regMask{vz, vz}, outputs: maskonly}
vkv = regInfo{inputs: []regMask{v, mask}, outputs: vonly} vkv = regInfo{inputs: []regMask{vz, mask}, outputs: vonly}
v2kv = regInfo{inputs: []regMask{v, v, mask}, outputs: vonly} v2kv = regInfo{inputs: []regMask{vz, vz, mask}, outputs: vonly}
v2kk = regInfo{inputs: []regMask{v, v, mask}, outputs: maskonly} v2kk = regInfo{inputs: []regMask{vz, vz, mask}, outputs: maskonly}
v31 = regInfo{inputs: []regMask{v, v, v}, outputs: vonly} v31 = regInfo{inputs: []regMask{v, vz, vz}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
v3kv = regInfo{inputs: []regMask{v, v, v, mask}, outputs: vonly} v3kv = regInfo{inputs: []regMask{v, vz, vz, mask}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
vgpv = regInfo{inputs: []regMask{v, gp}, outputs: vonly} vgpv = regInfo{inputs: []regMask{vz, gp}, outputs: vonly}
vgp = regInfo{inputs: vonly, outputs: gponly} vgp = regInfo{inputs: vonly, outputs: gponly}
vfpv = regInfo{inputs: []regMask{v, fp}, outputs: vonly} vfpv = regInfo{inputs: []regMask{vz, fp}, outputs: vonly}
vfpkv = regInfo{inputs: []regMask{v, fp, mask}, outputs: vonly} vfpkv = regInfo{inputs: []regMask{vz, fp, mask}, outputs: vonly}
w01 = regInfo{inputs: nil, outputs: wonly} w11 = regInfo{inputs: wzonly, outputs: wonly}
w11 = regInfo{inputs: wonly, outputs: wonly} w21 = regInfo{inputs: []regMask{wz, wz}, outputs: wonly}
w21 = regInfo{inputs: []regMask{w, w}, outputs: wonly} wk = regInfo{inputs: wzonly, outputs: maskonly}
wk = regInfo{inputs: wonly, outputs: maskonly}
kw = regInfo{inputs: maskonly, outputs: wonly} kw = regInfo{inputs: maskonly, outputs: wonly}
w2k = regInfo{inputs: []regMask{fp, fp}, outputs: maskonly} w2k = regInfo{inputs: []regMask{fp, fp}, outputs: maskonly}
wkw = regInfo{inputs: []regMask{fp, mask}, outputs: fponly} wkw = regInfo{inputs: []regMask{fp, mask}, outputs: fponly}
@ -235,15 +238,17 @@ func init() {
w31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: fponly} w31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: fponly}
w3kw = regInfo{inputs: []regMask{fp, fp, fp, mask}, outputs: fponly} w3kw = regInfo{inputs: []regMask{fp, fp, fp, mask}, outputs: fponly}
wgpw = regInfo{inputs: []regMask{fp, gp}, outputs: fponly} wgpw = regInfo{inputs: []regMask{fp, gp}, outputs: fponly}
wgp = regInfo{inputs: wonly, outputs: gponly} wgp = regInfo{inputs: wzonly, outputs: gponly}
wfpw = regInfo{inputs: []regMask{w, fp}, outputs: wonly} wfpw = regInfo{inputs: []regMask{wz, fp}, outputs: wonly}
wfpkw = regInfo{inputs: []regMask{w, fp, mask}, outputs: wonly} wfpkw = regInfo{inputs: []regMask{wz, fp, mask}, outputs: wonly}
kload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly} kload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly}
kstore = regInfo{inputs: []regMask{gpspsb, mask, 0}} kstore = regInfo{inputs: []regMask{gpspsb, mask, 0}}
gpk = regInfo{inputs: gponly, outputs: maskonly} gpk = regInfo{inputs: gponly, outputs: maskonly}
kgp = regInfo{inputs: maskonly, outputs: gponly} kgp = regInfo{inputs: maskonly, outputs: gponly}
x15only = regInfo{inputs: nil, outputs: []regMask{x15}}
prefreg = regInfo{inputs: []regMask{gpspsbg}} prefreg = regInfo{inputs: []regMask{gpspsbg}}
) )
@ -1375,9 +1380,9 @@ func init() {
{name: "VPMOVVec64x4ToM", argLength: 1, reg: vk, asm: "VPMOVQ2M"}, {name: "VPMOVVec64x4ToM", argLength: 1, reg: vk, asm: "VPMOVQ2M"},
{name: "VPMOVVec64x8ToM", argLength: 1, reg: wk, asm: "VPMOVQ2M"}, {name: "VPMOVVec64x8ToM", argLength: 1, reg: wk, asm: "VPMOVQ2M"},
{name: "Zero128", argLength: 0, reg: v01, asm: "VPXOR"}, {name: "Zero128", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
{name: "Zero256", argLength: 0, reg: v01, asm: "VPXOR"}, {name: "Zero256", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
{name: "Zero512", argLength: 0, reg: w01, asm: "VPXORQ"}, {name: "Zero512", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true},
{name: "VZEROUPPER", argLength: 0, asm: "VZEROUPPER"}, {name: "VZEROUPPER", argLength: 0, asm: "VZEROUPPER"},
{name: "VZEROALL", argLength: 0, asm: "VZEROALL"}, {name: "VZEROALL", argLength: 0, asm: "VZEROALL"},
@ -1433,7 +1438,7 @@ func init() {
ParamFloatRegNames: "X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14", ParamFloatRegNames: "X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14",
gpregmask: gp, gpregmask: gp,
fpregmask: fp, fpregmask: fp,
specialregmask: x15 | mask, specialregmask: mask,
framepointerreg: int8(num["BP"]), framepointerreg: int8(num["BP"]),
linkreg: -1, // not used linkreg: -1, // not used
}) })

File diff suppressed because it is too large Load diff

View file

@ -1440,6 +1440,13 @@ func (s *regAllocState) regalloc(f *Func) {
s.sb = v.ID s.sb = v.ID
case OpARM64ZERO: case OpARM64ZERO:
s.assignReg(s.ZeroIntReg, v, v) s.assignReg(s.ZeroIntReg, v, v)
case OpAMD64Zero128, OpAMD64Zero256, OpAMD64Zero512:
regspec := s.regspec(v)
m := regspec.outputs[0].regs
if countRegs(m) != 1 {
f.Fatalf("bad fixed-register op %s", v)
}
s.assignReg(pickReg(m), v, v)
default: default:
f.Fatalf("unknown fixed-register op %s", v) f.Fatalf("unknown fixed-register op %s", v)
} }