[dev.simd] cmd/compile: changes for AVX2 SIMD masked load/store

These are "glue" changes and hand work for the AVX2
masked loads/stores.  Does not include generated
function/method declarations or intrinsic registration.

Change-Id: Ic95f90b117d0c471f174407ce3f729f1f517b23c
Reviewed-on: https://go-review.googlesource.com/c/go/+/689295
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
David Chase 2025-07-16 13:29:14 -04:00
parent 88568519b4
commit a0b87a7478
9 changed files with 404 additions and 3 deletions

View file

@ -1476,6 +1476,24 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p.To.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.To, v)
case ssa.OpAMD64VPMASK32load128, ssa.OpAMD64VPMASK64load128, ssa.OpAMD64VPMASK32load256, ssa.OpAMD64VPMASK64load256:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
p.AddRestSourceReg(simdReg(v.Args[1])) // masking simd reg
case ssa.OpAMD64VPMASK32store128, ssa.OpAMD64VPMASK64store128, ssa.OpAMD64VPMASK32store256, ssa.OpAMD64VPMASK64store256:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[2])
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.To, v)
p.AddRestSourceReg(simdReg(v.Args[1])) // masking simd reg
case ssa.OpAMD64VPMOVMToVec8x16,
ssa.OpAMD64VPMOVMToVec8x32,
ssa.OpAMD64VPMOVMToVec8x64,

View file

@ -1715,17 +1715,24 @@
(StoreMask64x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x8ToM <t> val) mem)
(Load <t> ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem)
(Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem)
(Load <t> ptr mem) && t.Size() == 32 => (VMOVDQUload256 ptr mem)
(Store {t} ptr val mem) && t.Size() == 32 => (VMOVDQUstore256 ptr val mem)
(Load <t> ptr mem) && t.Size() == 64 => (VMOVDQUload512 ptr mem)
(Store {t} ptr val mem) && t.Size() == 64 => (VMOVDQUstore512 ptr val mem)
(LoadMasked32 <t> ptr mask mem) && t.Size() == 16 => (VPMASK32load128 ptr mask mem)
(LoadMasked32 <t> ptr mask mem) && t.Size() == 32 => (VPMASK32load256 ptr mask mem)
(LoadMasked64 <t> ptr mask mem) && t.Size() == 16 => (VPMASK64load128 ptr mask mem)
(LoadMasked64 <t> ptr mask mem) && t.Size() == 32 => (VPMASK64load256 ptr mask mem)
(StoreMasked32 {t} ptr mask val mem) && t.Size() == 16 => (VPMASK32store128 ptr mask val mem)
(StoreMasked32 {t} ptr mask val mem) && t.Size() == 32 => (VPMASK32store256 ptr mask val mem)
(StoreMasked64 {t} ptr mask val mem) && t.Size() == 16 => (VPMASK64store128 ptr mask val mem)
(StoreMasked64 {t} ptr mask val mem) && t.Size() == 32 => (VPMASK64store256 ptr mask val mem)
(ZeroSIMD <t>) && t.Size() == 16 => (Zero128 <t>)
(ZeroSIMD <t>) && t.Size() == 32 => (Zero256 <t>)
(ZeroSIMD <t>) && t.Size() == 64 => (Zero512 <t>)

View file

@ -202,6 +202,12 @@ func init() {
fpstore = regInfo{inputs: []regMask{gpspsb, fp, 0}}
fpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, fp, 0}}
// masked loads/stores, vector register or mask register
vloadv = regInfo{inputs: []regMask{gpspsb, v, 0}, outputs: vonly}
vstorev = regInfo{inputs: []regMask{gpspsb, v, v, 0}}
// vloadk = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: vonly}
// vstorek = regInfo{inputs: []regMask{gpspsb, mask, v, 0}}
v01 = regInfo{inputs: nil, outputs: vonly}
v11 = regInfo{inputs: vonly, outputs: vonly}
v21 = regInfo{inputs: []regMask{v, v}, outputs: vonly}
@ -1279,6 +1285,17 @@ func init() {
{name: "VMOVDQUload512", argLength: 2, reg: fpload, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1 = mem
{name: "VMOVDQUstore512", argLength: 3, reg: fpstore, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg1, arg2 = mem
// AVX2 32 and 64-bit element masked moves.
{name: "VPMASK32load128", argLength: 3, reg: vloadv, asm: "VPMASKMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=integer mask, arg2 = mem
{name: "VPMASK32store128", argLength: 4, reg: vstorev, asm: "VPMASKMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=integer mask, arg3 = mem
{name: "VPMASK64load128", argLength: 3, reg: vloadv, asm: "VPMASKMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=integer mask, arg2 = mem
{name: "VPMASK64store128", argLength: 4, reg: vstorev, asm: "VPMASKMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=integer mask, arg3 = mem
{name: "VPMASK32load256", argLength: 3, reg: vloadv, asm: "VPMASKMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=integer mask, arg2 = mem
{name: "VPMASK32store256", argLength: 4, reg: vstorev, asm: "VPMASKMOVD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=integer mask, arg3 = mem
{name: "VPMASK64load256", argLength: 3, reg: vloadv, asm: "VPMASKMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1=integer mask, arg2 = mem
{name: "VPMASK64store256", argLength: 4, reg: vstorev, asm: "VPMASKMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg2, arg1=integer mask, arg3 = mem
{name: "VPMOVMToVec8x16", argLength: 1, reg: kv, asm: "VPMOVM2B"},
{name: "VPMOVMToVec8x32", argLength: 1, reg: kv, asm: "VPMOVM2B"},
{name: "VPMOVMToVec8x64", argLength: 1, reg: kw, asm: "VPMOVM2B"},

View file

@ -372,6 +372,14 @@ var genericOps = []opData{
{name: "Load", argLength: 2}, // Load from arg0. arg1=memory
{name: "Dereference", argLength: 2}, // Load from arg0. arg1=memory. Helper op for arg/result passing, result is an otherwise not-SSA-able "value".
{name: "Store", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
// masked memory operations.
// TODO add 16 and 8
{name: "LoadMasked32", argLength: 3}, // Load from arg0, arg1 = mask of 32-bits, arg2 = memory
{name: "LoadMasked64", argLength: 3}, // Load from arg0, arg1 = mask of 64-bits, arg2 = memory
{name: "StoreMasked32", argLength: 4, typ: "Mem", aux: "Typ"}, // Store arg2 to arg0, arg1=mask of 32-bits, arg3 = memory
{name: "StoreMasked64", argLength: 4, typ: "Mem", aux: "Typ"}, // Store arg2 to arg0, arg1=mask of 64-bits, arg3 = memory
// Normally we require that the source and destination of Move do not overlap.
// There is an exception when we know all the loads will happen before all
// the stores. In that case, overlap is ok. See

View file

@ -631,6 +631,19 @@ func (b *Block) NewValue4(pos src.XPos, op Op, t *types.Type, arg0, arg1, arg2,
return v
}
// NewValue4A returns a new value in the block with four arguments and one aux value.
func (b *Block) NewValue4A(pos src.XPos, op Op, t *types.Type, aux Aux, arg0, arg1, arg2, arg3 *Value) *Value {
	v := b.Func.newValue(op, t, b, pos)
	v.AuxInt = 0
	v.Aux = aux
	v.Args = []*Value{arg0, arg1, arg2, arg3}
	arg0.Uses++
	arg1.Uses++
	arg2.Uses++
	arg3.Uses++
	return v
}
// NewValue4I returns a new value in the block with four arguments and auxint value.
func (b *Block) NewValue4I(pos src.XPos, op Op, t *types.Type, auxint int64, arg0, arg1, arg2, arg3 *Value) *Value {
v := b.Func.newValue(op, t, b, pos)

View file

@ -1169,6 +1169,14 @@ const (
OpAMD64VMOVDQUstore256
OpAMD64VMOVDQUload512
OpAMD64VMOVDQUstore512
OpAMD64VPMASK32load128
OpAMD64VPMASK32store128
OpAMD64VPMASK64load128
OpAMD64VPMASK64store128
OpAMD64VPMASK32load256
OpAMD64VPMASK32store256
OpAMD64VPMASK64load256
OpAMD64VPMASK64store256
OpAMD64VPMOVMToVec8x16
OpAMD64VPMOVMToVec8x32
OpAMD64VPMOVMToVec8x64
@ -4246,6 +4254,10 @@ const (
OpLoad
OpDereference
OpStore
OpLoadMasked32
OpLoadMasked64
OpStoreMasked32
OpStoreMasked64
OpMove
OpZero
OpStoreWB
@ -18481,6 +18493,134 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "VPMASK32load128",
auxType: auxSymOff,
argLen: 3,
faultOnNilArg0: true,
symEffect: SymRead,
asm: x86.AVPMASKMOVD,
reg: regInfo{
inputs: []inputInfo{
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPMASK32store128",
auxType: auxSymOff,
argLen: 4,
faultOnNilArg0: true,
symEffect: SymWrite,
asm: x86.AVPMASKMOVD,
reg: regInfo{
inputs: []inputInfo{
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
},
},
},
{
name: "VPMASK64load128",
auxType: auxSymOff,
argLen: 3,
faultOnNilArg0: true,
symEffect: SymRead,
asm: x86.AVPMASKMOVQ,
reg: regInfo{
inputs: []inputInfo{
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPMASK64store128",
auxType: auxSymOff,
argLen: 4,
faultOnNilArg0: true,
symEffect: SymWrite,
asm: x86.AVPMASKMOVQ,
reg: regInfo{
inputs: []inputInfo{
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
},
},
},
{
name: "VPMASK32load256",
auxType: auxSymOff,
argLen: 3,
faultOnNilArg0: true,
symEffect: SymRead,
asm: x86.AVPMASKMOVD,
reg: regInfo{
inputs: []inputInfo{
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPMASK32store256",
auxType: auxSymOff,
argLen: 4,
faultOnNilArg0: true,
symEffect: SymWrite,
asm: x86.AVPMASKMOVD,
reg: regInfo{
inputs: []inputInfo{
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
},
},
},
{
name: "VPMASK64load256",
auxType: auxSymOff,
argLen: 3,
faultOnNilArg0: true,
symEffect: SymRead,
asm: x86.AVPMASKMOVQ,
reg: regInfo{
inputs: []inputInfo{
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPMASK64store256",
auxType: auxSymOff,
argLen: 4,
faultOnNilArg0: true,
symEffect: SymWrite,
asm: x86.AVPMASKMOVQ,
reg: regInfo{
inputs: []inputInfo{
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{2, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{0, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
},
},
},
{
name: "VPMOVMToVec8x16",
argLen: 1,
@ -59969,6 +60109,28 @@ var opcodeTable = [...]opInfo{
argLen: 3,
generic: true,
},
{
name: "LoadMasked32",
argLen: 3,
generic: true,
},
{
name: "LoadMasked64",
argLen: 3,
generic: true,
},
{
name: "StoreMasked32",
auxType: auxTyp,
argLen: 4,
generic: true,
},
{
name: "StoreMasked64",
auxType: auxTyp,
argLen: 4,
generic: true,
},
{
name: "Move",
auxType: auxTypSize,

View file

@ -2462,6 +2462,10 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpLoadMask8x32(v)
case OpLoadMask8x64:
return rewriteValueAMD64_OpLoadMask8x64(v)
case OpLoadMasked32:
return rewriteValueAMD64_OpLoadMasked32(v)
case OpLoadMasked64:
return rewriteValueAMD64_OpLoadMasked64(v)
case OpLocalAddr:
return rewriteValueAMD64_OpLocalAddr(v)
case OpLsh16x16:
@ -5208,6 +5212,10 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpStoreMask8x32(v)
case OpStoreMask8x64:
return rewriteValueAMD64_OpStoreMask8x64(v)
case OpStoreMasked32:
return rewriteValueAMD64_OpStoreMasked32(v)
case OpStoreMasked64:
return rewriteValueAMD64_OpStoreMasked64(v)
case OpSub16:
v.Op = OpAMD64SUBL
return true
@ -40555,6 +40563,78 @@ func rewriteValueAMD64_OpLoadMask8x64(v *Value) bool {
return true
}
}
// rewriteValueAMD64_OpLoadMasked32 lowers a generic 32-bit-element masked
// load to the AVX2 VPMASKMOVD-backed op matching the result type's size.
// Only 16-byte (128-bit) and 32-byte (256-bit) vector types are handled.
func rewriteValueAMD64_OpLoadMasked32(v *Value) bool {
	// Capture operands before reset clears the argument list.
	ptr, mask, mem := v.Args[0], v.Args[1], v.Args[2]
	// (LoadMasked32 <t> ptr mask mem) && t.Size() == 16 => (VPMASK32load128 ptr mask mem)
	// (LoadMasked32 <t> ptr mask mem) && t.Size() == 32 => (VPMASK32load256 ptr mask mem)
	switch v.Type.Size() {
	case 16:
		v.reset(OpAMD64VPMASK32load128)
	case 32:
		v.reset(OpAMD64VPMASK32load256)
	default:
		return false
	}
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpLoadMasked64 lowers a generic 64-bit-element masked
// load to the AVX2 VPMASKMOVQ-backed op matching the result type's size.
// Only 16-byte (128-bit) and 32-byte (256-bit) vector types are handled.
func rewriteValueAMD64_OpLoadMasked64(v *Value) bool {
	// Capture operands before reset clears the argument list.
	ptr, mask, mem := v.Args[0], v.Args[1], v.Args[2]
	// (LoadMasked64 <t> ptr mask mem) && t.Size() == 16 => (VPMASK64load128 ptr mask mem)
	// (LoadMasked64 <t> ptr mask mem) && t.Size() == 32 => (VPMASK64load256 ptr mask mem)
	switch v.Type.Size() {
	case 16:
		v.reset(OpAMD64VPMASK64load128)
	case 32:
		v.reset(OpAMD64VPMASK64load256)
	default:
		return false
	}
	v.AddArg3(ptr, mask, mem)
	return true
}
func rewriteValueAMD64_OpLocalAddr(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
@ -53517,6 +53597,84 @@ func rewriteValueAMD64_OpStoreMask8x64(v *Value) bool {
return true
}
}
// rewriteValueAMD64_OpStoreMasked32 lowers a generic 32-bit-element masked
// store to the AVX2 VPMASKMOVD-backed op chosen by the aux type's size.
// Only 16-byte (128-bit) and 32-byte (256-bit) vector types are handled.
func rewriteValueAMD64_OpStoreMasked32(v *Value) bool {
	// Read the aux type and operands before reset clears them.
	t := auxToType(v.Aux)
	ptr, mask, val, mem := v.Args[0], v.Args[1], v.Args[2], v.Args[3]
	// (StoreMasked32 {t} ptr mask val mem) && t.Size() == 16 => (VPMASK32store128 ptr mask val mem)
	// (StoreMasked32 {t} ptr mask val mem) && t.Size() == 32 => (VPMASK32store256 ptr mask val mem)
	switch t.Size() {
	case 16:
		v.reset(OpAMD64VPMASK32store128)
	case 32:
		v.reset(OpAMD64VPMASK32store256)
	default:
		return false
	}
	v.AddArg4(ptr, mask, val, mem)
	return true
}
// rewriteValueAMD64_OpStoreMasked64 lowers a generic 64-bit-element masked
// store to the AVX2 VPMASKMOVQ-backed op chosen by the aux type's size.
// Only 16-byte (128-bit) and 32-byte (256-bit) vector types are handled.
func rewriteValueAMD64_OpStoreMasked64(v *Value) bool {
	// Read the aux type and operands before reset clears them.
	t := auxToType(v.Aux)
	ptr, mask, val, mem := v.Args[0], v.Args[1], v.Args[2], v.Args[3]
	// (StoreMasked64 {t} ptr mask val mem) && t.Size() == 16 => (VPMASK64store128 ptr mask val mem)
	// (StoreMasked64 {t} ptr mask val mem) && t.Size() == 32 => (VPMASK64store256 ptr mask val mem)
	switch t.Size() {
	case 16:
		v.reset(OpAMD64VPMASK64store128)
	case 32:
		v.reset(OpAMD64VPMASK64store256)
	default:
		return false
	}
	v.AddArg4(ptr, mask, val, mem)
	return true
}
func rewriteValueAMD64_OpSubMaskedFloat32x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]

View file

@ -1808,6 +1808,19 @@ func simdStoreMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*s
}
}
// simdMaskedLoad returns an intrinsic builder that emits op as a masked
// vector load. args[0] is placed in the op's address slot and args[1] in
// its mask slot, with the current memory state as the final operand; the
// result takes the call expression's type.
func simdMaskedLoad(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue3(op, n.Type(), args[0], args[1], s.mem())
	}
}
// simdMaskedStore returns an intrinsic builder that emits op as a masked
// vector store and updates the memory variable; the builder itself yields
// no SSA value. Note the operand shuffle: args[1] goes in the op's address
// slot, args[2] in its mask slot, and args[0] — whose type also becomes
// the op's aux — in its value slot. Presumably the intrinsic is
// method-style with the stored vector as the receiver (args[0]); confirm
// against the intrinsic registration.
func simdMaskedStore(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		s.vars[memVar] = s.newValue4A(op, types.TypeMem, args[0].Type, args[1], args[2], args[0], s.mem())
		return nil
	}
}
// findIntrinsic returns a function which builds the SSA equivalent of the
// function identified by the symbol sym. If sym is not an intrinsic call, returns nil.
func findIntrinsic(sym *types.Sym) intrinsicBuilder {

View file

@ -1270,6 +1270,11 @@ func (s *state) newValue4(op ssa.Op, t *types.Type, arg0, arg1, arg2, arg3 *ssa.
return s.curBlock.NewValue4(s.peekPos(), op, t, arg0, arg1, arg2, arg3)
}
// newValue4A adds a new value with four arguments and an aux value to the
// current block, at the position reported by peekPos.
func (s *state) newValue4A(op ssa.Op, t *types.Type, aux ssa.Aux, arg0, arg1, arg2, arg3 *ssa.Value) *ssa.Value {
	return s.curBlock.NewValue4A(s.peekPos(), op, t, aux, arg0, arg1, arg2, arg3)
}
// newValue4I adds a new value with four arguments and an auxint value to the current block.
func (s *state) newValue4I(op ssa.Op, t *types.Type, aux int64, arg0, arg1, arg2, arg3 *ssa.Value) *ssa.Value {
return s.curBlock.NewValue4I(s.peekPos(), op, t, aux, arg0, arg1, arg2, arg3)