[dev.simd] cmd/compile, simd: added methods for "float" GetElem

This also required an "always use the operation with the fewest
OverwriteBase operands" filter when choosing the machine instructions.

The order of generated HW operations is slightly
modified because the Float version of GetElem
appears earlier in the sorted operations list,
though it is not chosen to generate the HW Op.

Change-Id: I95fa67afca9c8b6f4f18941fdcaf69afdad8055b
Reviewed-on: https://go-review.googlesource.com/c/go/+/696375
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
David Chase 2025-08-14 17:26:15 -04:00
parent 7380213a4e
commit 9a934d5080
11 changed files with 122 additions and 43 deletions

View file

@ -1128,10 +1128,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPINSRW128:
p = simdVgpvImm8(s, v)
case ssa.OpAMD64VPEXTRB128,
ssa.OpAMD64VPEXTRW128,
ssa.OpAMD64VPEXTRD128,
ssa.OpAMD64VPEXTRQ128:
case ssa.OpAMD64VPEXTRD128,
ssa.OpAMD64VPEXTRQ128,
ssa.OpAMD64VPEXTRB128,
ssa.OpAMD64VPEXTRW128:
p = simdVgpImm8(s, v)
case ssa.OpAMD64VGF2P8AFFINEINVQBMasked128,

View file

@ -524,6 +524,8 @@
(GaloisFieldMulMaskedUint8x16 x y mask) => (VGF2P8MULBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(GaloisFieldMulMaskedUint8x32 x y mask) => (VGF2P8MULBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(GaloisFieldMulMaskedUint8x64 x y mask) => (VGF2P8MULBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
(GetElemFloat32x4 ...) => (VPEXTRD128 ...)
(GetElemFloat64x2 ...) => (VPEXTRQ128 ...)
(GetElemInt8x16 ...) => (VPEXTRB128 ...)
(GetElemInt16x8 ...) => (VPEXTRW128 ...)
(GetElemInt32x4 ...) => (VPEXTRD128 ...)

View file

@ -978,10 +978,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VGF2P8AFFINEQBMasked128", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VGF2P8AFFINEQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VGF2P8AFFINEQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPEXTRB128", argLength: 1, reg: wgp, asm: "VPEXTRB", aux: "UInt8", commutative: false, typ: "int8", resultInArg0: false},
{name: "VPEXTRW128", argLength: 1, reg: wgp, asm: "VPEXTRW", aux: "UInt8", commutative: false, typ: "int16", resultInArg0: false},
{name: "VPEXTRD128", argLength: 1, reg: vgp, asm: "VPEXTRD", aux: "UInt8", commutative: false, typ: "int32", resultInArg0: false},
{name: "VPEXTRQ128", argLength: 1, reg: vgp, asm: "VPEXTRQ", aux: "UInt8", commutative: false, typ: "int64", resultInArg0: false},
{name: "VPEXTRB128", argLength: 1, reg: wgp, asm: "VPEXTRB", aux: "UInt8", commutative: false, typ: "int8", resultInArg0: false},
{name: "VPEXTRW128", argLength: 1, reg: wgp, asm: "VPEXTRW", aux: "UInt8", commutative: false, typ: "int16", resultInArg0: false},
{name: "VEXTRACTF128128", argLength: 1, reg: v11, asm: "VEXTRACTF128", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VEXTRACTF64X4256", argLength: 1, reg: w11, asm: "VEXTRACTF64X4", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VEXTRACTI128128", argLength: 1, reg: v11, asm: "VEXTRACTI128", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},

View file

@ -1720,6 +1720,8 @@ func simdGenericOps() []opData {
{name: "GaloisFieldAffineTransformUint8x16", argLength: 2, commutative: false, aux: "UInt8"},
{name: "GaloisFieldAffineTransformUint8x32", argLength: 2, commutative: false, aux: "UInt8"},
{name: "GaloisFieldAffineTransformUint8x64", argLength: 2, commutative: false, aux: "UInt8"},
{name: "GetElemFloat32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemFloat64x2", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemInt8x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemInt32x4", argLength: 1, commutative: false, aux: "UInt8"},

View file

@ -2201,10 +2201,10 @@ const (
OpAMD64VGF2P8AFFINEQBMasked128
OpAMD64VGF2P8AFFINEQBMasked256
OpAMD64VGF2P8AFFINEQBMasked512
OpAMD64VPEXTRB128
OpAMD64VPEXTRW128
OpAMD64VPEXTRD128
OpAMD64VPEXTRQ128
OpAMD64VPEXTRB128
OpAMD64VPEXTRW128
OpAMD64VEXTRACTF128128
OpAMD64VEXTRACTF64X4256
OpAMD64VEXTRACTI128128
@ -6352,6 +6352,8 @@ const (
OpGaloisFieldAffineTransformUint8x16
OpGaloisFieldAffineTransformUint8x32
OpGaloisFieldAffineTransformUint8x64
OpGetElemFloat32x4
OpGetElemFloat64x2
OpGetElemInt8x16
OpGetElemInt16x8
OpGetElemInt32x4
@ -34153,34 +34155,6 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "VPEXTRB128",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPEXTRB,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "VPEXTRW128",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPEXTRW,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "VPEXTRD128",
auxType: auxUInt8,
@ -34209,6 +34183,34 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "VPEXTRB128",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPEXTRB,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "VPEXTRW128",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPEXTRW,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "VEXTRACTF128128",
auxType: auxUInt8,
@ -72920,6 +72922,18 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "GetElemFloat32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "GetElemFloat64x2",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "GetElemInt8x16",
auxType: auxUInt8,

View file

@ -2186,6 +2186,12 @@ func rewriteValueAMD64(v *Value) bool {
case OpGetClosurePtr:
v.Op = OpAMD64LoweredGetClosurePtr
return true
case OpGetElemFloat32x4:
v.Op = OpAMD64VPEXTRD128
return true
case OpGetElemFloat64x2:
v.Op = OpAMD64VPEXTRQ128
return true
case OpGetElemInt16x8:
v.Op = OpAMD64VPEXTRW128
return true

View file

@ -536,6 +536,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint8x16.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x32.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.GetElem", opLen1Imm8(ssa.OpGetElemFloat32x4, types.Types[types.TFLOAT32], 0), sys.AMD64)
addF(simdPackage, "Float64x2.GetElem", opLen1Imm8(ssa.OpGetElemFloat64x2, types.Types[types.TFLOAT64], 0), sys.AMD64)
addF(simdPackage, "Int8x16.GetElem", opLen1Imm8(ssa.OpGetElemInt8x16, types.Types[types.TINT8], 0), sys.AMD64)
addF(simdPackage, "Int16x8.GetElem", opLen1Imm8(ssa.OpGetElemInt16x8, types.Types[types.TINT16], 0), sys.AMD64)
addF(simdPackage, "Int32x4.GetElem", opLen1Imm8(ssa.OpGetElemInt32x4, types.Types[types.TINT32], 0), sys.AMD64)

View file

@ -46,22 +46,47 @@ func writeSIMDMachineOps(ops []Operation) *bytes.Buffer {
OpsData []opData
OpsDataImm []opData
}
seen := map[string]struct{}{}
regInfoSet := map[string]bool{
"v11": true, "v21": true, "v2k": true, "v2kv": true, "v2kk": true, "vkv": true, "v31": true, "v3kv": true, "vgpv": true, "vgp": true, "vfpv": true, "vfpkv": true,
"w11": true, "w21": true, "w2k": true, "w2kw": true, "w2kk": true, "wkw": true, "w31": true, "w3kw": true, "wgpw": true, "wgp": true, "wfpw": true, "wfpkw": true}
opsData := make([]opData, 0)
opsDataImm := make([]opData, 0)
// Determine the "best" version of an instruction to use
best := make(map[string]Operation)
var mOpOrder []string
countOverrides := func(s []Operand) int {
a := 0
for _, o := range s {
if o.OverwriteBase != nil {
a++
}
}
return a
}
for _, op := range ops {
shapeIn, shapeOut, maskType, _, gOp := op.shape()
_, _, maskType, _, gOp := op.shape()
asm := machineOpName(maskType, gOp)
other, ok := best[asm]
if !ok {
best[asm] = op
mOpOrder = append(mOpOrder, asm)
continue
}
// see if "op" is better than "other"
if countOverrides(op.In)+countOverrides(op.Out) < countOverrides(other.In)+countOverrides(other.Out) {
best[asm] = op
}
}
for _, asm := range mOpOrder {
op := best[asm]
shapeIn, shapeOut, _, _, gOp := op.shape()
// TODO: all our masked operations are now zeroing, we need to generate machine ops with merging masks, maybe copy
// one here with a name suffix "Merging". The rewrite rules will need them.
if _, ok := seen[asm]; ok {
continue
}
seen[asm] = struct{}{}
regInfo, err := op.regShape()
if err != nil {
panic(err)

View file

@ -67,7 +67,7 @@ type rawOperation struct {
NoTypes *string
// If non-nil, all generation in gen_simdGenericOps and gen_simdrules will be skipped.
NoGenericOps *string
// If non-nil, this string will be attached to the machine ssa op name.
// If non-nil, this string will be attached to the machine ssa op name. E.g. "const"
SSAVariant *string
}

View file

@ -45,6 +45,20 @@
base: $b
bits: $e
- go: GetElem
asm: "VPEXTR[DQ]"
in:
- class: vreg
base: int
elemBits: $e
OverwriteBase: float
- *imm
out:
- class: greg
base: int
bits: $e
OverwriteBase: float
- go: "SetHi|SetLo"
asm: "VINSERTI128|VINSERTI64X4"
inVariant: []

View file

@ -3470,6 +3470,20 @@ func (x Uint8x64) GaloisFieldMulMasked(y Uint8x64, mask Mask8x64) Uint8x64
/* GetElem */
// GetElem retrieves a single constant-indexed element's value.
//
// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPEXTRD, CPU Feature: AVX
func (x Float32x4) GetElem(index uint8) float32
// GetElem retrieves a single constant-indexed element's value.
//
// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPEXTRQ, CPU Feature: AVX
func (x Float64x2) GetElem(index uint8) float64
// GetElem retrieves a single constant-indexed element's value.
//
// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.