[dev.simd] cmd/compile, simd: added methods for "float" GetElem

This also required an "always use the operation with the fewest
OverwriteBase operands" filter when choosing the machine instructions.

The order of generated HW operations is slightly
modified because the Float version of GetElem
appears earlier in the sorted operations list,
though it is not chosen to generate the HW Op.

Change-Id: I95fa67afca9c8b6f4f18941fdcaf69afdad8055b
Reviewed-on: https://go-review.googlesource.com/c/go/+/696375
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
David Chase 2025-08-14 17:26:15 -04:00
parent 7380213a4e
commit 9a934d5080
11 changed files with 122 additions and 43 deletions

View file

@ -1128,10 +1128,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPINSRW128: ssa.OpAMD64VPINSRW128:
p = simdVgpvImm8(s, v) p = simdVgpvImm8(s, v)
case ssa.OpAMD64VPEXTRB128, case ssa.OpAMD64VPEXTRD128,
ssa.OpAMD64VPEXTRW128, ssa.OpAMD64VPEXTRQ128,
ssa.OpAMD64VPEXTRD128, ssa.OpAMD64VPEXTRB128,
ssa.OpAMD64VPEXTRQ128: ssa.OpAMD64VPEXTRW128:
p = simdVgpImm8(s, v) p = simdVgpImm8(s, v)
case ssa.OpAMD64VGF2P8AFFINEINVQBMasked128, case ssa.OpAMD64VGF2P8AFFINEINVQBMasked128,

View file

@ -524,6 +524,8 @@
(GaloisFieldMulMaskedUint8x16 x y mask) => (VGF2P8MULBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask)) (GaloisFieldMulMaskedUint8x16 x y mask) => (VGF2P8MULBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(GaloisFieldMulMaskedUint8x32 x y mask) => (VGF2P8MULBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask)) (GaloisFieldMulMaskedUint8x32 x y mask) => (VGF2P8MULBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(GaloisFieldMulMaskedUint8x64 x y mask) => (VGF2P8MULBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask)) (GaloisFieldMulMaskedUint8x64 x y mask) => (VGF2P8MULBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
(GetElemFloat32x4 ...) => (VPEXTRD128 ...)
(GetElemFloat64x2 ...) => (VPEXTRQ128 ...)
(GetElemInt8x16 ...) => (VPEXTRB128 ...) (GetElemInt8x16 ...) => (VPEXTRB128 ...)
(GetElemInt16x8 ...) => (VPEXTRW128 ...) (GetElemInt16x8 ...) => (VPEXTRW128 ...)
(GetElemInt32x4 ...) => (VPEXTRD128 ...) (GetElemInt32x4 ...) => (VPEXTRD128 ...)

View file

@ -978,10 +978,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VGF2P8AFFINEQBMasked128", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VGF2P8AFFINEQBMasked128", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VGF2P8AFFINEQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VGF2P8AFFINEQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VGF2P8AFFINEQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VGF2P8AFFINEQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPEXTRB128", argLength: 1, reg: wgp, asm: "VPEXTRB", aux: "UInt8", commutative: false, typ: "int8", resultInArg0: false},
{name: "VPEXTRW128", argLength: 1, reg: wgp, asm: "VPEXTRW", aux: "UInt8", commutative: false, typ: "int16", resultInArg0: false},
{name: "VPEXTRD128", argLength: 1, reg: vgp, asm: "VPEXTRD", aux: "UInt8", commutative: false, typ: "int32", resultInArg0: false}, {name: "VPEXTRD128", argLength: 1, reg: vgp, asm: "VPEXTRD", aux: "UInt8", commutative: false, typ: "int32", resultInArg0: false},
{name: "VPEXTRQ128", argLength: 1, reg: vgp, asm: "VPEXTRQ", aux: "UInt8", commutative: false, typ: "int64", resultInArg0: false}, {name: "VPEXTRQ128", argLength: 1, reg: vgp, asm: "VPEXTRQ", aux: "UInt8", commutative: false, typ: "int64", resultInArg0: false},
{name: "VPEXTRB128", argLength: 1, reg: wgp, asm: "VPEXTRB", aux: "UInt8", commutative: false, typ: "int8", resultInArg0: false},
{name: "VPEXTRW128", argLength: 1, reg: wgp, asm: "VPEXTRW", aux: "UInt8", commutative: false, typ: "int16", resultInArg0: false},
{name: "VEXTRACTF128128", argLength: 1, reg: v11, asm: "VEXTRACTF128", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VEXTRACTF128128", argLength: 1, reg: v11, asm: "VEXTRACTF128", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VEXTRACTF64X4256", argLength: 1, reg: w11, asm: "VEXTRACTF64X4", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VEXTRACTF64X4256", argLength: 1, reg: w11, asm: "VEXTRACTF64X4", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VEXTRACTI128128", argLength: 1, reg: v11, asm: "VEXTRACTI128", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VEXTRACTI128128", argLength: 1, reg: v11, asm: "VEXTRACTI128", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},

View file

@ -1720,6 +1720,8 @@ func simdGenericOps() []opData {
{name: "GaloisFieldAffineTransformUint8x16", argLength: 2, commutative: false, aux: "UInt8"}, {name: "GaloisFieldAffineTransformUint8x16", argLength: 2, commutative: false, aux: "UInt8"},
{name: "GaloisFieldAffineTransformUint8x32", argLength: 2, commutative: false, aux: "UInt8"}, {name: "GaloisFieldAffineTransformUint8x32", argLength: 2, commutative: false, aux: "UInt8"},
{name: "GaloisFieldAffineTransformUint8x64", argLength: 2, commutative: false, aux: "UInt8"}, {name: "GaloisFieldAffineTransformUint8x64", argLength: 2, commutative: false, aux: "UInt8"},
{name: "GetElemFloat32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemFloat64x2", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemInt8x16", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemInt8x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemInt16x8", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemInt32x4", argLength: 1, commutative: false, aux: "UInt8"},

View file

@ -2201,10 +2201,10 @@ const (
OpAMD64VGF2P8AFFINEQBMasked128 OpAMD64VGF2P8AFFINEQBMasked128
OpAMD64VGF2P8AFFINEQBMasked256 OpAMD64VGF2P8AFFINEQBMasked256
OpAMD64VGF2P8AFFINEQBMasked512 OpAMD64VGF2P8AFFINEQBMasked512
OpAMD64VPEXTRB128
OpAMD64VPEXTRW128
OpAMD64VPEXTRD128 OpAMD64VPEXTRD128
OpAMD64VPEXTRQ128 OpAMD64VPEXTRQ128
OpAMD64VPEXTRB128
OpAMD64VPEXTRW128
OpAMD64VEXTRACTF128128 OpAMD64VEXTRACTF128128
OpAMD64VEXTRACTF64X4256 OpAMD64VEXTRACTF64X4256
OpAMD64VEXTRACTI128128 OpAMD64VEXTRACTI128128
@ -6352,6 +6352,8 @@ const (
OpGaloisFieldAffineTransformUint8x16 OpGaloisFieldAffineTransformUint8x16
OpGaloisFieldAffineTransformUint8x32 OpGaloisFieldAffineTransformUint8x32
OpGaloisFieldAffineTransformUint8x64 OpGaloisFieldAffineTransformUint8x64
OpGetElemFloat32x4
OpGetElemFloat64x2
OpGetElemInt8x16 OpGetElemInt8x16
OpGetElemInt16x8 OpGetElemInt16x8
OpGetElemInt32x4 OpGetElemInt32x4
@ -34153,34 +34155,6 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "VPEXTRB128",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPEXTRB,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "VPEXTRW128",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPEXTRW,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{ {
name: "VPEXTRD128", name: "VPEXTRD128",
auxType: auxUInt8, auxType: auxUInt8,
@ -34209,6 +34183,34 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "VPEXTRB128",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPEXTRB,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "VPEXTRW128",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPEXTRW,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{ {
name: "VEXTRACTF128128", name: "VEXTRACTF128128",
auxType: auxUInt8, auxType: auxUInt8,
@ -72920,6 +72922,18 @@ var opcodeTable = [...]opInfo{
argLen: 2, argLen: 2,
generic: true, generic: true,
}, },
{
name: "GetElemFloat32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "GetElemFloat64x2",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{ {
name: "GetElemInt8x16", name: "GetElemInt8x16",
auxType: auxUInt8, auxType: auxUInt8,

View file

@ -2186,6 +2186,12 @@ func rewriteValueAMD64(v *Value) bool {
case OpGetClosurePtr: case OpGetClosurePtr:
v.Op = OpAMD64LoweredGetClosurePtr v.Op = OpAMD64LoweredGetClosurePtr
return true return true
case OpGetElemFloat32x4:
v.Op = OpAMD64VPEXTRD128
return true
case OpGetElemFloat64x2:
v.Op = OpAMD64VPEXTRQ128
return true
case OpGetElemInt16x8: case OpGetElemInt16x8:
v.Op = OpAMD64VPEXTRW128 v.Op = OpAMD64VPEXTRW128
return true return true

View file

@ -536,6 +536,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint8x16.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x16.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x32.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x32.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x64.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.GetElem", opLen1Imm8(ssa.OpGetElemFloat32x4, types.Types[types.TFLOAT32], 0), sys.AMD64)
addF(simdPackage, "Float64x2.GetElem", opLen1Imm8(ssa.OpGetElemFloat64x2, types.Types[types.TFLOAT64], 0), sys.AMD64)
addF(simdPackage, "Int8x16.GetElem", opLen1Imm8(ssa.OpGetElemInt8x16, types.Types[types.TINT8], 0), sys.AMD64) addF(simdPackage, "Int8x16.GetElem", opLen1Imm8(ssa.OpGetElemInt8x16, types.Types[types.TINT8], 0), sys.AMD64)
addF(simdPackage, "Int16x8.GetElem", opLen1Imm8(ssa.OpGetElemInt16x8, types.Types[types.TINT16], 0), sys.AMD64) addF(simdPackage, "Int16x8.GetElem", opLen1Imm8(ssa.OpGetElemInt16x8, types.Types[types.TINT16], 0), sys.AMD64)
addF(simdPackage, "Int32x4.GetElem", opLen1Imm8(ssa.OpGetElemInt32x4, types.Types[types.TINT32], 0), sys.AMD64) addF(simdPackage, "Int32x4.GetElem", opLen1Imm8(ssa.OpGetElemInt32x4, types.Types[types.TINT32], 0), sys.AMD64)

View file

@ -46,22 +46,47 @@ func writeSIMDMachineOps(ops []Operation) *bytes.Buffer {
OpsData []opData OpsData []opData
OpsDataImm []opData OpsDataImm []opData
} }
seen := map[string]struct{}{}
regInfoSet := map[string]bool{ regInfoSet := map[string]bool{
"v11": true, "v21": true, "v2k": true, "v2kv": true, "v2kk": true, "vkv": true, "v31": true, "v3kv": true, "vgpv": true, "vgp": true, "vfpv": true, "vfpkv": true, "v11": true, "v21": true, "v2k": true, "v2kv": true, "v2kk": true, "vkv": true, "v31": true, "v3kv": true, "vgpv": true, "vgp": true, "vfpv": true, "vfpkv": true,
"w11": true, "w21": true, "w2k": true, "w2kw": true, "w2kk": true, "wkw": true, "w31": true, "w3kw": true, "wgpw": true, "wgp": true, "wfpw": true, "wfpkw": true} "w11": true, "w21": true, "w2k": true, "w2kw": true, "w2kk": true, "wkw": true, "w31": true, "w3kw": true, "wgpw": true, "wgp": true, "wfpw": true, "wfpkw": true}
opsData := make([]opData, 0) opsData := make([]opData, 0)
opsDataImm := make([]opData, 0) opsDataImm := make([]opData, 0)
// Determine the "best" version of an instruction to use
best := make(map[string]Operation)
var mOpOrder []string
countOverrides := func(s []Operand) int {
a := 0
for _, o := range s {
if o.OverwriteBase != nil {
a++
}
}
return a
}
for _, op := range ops { for _, op := range ops {
shapeIn, shapeOut, maskType, _, gOp := op.shape() _, _, maskType, _, gOp := op.shape()
asm := machineOpName(maskType, gOp) asm := machineOpName(maskType, gOp)
other, ok := best[asm]
if !ok {
best[asm] = op
mOpOrder = append(mOpOrder, asm)
continue
}
// see if "op" is better than "other"
if countOverrides(op.In)+countOverrides(op.Out) < countOverrides(other.In)+countOverrides(other.Out) {
best[asm] = op
}
}
for _, asm := range mOpOrder {
op := best[asm]
shapeIn, shapeOut, _, _, gOp := op.shape()
// TODO: all our masked operations are now zeroing, we need to generate machine ops with merging masks, maybe copy // TODO: all our masked operations are now zeroing, we need to generate machine ops with merging masks, maybe copy
// one here with a name suffix "Merging". The rewrite rules will need them. // one here with a name suffix "Merging". The rewrite rules will need them.
if _, ok := seen[asm]; ok {
continue
}
seen[asm] = struct{}{}
regInfo, err := op.regShape() regInfo, err := op.regShape()
if err != nil { if err != nil {
panic(err) panic(err)

View file

@ -67,7 +67,7 @@ type rawOperation struct {
NoTypes *string NoTypes *string
// If non-nil, all generation in gen_simdGenericOps and gen_simdrules will be skipped. // If non-nil, all generation in gen_simdGenericOps and gen_simdrules will be skipped.
NoGenericOps *string NoGenericOps *string
// If non-nil, this string will be attached to the machine ssa op name. // If non-nil, this string will be attached to the machine ssa op name. E.g. "const"
SSAVariant *string SSAVariant *string
} }

View file

@ -45,6 +45,20 @@
base: $b base: $b
bits: $e bits: $e
- go: GetElem
asm: "VPEXTR[DQ]"
in:
- class: vreg
base: int
elemBits: $e
OverwriteBase: float
- *imm
out:
- class: greg
base: int
bits: $e
OverwriteBase: float
- go: "SetHi|SetLo" - go: "SetHi|SetLo"
asm: "VINSERTI128|VINSERTI64X4" asm: "VINSERTI128|VINSERTI64X4"
inVariant: [] inVariant: []

View file

@ -3470,6 +3470,20 @@ func (x Uint8x64) GaloisFieldMulMasked(y Uint8x64, mask Mask8x64) Uint8x64
/* GetElem */ /* GetElem */
// GetElem retrieves a single constant-indexed element's value.
//
// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPEXTRD, CPU Feature: AVX
func (x Float32x4) GetElem(index uint8) float32
// GetElem retrieves a single constant-indexed element's value.
//
// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPEXTRQ, CPU Feature: AVX
func (x Float64x2) GetElem(index uint8) float64
// GetElem retrieves a single constant-indexed element's value. // GetElem retrieves a single constant-indexed element's value.
// //
// index results in better performance when it's a constant, a non-constant value will be translated into a jump table. // index results in better performance when it's a constant, a non-constant value will be translated into a jump table.