From 9a934d5080ee103c43e92c35e213b97a92b8bd4a Mon Sep 17 00:00:00 2001 From: David Chase Date: Thu, 14 Aug 2025 17:26:15 -0400 Subject: [PATCH] [dev.simd] cmd/compile, simd: added methods for "float" GetElem This also required an "always use operation with least OverwriteBase" filter in choosing the machine instructions. The order of generated HW operations is slightly modified because the Float version of GetElem appears earlier in the sorted operations list, though it is not chosen to generate the HW Op. Change-Id: I95fa67afca9c8b6f4f18941fdcaf69afdad8055b Reviewed-on: https://go-review.googlesource.com/c/go/+/696375 Reviewed-by: Junyang Shao LUCI-TryBot-Result: Go LUCI Reviewed-by: Cherry Mui --- src/cmd/compile/internal/amd64/simdssa.go | 8 +- .../compile/internal/ssa/_gen/simdAMD64.rules | 2 + .../compile/internal/ssa/_gen/simdAMD64ops.go | 4 +- .../internal/ssa/_gen/simdgenericOps.go | 2 + src/cmd/compile/internal/ssa/opGen.go | 74 +++++++++++-------- src/cmd/compile/internal/ssa/rewriteAMD64.go | 6 ++ .../compile/internal/ssagen/simdintrinsics.go | 2 + src/simd/_gen/simdgen/gen_simdMachineOps.go | 37 ++++++++-- src/simd/_gen/simdgen/godefs.go | 2 +- src/simd/_gen/simdgen/ops/Moves/go.yaml | 14 ++++ src/simd/ops_amd64.go | 14 ++++ 11 files changed, 122 insertions(+), 43 deletions(-) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index 3ec8b484fb8..466e6c9cc74 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -1128,10 +1128,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPINSRW128: p = simdVgpvImm8(s, v) - case ssa.OpAMD64VPEXTRB128, - ssa.OpAMD64VPEXTRW128, - ssa.OpAMD64VPEXTRD128, - ssa.OpAMD64VPEXTRQ128: + case ssa.OpAMD64VPEXTRD128, + ssa.OpAMD64VPEXTRQ128, + ssa.OpAMD64VPEXTRB128, + ssa.OpAMD64VPEXTRW128: p = simdVgpImm8(s, v) case ssa.OpAMD64VGF2P8AFFINEINVQBMasked128, diff --git
a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 9670f035ba8..d64f36cf74e 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -524,6 +524,8 @@ (GaloisFieldMulMaskedUint8x16 x y mask) => (VGF2P8MULBMasked128 x y (VPMOVVec8x16ToM mask)) (GaloisFieldMulMaskedUint8x32 x y mask) => (VGF2P8MULBMasked256 x y (VPMOVVec8x32ToM mask)) (GaloisFieldMulMaskedUint8x64 x y mask) => (VGF2P8MULBMasked512 x y (VPMOVVec8x64ToM mask)) +(GetElemFloat32x4 ...) => (VPEXTRD128 ...) +(GetElemFloat64x2 ...) => (VPEXTRQ128 ...) (GetElemInt8x16 ...) => (VPEXTRB128 ...) (GetElemInt16x8 ...) => (VPEXTRW128 ...) (GetElemInt32x4 ...) => (VPEXTRD128 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index 61abaa5e978..ba73453ffe1 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -978,10 +978,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VGF2P8AFFINEQBMasked128", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VGF2P8AFFINEQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VGF2P8AFFINEQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, - {name: "VPEXTRB128", argLength: 1, reg: wgp, asm: "VPEXTRB", aux: "UInt8", commutative: false, typ: "int8", resultInArg0: false}, - {name: "VPEXTRW128", argLength: 1, reg: wgp, asm: "VPEXTRW", aux: "UInt8", commutative: false, typ: "int16", resultInArg0: false}, {name: "VPEXTRD128", argLength: 1, reg: vgp, asm: "VPEXTRD", aux: "UInt8", commutative: false, typ: "int32", resultInArg0: false}, {name: 
"VPEXTRQ128", argLength: 1, reg: vgp, asm: "VPEXTRQ", aux: "UInt8", commutative: false, typ: "int64", resultInArg0: false}, + {name: "VPEXTRB128", argLength: 1, reg: wgp, asm: "VPEXTRB", aux: "UInt8", commutative: false, typ: "int8", resultInArg0: false}, + {name: "VPEXTRW128", argLength: 1, reg: wgp, asm: "VPEXTRW", aux: "UInt8", commutative: false, typ: "int16", resultInArg0: false}, {name: "VEXTRACTF128128", argLength: 1, reg: v11, asm: "VEXTRACTF128", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VEXTRACTF64X4256", argLength: 1, reg: w11, asm: "VEXTRACTF64X4", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VEXTRACTI128128", argLength: 1, reg: v11, asm: "VEXTRACTI128", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 4f2b1a91215..d98c0d8152a 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -1720,6 +1720,8 @@ func simdGenericOps() []opData { {name: "GaloisFieldAffineTransformUint8x16", argLength: 2, commutative: false, aux: "UInt8"}, {name: "GaloisFieldAffineTransformUint8x32", argLength: 2, commutative: false, aux: "UInt8"}, {name: "GaloisFieldAffineTransformUint8x64", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "GetElemFloat32x4", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "GetElemFloat64x2", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemInt8x16", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemInt16x8", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 60ef3853524..b45cccd96bb 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ 
b/src/cmd/compile/internal/ssa/opGen.go @@ -2201,10 +2201,10 @@ const ( OpAMD64VGF2P8AFFINEQBMasked128 OpAMD64VGF2P8AFFINEQBMasked256 OpAMD64VGF2P8AFFINEQBMasked512 - OpAMD64VPEXTRB128 - OpAMD64VPEXTRW128 OpAMD64VPEXTRD128 OpAMD64VPEXTRQ128 + OpAMD64VPEXTRB128 + OpAMD64VPEXTRW128 OpAMD64VEXTRACTF128128 OpAMD64VEXTRACTF64X4256 OpAMD64VEXTRACTI128128 @@ -6352,6 +6352,8 @@ const ( OpGaloisFieldAffineTransformUint8x16 OpGaloisFieldAffineTransformUint8x32 OpGaloisFieldAffineTransformUint8x64 + OpGetElemFloat32x4 + OpGetElemFloat64x2 OpGetElemInt8x16 OpGetElemInt16x8 OpGetElemInt32x4 @@ -34153,34 +34155,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VPEXTRB128", - auxType: auxUInt8, - argLen: 1, - asm: x86.AVPEXTRB, - reg: regInfo{ - inputs: []inputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 - }, - }, - }, - { - name: "VPEXTRW128", - auxType: auxUInt8, - argLen: 1, - asm: x86.AVPEXTRW, - reg: regInfo{ - inputs: []inputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 - }, - }, - }, { name: "VPEXTRD128", auxType: auxUInt8, @@ -34209,6 +34183,34 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPEXTRB128", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPEXTRB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "VPEXTRW128", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPEXTRW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 
281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, { name: "VEXTRACTF128128", auxType: auxUInt8, @@ -72920,6 +72922,18 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "GetElemFloat32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "GetElemFloat64x2", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, { name: "GetElemInt8x16", auxType: auxUInt8, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 6e5e212fbeb..69393014c78 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -2186,6 +2186,12 @@ func rewriteValueAMD64(v *Value) bool { case OpGetClosurePtr: v.Op = OpAMD64LoweredGetClosurePtr return true + case OpGetElemFloat32x4: + v.Op = OpAMD64VPEXTRD128 + return true + case OpGetElemFloat64x2: + v.Op = OpAMD64VPEXTRQ128 + return true case OpGetElemInt16x8: v.Op = OpAMD64VPEXTRW128 return true diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 682a37e91ba..be3d917f8ff 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -536,6 +536,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint8x16.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x64.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.GetElem", opLen1Imm8(ssa.OpGetElemFloat32x4, types.Types[types.TFLOAT32], 0), sys.AMD64) + addF(simdPackage, "Float64x2.GetElem", opLen1Imm8(ssa.OpGetElemFloat64x2, types.Types[types.TFLOAT64], 0), sys.AMD64) addF(simdPackage, "Int8x16.GetElem", opLen1Imm8(ssa.OpGetElemInt8x16, types.Types[types.TINT8], 0), sys.AMD64) addF(simdPackage, "Int16x8.GetElem", opLen1Imm8(ssa.OpGetElemInt16x8, types.Types[types.TINT16], 0), sys.AMD64) addF(simdPackage, "Int32x4.GetElem", opLen1Imm8(ssa.OpGetElemInt32x4, types.Types[types.TINT32], 0), sys.AMD64) diff --git a/src/simd/_gen/simdgen/gen_simdMachineOps.go b/src/simd/_gen/simdgen/gen_simdMachineOps.go index 64918e5543a..f4d91a0c8ec 100644 --- a/src/simd/_gen/simdgen/gen_simdMachineOps.go +++ b/src/simd/_gen/simdgen/gen_simdMachineOps.go @@ -46,22 +46,47 @@ func writeSIMDMachineOps(ops []Operation) *bytes.Buffer { OpsData []opData OpsDataImm []opData } - seen := map[string]struct{}{} + regInfoSet := map[string]bool{ "v11": true, "v21": true, "v2k": true, "v2kv": true, "v2kk": true, "vkv": true, "v31": true, "v3kv": true, "vgpv": true, "vgp": true, "vfpv": true, "vfpkv": true, "w11": true, "w21": true, "w2k": true, "w2kw": true, "w2kk": true, "wkw": true, "w31": true, "w3kw": true, "wgpw": true, "wgp": true, "wfpw": true, "wfpkw": true} opsData := make([]opData, 0) opsDataImm := make([]opData, 0) + + // Determine the "best" version of an instruction to use + best := make(map[string]Operation) + var mOpOrder []string + countOverrides := func(s []Operand) int { + a := 0 + for _, o := range s { + if o.OverwriteBase != nil { + a++ + } + } 
+ return a + } for _, op := range ops { - shapeIn, shapeOut, maskType, _, gOp := op.shape() + _, _, maskType, _, gOp := op.shape() asm := machineOpName(maskType, gOp) + other, ok := best[asm] + if !ok { + best[asm] = op + mOpOrder = append(mOpOrder, asm) + continue + } + // see if "op" is better than "other" + if countOverrides(op.In)+countOverrides(op.Out) < countOverrides(other.In)+countOverrides(other.Out) { + best[asm] = op + } + } + + for _, asm := range mOpOrder { + op := best[asm] + shapeIn, shapeOut, _, _, gOp := op.shape() // TODO: all our masked operations are now zeroing, we need to generate machine ops with merging masks, maybe copy // one here with a name suffix "Merging". The rewrite rules will need them. - if _, ok := seen[asm]; ok { - continue - } - seen[asm] = struct{}{} + regInfo, err := op.regShape() if err != nil { panic(err) diff --git a/src/simd/_gen/simdgen/godefs.go b/src/simd/_gen/simdgen/godefs.go index 0022140aaab..22decb9d7e6 100644 --- a/src/simd/_gen/simdgen/godefs.go +++ b/src/simd/_gen/simdgen/godefs.go @@ -67,7 +67,7 @@ type rawOperation struct { NoTypes *string // If non-nil, all generation in gen_simdGenericOps and gen_simdrules will be skipped. NoGenericOps *string - // If non-nil, this string will be attached to the machine ssa op name. + // If non-nil, this string will be attached to the machine ssa op name. E.g. 
"const" SSAVariant *string } diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index 71981c12af7..0e5997deebb 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -45,6 +45,20 @@ base: $b bits: $e +- go: GetElem + asm: "VPEXTR[DQ]" + in: + - class: vreg + base: int + elemBits: $e + OverwriteBase: float + - *imm + out: + - class: greg + base: int + bits: $e + OverwriteBase: float + - go: "SetHi|SetLo" asm: "VINSERTI128|VINSERTI64X4" inVariant: [] diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index d78bb699eaa..8da3cd18175 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -3470,6 +3470,20 @@ func (x Uint8x64) GaloisFieldMulMasked(y Uint8x64, mask Mask8x64) Uint8x64 /* GetElem */ +// GetElem retrieves a single constant-indexed element's value. +// +// index results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPEXTRD, CPU Feature: AVX +func (x Float32x4) GetElem(index uint8) float32 + +// GetElem retrieves a single constant-indexed element's value. +// +// index results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPEXTRQ, CPU Feature: AVX +func (x Float64x2) GetElem(index uint8) float64 + // GetElem retrieves a single constant-indexed element's value. // // index results in better performance when it's a constant, a non-constant value will be translated into a jump table.