diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index 3ec8b484fb8..466e6c9cc74 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -1128,10 +1128,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPINSRW128: p = simdVgpvImm8(s, v) - case ssa.OpAMD64VPEXTRB128, - ssa.OpAMD64VPEXTRW128, - ssa.OpAMD64VPEXTRD128, - ssa.OpAMD64VPEXTRQ128: + case ssa.OpAMD64VPEXTRD128, + ssa.OpAMD64VPEXTRQ128, + ssa.OpAMD64VPEXTRB128, + ssa.OpAMD64VPEXTRW128: p = simdVgpImm8(s, v) case ssa.OpAMD64VGF2P8AFFINEINVQBMasked128, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 9670f035ba8..d64f36cf74e 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -524,6 +524,8 @@ (GaloisFieldMulMaskedUint8x16 x y mask) => (VGF2P8MULBMasked128 x y (VPMOVVec8x16ToM mask)) (GaloisFieldMulMaskedUint8x32 x y mask) => (VGF2P8MULBMasked256 x y (VPMOVVec8x32ToM mask)) (GaloisFieldMulMaskedUint8x64 x y mask) => (VGF2P8MULBMasked512 x y (VPMOVVec8x64ToM mask)) +(GetElemFloat32x4 ...) => (VPEXTRD128 ...) +(GetElemFloat64x2 ...) => (VPEXTRQ128 ...) (GetElemInt8x16 ...) => (VPEXTRB128 ...) (GetElemInt16x8 ...) => (VPEXTRW128 ...) (GetElemInt32x4 ...) => (VPEXTRD128 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index 61abaa5e978..ba73453ffe1 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -978,10 +978,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VGF2P8AFFINEQBMasked128", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VGF2P8AFFINEQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VGF2P8AFFINEQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, - {name: "VPEXTRB128", argLength: 1, reg: wgp, asm: "VPEXTRB", aux: "UInt8", commutative: false, typ: "int8", resultInArg0: false}, - {name: "VPEXTRW128", argLength: 1, reg: wgp, asm: "VPEXTRW", aux: "UInt8", commutative: false, typ: "int16", resultInArg0: false}, {name: "VPEXTRD128", argLength: 1, reg: vgp, asm: "VPEXTRD", aux: "UInt8", commutative: false, typ: "int32", resultInArg0: false}, {name: "VPEXTRQ128", argLength: 1, reg: vgp, asm: "VPEXTRQ", aux: "UInt8", commutative: false, typ: "int64", resultInArg0: false}, + {name: "VPEXTRB128", argLength: 1, reg: wgp, asm: "VPEXTRB", aux: "UInt8", commutative: false, typ: "int8", resultInArg0: false}, + {name: "VPEXTRW128", argLength: 1, reg: wgp, asm: "VPEXTRW", aux: "UInt8", commutative: false, typ: "int16", resultInArg0: false}, {name: "VEXTRACTF128128", argLength: 1, reg: v11, asm: "VEXTRACTF128", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VEXTRACTF64X4256", argLength: 1, reg: w11, asm: "VEXTRACTF64X4", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VEXTRACTI128128", argLength: 1, reg: v11, asm: "VEXTRACTI128", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 4f2b1a91215..d98c0d8152a 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -1720,6 +1720,8 @@ func simdGenericOps() []opData { {name: "GaloisFieldAffineTransformUint8x16", argLength: 2, commutative: false, aux: "UInt8"}, {name: "GaloisFieldAffineTransformUint8x32", argLength: 2, commutative: false, aux: "UInt8"}, {name: "GaloisFieldAffineTransformUint8x64", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "GetElemFloat32x4", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "GetElemFloat64x2", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemInt8x16", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemInt16x8", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 60ef3853524..b45cccd96bb 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -2201,10 +2201,10 @@ const ( OpAMD64VGF2P8AFFINEQBMasked128 OpAMD64VGF2P8AFFINEQBMasked256 OpAMD64VGF2P8AFFINEQBMasked512 - OpAMD64VPEXTRB128 - OpAMD64VPEXTRW128 OpAMD64VPEXTRD128 OpAMD64VPEXTRQ128 + OpAMD64VPEXTRB128 + OpAMD64VPEXTRW128 OpAMD64VEXTRACTF128128 OpAMD64VEXTRACTF64X4256 OpAMD64VEXTRACTI128128 @@ -6352,6 +6352,8 @@ const ( OpGaloisFieldAffineTransformUint8x16 OpGaloisFieldAffineTransformUint8x32 OpGaloisFieldAffineTransformUint8x64 + OpGetElemFloat32x4 + OpGetElemFloat64x2 OpGetElemInt8x16 OpGetElemInt16x8 OpGetElemInt32x4 @@ -34153,34 +34155,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VPEXTRB128", - auxType: auxUInt8, - argLen: 1, - asm: x86.AVPEXTRB, - reg: regInfo{ - inputs: []inputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 - }, - }, - }, - { - name: "VPEXTRW128", - auxType: auxUInt8, - argLen: 1, - asm: x86.AVPEXTRW, - reg: regInfo{ - inputs: []inputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 - }, - }, - }, { name: "VPEXTRD128", auxType: auxUInt8, @@ -34209,6 +34183,34 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPEXTRB128", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPEXTRB, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, + { + name: "VPEXTRW128", + auxType: auxUInt8, + argLen: 1, + asm: x86.AVPEXTRW, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15 + }, + }, + }, { name: "VEXTRACTF128128", auxType: auxUInt8, @@ -72920,6 +72922,18 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "GetElemFloat32x4", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, + { + name: "GetElemFloat64x2", + auxType: auxUInt8, + argLen: 1, + generic: true, + }, { name: "GetElemInt8x16", auxType: auxUInt8, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 6e5e212fbeb..69393014c78 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -2186,6 +2186,12 @@ func rewriteValueAMD64(v *Value) bool { case OpGetClosurePtr: v.Op = OpAMD64LoweredGetClosurePtr return true + case OpGetElemFloat32x4: + v.Op = OpAMD64VPEXTRD128 + return true + case OpGetElemFloat64x2: + v.Op = OpAMD64VPEXTRQ128 + return true case OpGetElemInt16x8: v.Op = OpAMD64VPEXTRW128 return true diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 682a37e91ba..be3d917f8ff 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -536,6 +536,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint8x16.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x64.GaloisFieldMulMasked", opLen3(ssa.OpGaloisFieldMulMaskedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.GetElem", opLen1Imm8(ssa.OpGetElemFloat32x4, types.Types[types.TFLOAT32], 0), sys.AMD64) + addF(simdPackage, "Float64x2.GetElem", opLen1Imm8(ssa.OpGetElemFloat64x2, types.Types[types.TFLOAT64], 0), sys.AMD64) addF(simdPackage, "Int8x16.GetElem", opLen1Imm8(ssa.OpGetElemInt8x16, types.Types[types.TINT8], 0), sys.AMD64) addF(simdPackage, "Int16x8.GetElem", opLen1Imm8(ssa.OpGetElemInt16x8, types.Types[types.TINT16], 0), sys.AMD64) addF(simdPackage, "Int32x4.GetElem", opLen1Imm8(ssa.OpGetElemInt32x4, types.Types[types.TINT32], 0), sys.AMD64) diff --git a/src/simd/_gen/simdgen/gen_simdMachineOps.go b/src/simd/_gen/simdgen/gen_simdMachineOps.go index 64918e5543a..f4d91a0c8ec 100644 --- a/src/simd/_gen/simdgen/gen_simdMachineOps.go +++ b/src/simd/_gen/simdgen/gen_simdMachineOps.go @@ -46,22 +46,47 @@ func writeSIMDMachineOps(ops []Operation) *bytes.Buffer { OpsData []opData OpsDataImm []opData } - seen := map[string]struct{}{} + regInfoSet := map[string]bool{ "v11": true, "v21": true, "v2k": true, "v2kv": true, "v2kk": true, "vkv": true, "v31": true, "v3kv": true, "vgpv": true, "vgp": true, "vfpv": true, "vfpkv": true, "w11": true, "w21": true, "w2k": true, "w2kw": true, "w2kk": true, "wkw": true, "w31": true, "w3kw": true, "wgpw": true, "wgp": true, "wfpw": true, "wfpkw": true} opsData := make([]opData, 0) opsDataImm := make([]opData, 0) + + // Determine the "best" version of an instruction to use + best := make(map[string]Operation) + var mOpOrder []string + countOverrides := func(s []Operand) int { + a := 0 + for _, o := range s { + if o.OverwriteBase != nil { + a++ + } + } + return a + } for _, op := range ops { - shapeIn, shapeOut, maskType, _, gOp := op.shape() + _, _, maskType, _, gOp := op.shape() asm := machineOpName(maskType, gOp) + other, ok := best[asm] + if !ok { + best[asm] = op + mOpOrder = append(mOpOrder, asm) + continue + } + // see if "op" is better than "other" + if countOverrides(op.In)+countOverrides(op.Out) < countOverrides(other.In)+countOverrides(other.Out) { + best[asm] = op + } + } + + for _, asm := range mOpOrder { + op := best[asm] + shapeIn, shapeOut, _, _, gOp := op.shape() // TODO: all our masked operations are now zeroing, we need to generate machine ops with merging masks, maybe copy // one here with a name suffix "Merging". The rewrite rules will need them. - if _, ok := seen[asm]; ok { - continue - } - seen[asm] = struct{}{} + regInfo, err := op.regShape() if err != nil { panic(err) diff --git a/src/simd/_gen/simdgen/godefs.go b/src/simd/_gen/simdgen/godefs.go index 0022140aaab..22decb9d7e6 100644 --- a/src/simd/_gen/simdgen/godefs.go +++ b/src/simd/_gen/simdgen/godefs.go @@ -67,7 +67,7 @@ type rawOperation struct { NoTypes *string // If non-nil, all generation in gen_simdGenericOps and gen_simdrules will be skipped. NoGenericOps *string - // If non-nil, this string will be attached to the machine ssa op name. + // If non-nil, this string will be attached to the machine ssa op name. E.g. "const" SSAVariant *string } diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index 71981c12af7..0e5997deebb 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -45,6 +45,20 @@ base: $b bits: $e +- go: GetElem + asm: "VPEXTR[DQ]" + in: + - class: vreg + base: int + elemBits: $e + OverwriteBase: float + - *imm + out: + - class: greg + base: int + bits: $e + OverwriteBase: float + - go: "SetHi|SetLo" asm: "VINSERTI128|VINSERTI64X4" inVariant: [] diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index d78bb699eaa..8da3cd18175 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -3470,6 +3470,20 @@ func (x Uint8x64) GaloisFieldMulMasked(y Uint8x64, mask Mask8x64) Uint8x64 /* GetElem */ +// GetElem retrieves a single constant-indexed element's value. +// +// index results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPEXTRD, CPU Feature: AVX +func (x Float32x4) GetElem(index uint8) float32 + +// GetElem retrieves a single constant-indexed element's value. +// +// index results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPEXTRQ, CPU Feature: AVX +func (x Float64x2) GetElem(index uint8) float64 + // GetElem retrieves a single constant-indexed element's value. // // index results in better performance when it's a constant, a non-constant value will be translated into a jump table.