diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index f374cd25d0a..d4126cef1e3 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -36,6 +36,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VRSQRT14PD128, ssa.OpAMD64VRSQRT14PD256, ssa.OpAMD64VRSQRT14PD512, + ssa.OpAMD64VCVTTPS2DQ128, + ssa.OpAMD64VCVTTPS2DQ256, + ssa.OpAMD64VCVTTPS2DQ512, + ssa.OpAMD64VCVTPS2UDQ128, + ssa.OpAMD64VCVTPS2UDQ256, + ssa.OpAMD64VCVTPS2UDQ512, ssa.OpAMD64VPOPCNTB128, ssa.OpAMD64VPOPCNTB256, ssa.OpAMD64VPOPCNTB512, @@ -628,6 +634,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPCOMPRESSQMasked128, ssa.OpAMD64VPCOMPRESSQMasked256, ssa.OpAMD64VPCOMPRESSQMasked512, + ssa.OpAMD64VCVTTPS2DQMasked128, + ssa.OpAMD64VCVTTPS2DQMasked256, + ssa.OpAMD64VCVTTPS2DQMasked512, + ssa.OpAMD64VCVTPS2UDQMasked128, + ssa.OpAMD64VCVTPS2UDQMasked256, + ssa.OpAMD64VCVTPS2UDQMasked512, ssa.OpAMD64VPOPCNTBMasked128, ssa.OpAMD64VPOPCNTBMasked256, ssa.OpAMD64VPOPCNTBMasked512, @@ -1124,6 +1136,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPCOMPRESSQMasked128, ssa.OpAMD64VPCOMPRESSQMasked256, ssa.OpAMD64VPCOMPRESSQMasked512, + ssa.OpAMD64VCVTTPS2DQMasked128, + ssa.OpAMD64VCVTTPS2DQMasked256, + ssa.OpAMD64VCVTTPS2DQMasked512, + ssa.OpAMD64VCVTPS2UDQMasked128, + ssa.OpAMD64VCVTPS2UDQMasked256, + ssa.OpAMD64VCVTPS2UDQMasked512, ssa.OpAMD64VREDUCEPSMasked128, ssa.OpAMD64VREDUCEPSMasked256, ssa.OpAMD64VREDUCEPSMasked512, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index fb153acf66e..e5e3fb0d50e 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -234,6 +234,18 @@ (CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM mask)) (CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x 
(VPMOVVec64x4ToM mask)) (CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM mask)) +(ConvertToInt32Float32x4 ...) => (VCVTTPS2DQ128 ...) +(ConvertToInt32Float32x8 ...) => (VCVTTPS2DQ256 ...) +(ConvertToInt32Float32x16 ...) => (VCVTTPS2DQ512 ...) +(ConvertToInt32MaskedFloat32x4 x mask) => (VCVTTPS2DQMasked128 x (VPMOVVec32x4ToM mask)) +(ConvertToInt32MaskedFloat32x8 x mask) => (VCVTTPS2DQMasked256 x (VPMOVVec32x8ToM mask)) +(ConvertToInt32MaskedFloat32x16 x mask) => (VCVTTPS2DQMasked512 x (VPMOVVec32x16ToM mask)) +(ConvertToUint32Float32x4 ...) => (VCVTPS2UDQ128 ...) +(ConvertToUint32Float32x8 ...) => (VCVTPS2UDQ256 ...) +(ConvertToUint32Float32x16 ...) => (VCVTPS2UDQ512 ...) +(ConvertToUint32MaskedFloat32x4 x mask) => (VCVTPS2UDQMasked128 x (VPMOVVec32x4ToM mask)) +(ConvertToUint32MaskedFloat32x8 x mask) => (VCVTPS2UDQMasked256 x (VPMOVVec32x8ToM mask)) +(ConvertToUint32MaskedFloat32x16 x mask) => (VCVTPS2UDQMasked512 x (VPMOVVec32x16ToM mask)) (DiffWithCeilWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+2] x) (DiffWithCeilWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+2] x) (DiffWithCeilWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+2] x) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index 3ab0eb527f8..adb6dd968f5 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -25,6 +25,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VCOMPRESSPSMasked128", argLength: 2, reg: wkw, asm: "VCOMPRESSPS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VCOMPRESSPSMasked256", argLength: 2, reg: wkw, asm: "VCOMPRESSPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VCOMPRESSPSMasked512", argLength: 2, reg: wkw, asm: "VCOMPRESSPS", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VCVTPS2UDQ128", argLength: 1, reg: w11, 
asm: "VCVTPS2UDQ", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VCVTPS2UDQ256", argLength: 1, reg: w11, asm: "VCVTPS2UDQ", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VCVTPS2UDQ512", argLength: 1, reg: w11, asm: "VCVTPS2UDQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VCVTPS2UDQMasked128", argLength: 2, reg: wkw, asm: "VCVTPS2UDQ", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VCVTPS2UDQMasked256", argLength: 2, reg: wkw, asm: "VCVTPS2UDQ", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VCVTPS2UDQMasked512", argLength: 2, reg: wkw, asm: "VCVTPS2UDQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VCVTTPS2DQ128", argLength: 1, reg: v11, asm: "VCVTTPS2DQ", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VCVTTPS2DQ256", argLength: 1, reg: v11, asm: "VCVTTPS2DQ", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VCVTTPS2DQ512", argLength: 1, reg: w11, asm: "VCVTTPS2DQ", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VCVTTPS2DQMasked128", argLength: 2, reg: wkw, asm: "VCVTTPS2DQ", commutative: false, typ: "Vec128", resultInArg0: false}, + {name: "VCVTTPS2DQMasked256", argLength: 2, reg: wkw, asm: "VCVTTPS2DQ", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VCVTTPS2DQMasked512", argLength: 2, reg: wkw, asm: "VCVTTPS2DQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VDIVPD128", argLength: 2, reg: v21, asm: "VDIVPD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VDIVPD256", argLength: 2, reg: v21, asm: "VDIVPD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VDIVPD512", argLength: 2, reg: w21, asm: "VDIVPD", commutative: false, typ: "Vec512", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 654c1ee1718..f1c1246d240 
100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -225,6 +225,18 @@ func simdGenericOps() []opData { {name: "CompressUint64x2", argLength: 2, commutative: false}, {name: "CompressUint64x4", argLength: 2, commutative: false}, {name: "CompressUint64x8", argLength: 2, commutative: false}, + {name: "ConvertToInt32Float32x4", argLength: 1, commutative: false}, + {name: "ConvertToInt32Float32x8", argLength: 1, commutative: false}, + {name: "ConvertToInt32Float32x16", argLength: 1, commutative: false}, + {name: "ConvertToInt32MaskedFloat32x4", argLength: 2, commutative: false}, + {name: "ConvertToInt32MaskedFloat32x8", argLength: 2, commutative: false}, + {name: "ConvertToInt32MaskedFloat32x16", argLength: 2, commutative: false}, + {name: "ConvertToUint32Float32x4", argLength: 1, commutative: false}, + {name: "ConvertToUint32Float32x8", argLength: 1, commutative: false}, + {name: "ConvertToUint32Float32x16", argLength: 1, commutative: false}, + {name: "ConvertToUint32MaskedFloat32x4", argLength: 2, commutative: false}, + {name: "ConvertToUint32MaskedFloat32x8", argLength: 2, commutative: false}, + {name: "ConvertToUint32MaskedFloat32x16", argLength: 2, commutative: false}, {name: "DivFloat32x4", argLength: 2, commutative: false}, {name: "DivFloat32x8", argLength: 2, commutative: false}, {name: "DivFloat32x16", argLength: 2, commutative: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 89e0d853dcb..b9dc41e8607 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1230,6 +1230,18 @@ const ( OpAMD64VCOMPRESSPSMasked128 OpAMD64VCOMPRESSPSMasked256 OpAMD64VCOMPRESSPSMasked512 + OpAMD64VCVTPS2UDQ128 + OpAMD64VCVTPS2UDQ256 + OpAMD64VCVTPS2UDQ512 + OpAMD64VCVTPS2UDQMasked128 + OpAMD64VCVTPS2UDQMasked256 + OpAMD64VCVTPS2UDQMasked512 + OpAMD64VCVTTPS2DQ128 + OpAMD64VCVTTPS2DQ256 + OpAMD64VCVTTPS2DQ512 + 
OpAMD64VCVTTPS2DQMasked128 + OpAMD64VCVTTPS2DQMasked256 + OpAMD64VCVTTPS2DQMasked512 OpAMD64VDIVPD128 OpAMD64VDIVPD256 OpAMD64VDIVPD512 @@ -4671,6 +4683,18 @@ const ( OpCompressUint64x2 OpCompressUint64x4 OpCompressUint64x8 + OpConvertToInt32Float32x4 + OpConvertToInt32Float32x8 + OpConvertToInt32Float32x16 + OpConvertToInt32MaskedFloat32x4 + OpConvertToInt32MaskedFloat32x8 + OpConvertToInt32MaskedFloat32x16 + OpConvertToUint32Float32x4 + OpConvertToUint32Float32x8 + OpConvertToUint32Float32x16 + OpConvertToUint32MaskedFloat32x4 + OpConvertToUint32MaskedFloat32x8 + OpConvertToUint32MaskedFloat32x16 OpDivFloat32x4 OpDivFloat32x8 OpDivFloat32x16 @@ -19331,6 +19355,168 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VCVTPS2UDQ128", + argLen: 1, + asm: x86.AVCVTPS2UDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VCVTPS2UDQ256", + argLen: 1, + asm: x86.AVCVTPS2UDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VCVTPS2UDQ512", + argLen: 1, + asm: x86.AVCVTPS2UDQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 
+ }, + }, + }, + { + name: "VCVTPS2UDQMasked128", + argLen: 2, + asm: x86.AVCVTPS2UDQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VCVTPS2UDQMasked256", + argLen: 2, + asm: x86.AVCVTPS2UDQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VCVTPS2UDQMasked512", + argLen: 2, + asm: x86.AVCVTPS2UDQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VCVTTPS2DQ128", + argLen: 1, + asm: x86.AVCVTTPS2DQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VCVTTPS2DQ256", + argLen: 1, + asm: x86.AVCVTTPS2DQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VCVTTPS2DQ512", + argLen: 1, + asm: x86.AVCVTTPS2DQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 
X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VCVTTPS2DQMasked128", + argLen: 2, + asm: x86.AVCVTTPS2DQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VCVTTPS2DQMasked256", + argLen: 2, + asm: x86.AVCVTTPS2DQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VCVTTPS2DQMasked512", + argLen: 2, + asm: x86.AVCVTTPS2DQ, + reg: regInfo{ + inputs: []inputInfo{ + {1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VDIVPD128", argLen: 2, @@ -62407,6 +62593,66 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "ConvertToInt32Float32x4", + argLen: 1, + generic: true, + }, + { + name: "ConvertToInt32Float32x8", + argLen: 1, + generic: true, + }, + { + name: "ConvertToInt32Float32x16", + argLen: 1, + generic: true, + }, + { + name: "ConvertToInt32MaskedFloat32x4", + argLen: 2, + generic: true, + }, + { + name: "ConvertToInt32MaskedFloat32x8", + argLen: 2, + generic: true, + }, + { + name: "ConvertToInt32MaskedFloat32x16", + argLen: 2, + generic: true, + }, + { + name: "ConvertToUint32Float32x4", + argLen: 1, + generic: true, + }, + { + name: "ConvertToUint32Float32x8", + argLen: 1, + generic: true, + }, + { + name: "ConvertToUint32Float32x16", + argLen: 1, + generic: true, + }, + { + name: "ConvertToUint32MaskedFloat32x4", + argLen: 
2, + generic: true, + }, + { + name: "ConvertToUint32MaskedFloat32x8", + argLen: 2, + generic: true, + }, + { + name: "ConvertToUint32MaskedFloat32x16", + argLen: 2, + generic: true, + }, { name: "DivFloat32x4", argLen: 2, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index d9560c55c22..11c7c20db26 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -1267,6 +1267,36 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpConstBool(v) case OpConstNil: return rewriteValueAMD64_OpConstNil(v) + case OpConvertToInt32Float32x16: + v.Op = OpAMD64VCVTTPS2DQ512 + return true + case OpConvertToInt32Float32x4: + v.Op = OpAMD64VCVTTPS2DQ128 + return true + case OpConvertToInt32Float32x8: + v.Op = OpAMD64VCVTTPS2DQ256 + return true + case OpConvertToInt32MaskedFloat32x16: + return rewriteValueAMD64_OpConvertToInt32MaskedFloat32x16(v) + case OpConvertToInt32MaskedFloat32x4: + return rewriteValueAMD64_OpConvertToInt32MaskedFloat32x4(v) + case OpConvertToInt32MaskedFloat32x8: + return rewriteValueAMD64_OpConvertToInt32MaskedFloat32x8(v) + case OpConvertToUint32Float32x16: + v.Op = OpAMD64VCVTPS2UDQ512 + return true + case OpConvertToUint32Float32x4: + v.Op = OpAMD64VCVTPS2UDQ128 + return true + case OpConvertToUint32Float32x8: + v.Op = OpAMD64VCVTPS2UDQ256 + return true + case OpConvertToUint32MaskedFloat32x16: + return rewriteValueAMD64_OpConvertToUint32MaskedFloat32x16(v) + case OpConvertToUint32MaskedFloat32x4: + return rewriteValueAMD64_OpConvertToUint32MaskedFloat32x4(v) + case OpConvertToUint32MaskedFloat32x8: + return rewriteValueAMD64_OpConvertToUint32MaskedFloat32x8(v) case OpCtz16: return rewriteValueAMD64_OpCtz16(v) case OpCtz16NonZero: @@ -31928,6 +31958,102 @@ func rewriteValueAMD64_OpConstNil(v *Value) bool { return true } } +func rewriteValueAMD64_OpConvertToInt32MaskedFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + 
b := v.Block + // match: (ConvertToInt32MaskedFloat32x16 x mask) + // result: (VCVTTPS2DQMasked512 x (VPMOVVec32x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VCVTTPS2DQMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpConvertToInt32MaskedFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ConvertToInt32MaskedFloat32x4 x mask) + // result: (VCVTTPS2DQMasked128 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VCVTTPS2DQMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpConvertToInt32MaskedFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ConvertToInt32MaskedFloat32x8 x mask) + // result: (VCVTTPS2DQMasked256 x (VPMOVVec32x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VCVTTPS2DQMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpConvertToUint32MaskedFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ConvertToUint32MaskedFloat32x16 x mask) + // result: (VCVTPS2UDQMasked512 x (VPMOVVec32x16ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VCVTPS2UDQMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpConvertToUint32MaskedFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ConvertToUint32MaskedFloat32x4 x mask) + // result: (VCVTPS2UDQMasked128 x (VPMOVVec32x4ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VCVTPS2UDQMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, 
types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpConvertToUint32MaskedFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ConvertToUint32MaskedFloat32x8 x mask) + // result: (VCVTPS2UDQMasked256 x (VPMOVVec32x8ToM mask)) + for { + x := v_0 + mask := v_1 + v.reset(OpAMD64VCVTPS2UDQMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} func rewriteValueAMD64_OpCtz16(v *Value) bool { v_0 := v.Args[0] b := v.Block diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index cf2e7fc6764..a8a2ff91420 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -245,6 +245,18 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.ConvertToInt32", opLen1(ssa.OpConvertToInt32Float32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.ConvertToInt32", opLen1(ssa.OpConvertToInt32Float32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.ConvertToInt32", opLen1(ssa.OpConvertToInt32Float32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.ConvertToInt32Masked", opLen2(ssa.OpConvertToInt32MaskedFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.ConvertToInt32Masked", opLen2(ssa.OpConvertToInt32MaskedFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.ConvertToInt32Masked", opLen2(ssa.OpConvertToInt32MaskedFloat32x16, types.TypeVec512), sys.AMD64) + 
addF(simdPackage, "Float32x4.ConvertToUint32", opLen1(ssa.OpConvertToUint32Float32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.ConvertToUint32", opLen1(ssa.OpConvertToUint32Float32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.ConvertToUint32", opLen1(ssa.OpConvertToUint32Float32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) addF(simdPackage, "Float32x8.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) addF(simdPackage, "Float32x16.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index 318883ea19c..8d941360907 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -1446,6 +1446,86 @@ func (x Uint64x4) Compress(mask Mask64x4) Uint64x4 // Asm: VPCOMPRESSQ, CPU Feature: AVX512F func (x Uint64x8) Compress(mask Mask64x8) Uint64x8 +/* ConvertToInt32 */ + +// ConvertToInt32 converts element values to int32. +// +// Asm: VCVTTPS2DQ, CPU Feature: AVX +func (x Float32x4) ConvertToInt32() Int32x4 + +// ConvertToInt32 converts element values to int32. +// +// Asm: VCVTTPS2DQ, CPU Feature: AVX +func (x Float32x8) ConvertToInt32() Int32x8 + +// ConvertToInt32 converts element values to int32. 
+// +// Asm: VCVTTPS2DQ, CPU Feature: AVX512F +func (x Float32x16) ConvertToInt32() Int32x16 + +/* ConvertToInt32Masked */ + +// ConvertToInt32Masked converts element values to int32. +// +// This operation is applied selectively under a write mask. +// +// Asm: VCVTTPS2DQ, CPU Feature: AVX512F +func (x Float32x4) ConvertToInt32Masked(mask Mask32x4) Int32x4 + +// ConvertToInt32Masked converts element values to int32. +// +// This operation is applied selectively under a write mask. +// +// Asm: VCVTTPS2DQ, CPU Feature: AVX512F +func (x Float32x8) ConvertToInt32Masked(mask Mask32x8) Int32x8 + +// ConvertToInt32Masked converts element values to int32. +// +// This operation is applied selectively under a write mask. +// +// Asm: VCVTTPS2DQ, CPU Feature: AVX512F +func (x Float32x16) ConvertToInt32Masked(mask Mask32x16) Int32x16 + +/* ConvertToUint32 */ + +// ConvertToUint32 converts element values to uint32. +// +// Asm: VCVTPS2UDQ, CPU Feature: AVX512F +func (x Float32x4) ConvertToUint32() Uint32x4 + +// ConvertToUint32 converts element values to uint32. +// +// Asm: VCVTPS2UDQ, CPU Feature: AVX512F +func (x Float32x8) ConvertToUint32() Uint32x8 + +// ConvertToUint32 converts element values to uint32. +// +// Asm: VCVTPS2UDQ, CPU Feature: AVX512F +func (x Float32x16) ConvertToUint32() Uint32x16 + +/* ConvertToUint32Masked */ + +// ConvertToUint32Masked converts element values to uint32. +// +// This operation is applied selectively under a write mask. +// +// Asm: VCVTPS2UDQ, CPU Feature: AVX512F +func (x Float32x4) ConvertToUint32Masked(mask Mask32x4) Uint32x4 + +// ConvertToUint32Masked converts element values to uint32. +// +// This operation is applied selectively under a write mask. +// +// Asm: VCVTPS2UDQ, CPU Feature: AVX512F +func (x Float32x8) ConvertToUint32Masked(mask Mask32x8) Uint32x8 + +// ConvertToUint32Masked converts element values to uint32. +// +// This operation is applied selectively under a write mask. 
+// +// Asm: VCVTPS2UDQ, CPU Feature: AVX512F +func (x Float32x16) ConvertToUint32Masked(mask Mask32x16) Uint32x16 + /* DiffWithCeilWithPrecision */ // DiffWithCeilWithPrecision computes the difference after ceiling with specified precision.