[dev.simd] cmd/compile, simd: complete AVX2? u?int shuffles

The names follow this convention:
- If the indices come from a constant, append "Constant" to the name.
- If the indices are used by multiple groups, append "Grouped" to the
  name.
- If it indexes only the low part, append "Lo"; similarly "Hi" for the high part.

Change-Id: I6a58f5dae54c882ebd59f39b5288f6f3f14d957f
Reviewed-on: https://go-review.googlesource.com/c/go/+/698296
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
Junyang Shao 2025-08-21 20:37:57 +00:00
parent fa1e78c9ad
commit baea0c700b
10 changed files with 1050 additions and 2 deletions

View file

@ -346,6 +346,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPERMQ256, ssa.OpAMD64VPERMQ256,
ssa.OpAMD64VPERMPD512, ssa.OpAMD64VPERMPD512,
ssa.OpAMD64VPERMQ512, ssa.OpAMD64VPERMQ512,
ssa.OpAMD64VPSHUFB256,
ssa.OpAMD64VPSHUFB512,
ssa.OpAMD64VPROLVD128, ssa.OpAMD64VPROLVD128,
ssa.OpAMD64VPROLVD256, ssa.OpAMD64VPROLVD256,
ssa.OpAMD64VPROLVD512, ssa.OpAMD64VPROLVD512,
@ -606,6 +608,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPORQMasked128, ssa.OpAMD64VPORQMasked128,
ssa.OpAMD64VPORQMasked256, ssa.OpAMD64VPORQMasked256,
ssa.OpAMD64VPORQMasked512, ssa.OpAMD64VPORQMasked512,
ssa.OpAMD64VPSHUFBMasked256,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128, ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPERMBMasked256, ssa.OpAMD64VPERMBMasked256,
ssa.OpAMD64VPERMBMasked512, ssa.OpAMD64VPERMBMasked512,
@ -903,6 +907,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VEXTRACTF64X4256, ssa.OpAMD64VEXTRACTF64X4256,
ssa.OpAMD64VEXTRACTI128128, ssa.OpAMD64VEXTRACTI128128,
ssa.OpAMD64VEXTRACTI64X4256, ssa.OpAMD64VEXTRACTI64X4256,
ssa.OpAMD64VPSHUFD128,
ssa.OpAMD64VPSHUFD256,
ssa.OpAMD64VPSHUFD512,
ssa.OpAMD64VPSHUFHW128,
ssa.OpAMD64VPSHUFHW256,
ssa.OpAMD64VPSHUFHW512,
ssa.OpAMD64VPROLD128, ssa.OpAMD64VPROLD128,
ssa.OpAMD64VPROLD256, ssa.OpAMD64VPROLD256,
ssa.OpAMD64VPROLD512, ssa.OpAMD64VPROLD512,
@ -956,6 +966,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VREDUCEPDMasked128, ssa.OpAMD64VREDUCEPDMasked128,
ssa.OpAMD64VREDUCEPDMasked256, ssa.OpAMD64VREDUCEPDMasked256,
ssa.OpAMD64VREDUCEPDMasked512, ssa.OpAMD64VREDUCEPDMasked512,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPROLDMasked128, ssa.OpAMD64VPROLDMasked128,
ssa.OpAMD64VPROLDMasked256, ssa.OpAMD64VPROLDMasked256,
ssa.OpAMD64VPROLDMasked512, ssa.OpAMD64VPROLDMasked512,
@ -1682,6 +1698,14 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPERMI2QMasked256, ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2PDMasked512, ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2QMasked512, ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPSHUFBMasked256,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128, ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPERMBMasked256, ssa.OpAMD64VPERMBMasked256,
ssa.OpAMD64VPERMBMasked512, ssa.OpAMD64VPERMBMasked512,

View file

@ -782,6 +782,32 @@
(Permute2Uint64x2 ...) => (VPERMI2Q128 ...) (Permute2Uint64x2 ...) => (VPERMI2Q128 ...)
(Permute2Uint64x4 ...) => (VPERMI2Q256 ...) (Permute2Uint64x4 ...) => (VPERMI2Q256 ...)
(Permute2Uint64x8 ...) => (VPERMI2Q512 ...) (Permute2Uint64x8 ...) => (VPERMI2Q512 ...)
(PermuteConstantInt32x4 ...) => (VPSHUFD128 ...)
(PermuteConstantUint32x4 ...) => (VPSHUFD128 ...)
(PermuteConstantGroupedInt32x8 ...) => (VPSHUFD256 ...)
(PermuteConstantGroupedInt32x16 ...) => (VPSHUFD512 ...)
(PermuteConstantGroupedUint32x8 ...) => (VPSHUFD256 ...)
(PermuteConstantGroupedUint32x16 ...) => (VPSHUFD512 ...)
(PermuteConstantHiInt16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiInt32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiUint16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiUint32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
(PermuteConstantHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
(PermuteConstantLoInt16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoInt32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoUint16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoUint32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoGroupedInt16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantLoGroupedInt16x32 ...) => (VPSHUFHW512 ...)
(PermuteConstantLoGroupedUint16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantLoGroupedUint16x32 ...) => (VPSHUFHW512 ...)
(PermuteGroupedInt8x32 ...) => (VPSHUFB256 ...)
(PermuteGroupedInt8x64 ...) => (VPSHUFB512 ...)
(PermuteGroupedUint8x32 ...) => (VPSHUFB256 ...)
(PermuteGroupedUint8x64 ...) => (VPSHUFB512 ...)
(ReciprocalFloat32x4 ...) => (VRCPPS128 ...) (ReciprocalFloat32x4 ...) => (VRCPPS128 ...)
(ReciprocalFloat32x8 ...) => (VRCPPS256 ...) (ReciprocalFloat32x8 ...) => (VRCPPS256 ...)
(ReciprocalFloat32x16 ...) => (VRCP14PS512 ...) (ReciprocalFloat32x16 ...) => (VRCP14PS512 ...)
@ -1317,6 +1343,9 @@
(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask) (VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask) (VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask) (VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
(VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask) (VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask)
(VMOVDQU16Masked512 (VPERMW512 x y) mask) => (VPERMWMasked512 x y mask) (VMOVDQU16Masked512 (VPERMW512 x y) mask) => (VPERMWMasked512 x y mask)
(VMOVDQU32Masked512 (VPERMPS512 x y) mask) => (VPERMPSMasked512 x y mask) (VMOVDQU32Masked512 (VPERMPS512 x y) mask) => (VPERMPSMasked512 x y mask)

View file

@ -816,7 +816,11 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHRDVWMasked256", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPSHRDVWMasked256", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSHRDVWMasked512", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPSHRDVWMasked512", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPSHUFB128", argLength: 2, reg: v21, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHUFB128", argLength: 2, reg: v21, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFB256", argLength: 2, reg: v21, asm: "VPSHUFB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFB512", argLength: 2, reg: w21, asm: "VPSHUFB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHUFBMasked128", argLength: 3, reg: w2kw, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHUFBMasked128", argLength: 3, reg: w2kw, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFBMasked256", argLength: 3, reg: w2kw, asm: "VPSHUFB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFBMasked512", argLength: 3, reg: w2kw, asm: "VPSHUFB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSIGNB128", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSIGNB128", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSIGNB256", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSIGNB256", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSIGND128", argLength: 2, reg: v21, asm: "VPSIGND", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSIGND128", argLength: 2, reg: v21, asm: "VPSIGND", commutative: false, typ: "Vec128", resultInArg0: false},
@ -1141,6 +1145,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPCMPW512", argLength: 2, reg: w2k, asm: "VPCMPW", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPW512", argLength: 2, reg: w2k, asm: "VPCMPW", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPD512", argLength: 2, reg: w2k, asm: "VPCMPD", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPD512", argLength: 2, reg: w2k, asm: "VPCMPD", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPQ512", argLength: 2, reg: w2k, asm: "VPCMPQ", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPQ512", argLength: 2, reg: w2k, asm: "VPCMPQ", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPSHUFD128", argLength: 1, reg: v11, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFD256", argLength: 1, reg: v11, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFD512", argLength: 1, reg: w11, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHUFDMasked256", argLength: 2, reg: wkw, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFDMasked512", argLength: 2, reg: wkw, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHUFHW128", argLength: 1, reg: w11, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFHW256", argLength: 1, reg: v11, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFHW512", argLength: 1, reg: w11, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHUFHWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFHWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHUFHWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFDMasked128", argLength: 2, reg: wkw, asm: "VPSHUFD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPROLD128", argLength: 1, reg: w11, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPROLD128", argLength: 1, reg: w11, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPROLD256", argLength: 1, reg: w11, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPROLD256", argLength: 1, reg: w11, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPROLD512", argLength: 1, reg: w11, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPROLD512", argLength: 1, reg: w11, asm: "VPROLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},

View file

@ -726,6 +726,10 @@ func simdGenericOps() []opData {
{name: "PermuteFloat32x16", argLength: 2, commutative: false}, {name: "PermuteFloat32x16", argLength: 2, commutative: false},
{name: "PermuteFloat64x4", argLength: 2, commutative: false}, {name: "PermuteFloat64x4", argLength: 2, commutative: false},
{name: "PermuteFloat64x8", argLength: 2, commutative: false}, {name: "PermuteFloat64x8", argLength: 2, commutative: false},
{name: "PermuteGroupedInt8x32", argLength: 2, commutative: false},
{name: "PermuteGroupedInt8x64", argLength: 2, commutative: false},
{name: "PermuteGroupedUint8x32", argLength: 2, commutative: false},
{name: "PermuteGroupedUint8x64", argLength: 2, commutative: false},
{name: "PermuteInt8x16", argLength: 2, commutative: false}, {name: "PermuteInt8x16", argLength: 2, commutative: false},
{name: "PermuteInt8x32", argLength: 2, commutative: false}, {name: "PermuteInt8x32", argLength: 2, commutative: false},
{name: "PermuteInt8x64", argLength: 2, commutative: false}, {name: "PermuteInt8x64", argLength: 2, commutative: false},
@ -1089,6 +1093,28 @@ func simdGenericOps() []opData {
{name: "GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"},

View file

@ -2039,7 +2039,11 @@ const (
OpAMD64VPSHRDVWMasked256 OpAMD64VPSHRDVWMasked256
OpAMD64VPSHRDVWMasked512 OpAMD64VPSHRDVWMasked512
OpAMD64VPSHUFB128 OpAMD64VPSHUFB128
OpAMD64VPSHUFB256
OpAMD64VPSHUFB512
OpAMD64VPSHUFBMasked128 OpAMD64VPSHUFBMasked128
OpAMD64VPSHUFBMasked256
OpAMD64VPSHUFBMasked512
OpAMD64VPSIGNB128 OpAMD64VPSIGNB128
OpAMD64VPSIGNB256 OpAMD64VPSIGNB256
OpAMD64VPSIGND128 OpAMD64VPSIGND128
@ -2364,6 +2368,18 @@ const (
OpAMD64VPCMPW512 OpAMD64VPCMPW512
OpAMD64VPCMPD512 OpAMD64VPCMPD512
OpAMD64VPCMPQ512 OpAMD64VPCMPQ512
OpAMD64VPSHUFD128
OpAMD64VPSHUFD256
OpAMD64VPSHUFD512
OpAMD64VPSHUFDMasked256
OpAMD64VPSHUFDMasked512
OpAMD64VPSHUFHW128
OpAMD64VPSHUFHW256
OpAMD64VPSHUFHW512
OpAMD64VPSHUFHWMasked256
OpAMD64VPSHUFHWMasked512
OpAMD64VPSHUFHWMasked128
OpAMD64VPSHUFDMasked128
OpAMD64VPROLD128 OpAMD64VPROLD128
OpAMD64VPROLD256 OpAMD64VPROLD256
OpAMD64VPROLD512 OpAMD64VPROLD512
@ -5505,6 +5521,10 @@ const (
OpPermuteFloat32x16 OpPermuteFloat32x16
OpPermuteFloat64x4 OpPermuteFloat64x4
OpPermuteFloat64x8 OpPermuteFloat64x8
OpPermuteGroupedInt8x32
OpPermuteGroupedInt8x64
OpPermuteGroupedUint8x32
OpPermuteGroupedUint8x64
OpPermuteInt8x16 OpPermuteInt8x16
OpPermuteInt8x32 OpPermuteInt8x32
OpPermuteInt8x64 OpPermuteInt8x64
@ -5868,6 +5888,28 @@ const (
OpGetElemUint16x8 OpGetElemUint16x8
OpGetElemUint32x4 OpGetElemUint32x4
OpGetElemUint64x2 OpGetElemUint64x2
OpPermuteConstantGroupedInt32x8
OpPermuteConstantGroupedInt32x16
OpPermuteConstantGroupedUint32x8
OpPermuteConstantGroupedUint32x16
OpPermuteConstantHiGroupedInt16x16
OpPermuteConstantHiGroupedInt16x32
OpPermuteConstantHiGroupedUint16x16
OpPermuteConstantHiGroupedUint16x32
OpPermuteConstantHiInt16x8
OpPermuteConstantHiInt32x4
OpPermuteConstantHiUint16x8
OpPermuteConstantHiUint32x4
OpPermuteConstantInt32x4
OpPermuteConstantLoGroupedInt16x16
OpPermuteConstantLoGroupedInt16x32
OpPermuteConstantLoGroupedUint16x16
OpPermuteConstantLoGroupedUint16x32
OpPermuteConstantLoInt16x8
OpPermuteConstantLoInt32x4
OpPermuteConstantLoUint16x8
OpPermuteConstantLoUint32x4
OpPermuteConstantUint32x4
OpRotateAllLeftInt32x4 OpRotateAllLeftInt32x4
OpRotateAllLeftInt32x8 OpRotateAllLeftInt32x8
OpRotateAllLeftInt32x16 OpRotateAllLeftInt32x16
@ -31031,6 +31073,34 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "VPSHUFB256",
argLen: 2,
asm: x86.AVPSHUFB,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPSHUFB512",
argLen: 2,
asm: x86.AVPSHUFB,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{ {
name: "VPSHUFBMasked128", name: "VPSHUFBMasked128",
argLen: 3, argLen: 3,
@ -31046,6 +31116,36 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "VPSHUFBMasked256",
argLen: 3,
asm: x86.AVPSHUFB,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPSHUFBMasked512",
argLen: 3,
asm: x86.AVPSHUFB,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{ {
name: "VPSIGNB128", name: "VPSIGNB128",
argLen: 2, argLen: 2,
@ -35810,6 +35910,180 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "VPSHUFD128",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFD,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPSHUFD256",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFD,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPSHUFD512",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFD,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFDMasked256",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFD,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPSHUFDMasked512",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFD,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPSHUFHW128",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFHW,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFHW256",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFHW,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPSHUFHW512",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFHW,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFHWMasked256",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFHW,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPSHUFHWMasked512",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFHW,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPSHUFHWMasked128",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFHW,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPSHUFDMasked128",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFD,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{ {
name: "VPROLD128", name: "VPROLD128",
auxType: auxUInt8, auxType: auxUInt8,
@ -69053,6 +69327,26 @@ var opcodeTable = [...]opInfo{
argLen: 2, argLen: 2,
generic: true, generic: true,
}, },
{
name: "PermuteGroupedInt8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedInt8x64",
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedUint8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedUint8x64",
argLen: 2,
generic: true,
},
{ {
name: "PermuteInt8x16", name: "PermuteInt8x16",
argLen: 2, argLen: 2,
@ -70932,6 +71226,138 @@ var opcodeTable = [...]opInfo{
argLen: 1, argLen: 1,
generic: true, generic: true,
}, },
{
name: "PermuteConstantGroupedInt32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedInt32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedUint32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedUint32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{ {
name: "RotateAllLeftInt32x4", name: "RotateAllLeftInt32x4",
auxType: auxUInt8, auxType: auxUInt8,

View file

@ -3223,6 +3223,72 @@ func rewriteValueAMD64(v *Value) bool {
case OpPermute2Uint8x64: case OpPermute2Uint8x64:
v.Op = OpAMD64VPERMI2B512 v.Op = OpAMD64VPERMI2B512
return true return true
case OpPermuteConstantGroupedInt32x16:
v.Op = OpAMD64VPSHUFD512
return true
case OpPermuteConstantGroupedInt32x8:
v.Op = OpAMD64VPSHUFD256
return true
case OpPermuteConstantGroupedUint32x16:
v.Op = OpAMD64VPSHUFD512
return true
case OpPermuteConstantGroupedUint32x8:
v.Op = OpAMD64VPSHUFD256
return true
case OpPermuteConstantHiGroupedInt16x16:
v.Op = OpAMD64VPSHUFHW256
return true
case OpPermuteConstantHiGroupedInt16x32:
v.Op = OpAMD64VPSHUFHW512
return true
case OpPermuteConstantHiGroupedUint16x16:
v.Op = OpAMD64VPSHUFHW256
return true
case OpPermuteConstantHiGroupedUint16x32:
v.Op = OpAMD64VPSHUFHW512
return true
case OpPermuteConstantHiInt16x8:
v.Op = OpAMD64VPSHUFHW128
return true
case OpPermuteConstantHiInt32x4:
v.Op = OpAMD64VPSHUFHW128
return true
case OpPermuteConstantHiUint16x8:
v.Op = OpAMD64VPSHUFHW128
return true
case OpPermuteConstantHiUint32x4:
v.Op = OpAMD64VPSHUFHW128
return true
case OpPermuteConstantInt32x4:
v.Op = OpAMD64VPSHUFD128
return true
case OpPermuteConstantLoGroupedInt16x16:
v.Op = OpAMD64VPSHUFHW256
return true
case OpPermuteConstantLoGroupedInt16x32:
v.Op = OpAMD64VPSHUFHW512
return true
case OpPermuteConstantLoGroupedUint16x16:
v.Op = OpAMD64VPSHUFHW256
return true
case OpPermuteConstantLoGroupedUint16x32:
v.Op = OpAMD64VPSHUFHW512
return true
case OpPermuteConstantLoInt16x8:
v.Op = OpAMD64VPSHUFHW128
return true
case OpPermuteConstantLoInt32x4:
v.Op = OpAMD64VPSHUFHW128
return true
case OpPermuteConstantLoUint16x8:
v.Op = OpAMD64VPSHUFHW128
return true
case OpPermuteConstantLoUint32x4:
v.Op = OpAMD64VPSHUFHW128
return true
case OpPermuteConstantUint32x4:
v.Op = OpAMD64VPSHUFD128
return true
case OpPermuteFloat32x16: case OpPermuteFloat32x16:
v.Op = OpAMD64VPERMPS512 v.Op = OpAMD64VPERMPS512
return true return true
@ -3235,6 +3301,18 @@ func rewriteValueAMD64(v *Value) bool {
case OpPermuteFloat64x8: case OpPermuteFloat64x8:
v.Op = OpAMD64VPERMPD512 v.Op = OpAMD64VPERMPD512
return true return true
case OpPermuteGroupedInt8x32:
v.Op = OpAMD64VPSHUFB256
return true
case OpPermuteGroupedInt8x64:
v.Op = OpAMD64VPSHUFB512
return true
case OpPermuteGroupedUint8x32:
v.Op = OpAMD64VPSHUFB256
return true
case OpPermuteGroupedUint8x64:
v.Op = OpAMD64VPSHUFB512
return true
case OpPermuteInt16x16: case OpPermuteInt16x16:
v.Op = OpAMD64VPERMW256 v.Op = OpAMD64VPERMW256
return true return true
@ -26618,6 +26696,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU16Masked512(v *Value) bool {
v.AddArg4(x, y, z, mask) v.AddArg4(x, y, z, mask)
return true return true
} }
// match: (VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask)
// result: (VPSHUFHWMasked512 [a] x mask)
for {
if v_0.Op != OpAMD64VPSHUFHW512 {
break
}
a := auxIntToUint8(v_0.AuxInt)
x := v_0.Args[0]
mask := v_1
v.reset(OpAMD64VPSHUFHWMasked512)
v.AuxInt = uint8ToAuxInt(a)
v.AddArg2(x, mask)
return true
}
// match: (VMOVDQU16Masked512 (VPERMW512 x y) mask) // match: (VMOVDQU16Masked512 (VPERMW512 x y) mask)
// result: (VPERMWMasked512 x y mask) // result: (VPERMWMasked512 x y mask)
for { for {
@ -27311,6 +27403,20 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool {
v.AddArg4(x, y, z, mask) v.AddArg4(x, y, z, mask)
return true return true
} }
// match: (VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask)
// result: (VPSHUFDMasked512 [a] x mask)
for {
if v_0.Op != OpAMD64VPSHUFD512 {
break
}
a := auxIntToUint8(v_0.AuxInt)
x := v_0.Args[0]
mask := v_1
v.reset(OpAMD64VPSHUFDMasked512)
v.AuxInt = uint8ToAuxInt(a)
v.AddArg2(x, mask)
return true
}
// match: (VMOVDQU32Masked512 (VPERMPS512 x y) mask) // match: (VMOVDQU32Masked512 (VPERMPS512 x y) mask)
// result: (VPERMPSMasked512 x y mask) // result: (VPERMPSMasked512 x y mask)
for { for {
@ -28610,6 +28716,19 @@ func rewriteValueAMD64_OpAMD64VMOVDQU8Masked512(v *Value) bool {
v.AddArg4(x, y, z, mask) v.AddArg4(x, y, z, mask)
return true return true
} }
// match: (VMOVDQU8Masked512 (VPSHUFB512 x y) mask)
// result: (VPSHUFBMasked512 x y mask)
for {
if v_0.Op != OpAMD64VPSHUFB512 {
break
}
y := v_0.Args[1]
x := v_0.Args[0]
mask := v_1
v.reset(OpAMD64VPSHUFBMasked512)
v.AddArg3(x, y, mask)
return true
}
// match: (VMOVDQU8Masked512 (VPERMB512 x y) mask) // match: (VMOVDQU8Masked512 (VPERMB512 x y) mask)
// result: (VPERMBMasked512 x y mask) // result: (VPERMBMasked512 x y mask)
for { for {

View file

@ -794,6 +794,32 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64)

View file

@ -74,4 +74,32 @@
commutative: false commutative: false
documentation: !string |- documentation: !string |-
// NAME copies element zero of its (128-bit) input to all elements of // NAME copies element zero of its (128-bit) input to all elements of
// the 512-bit output vector. // the 512-bit output vector.
# Shuffle-style permutation categories. Naming convention for this family:
#   "Constant" - the indices come from a constant operand.
#   "Grouped"  - the same indices are used by multiple groups of the vector.
#   "Lo"/"Hi"  - only the low/high part is indexed.
# Each entry only supplies the first documentation line; the per-op details
# are appended by the matching op definitions.
- go: PermuteGrouped
  commutative: false
  documentation: !string |- # Detailed documentation will rely on the specific ops.
    // NAME performs a grouped permutation of vector x using indices:
- go: PermuteConstant
  commutative: false
  documentation: !string |- # Detailed documentation will rely on the specific ops.
    // NAME performs a permutation of vector x using constant indices:
- go: PermuteConstantGrouped
  commutative: false
  documentation: !string |- # Detailed documentation will rely on the specific ops.
    // NAME performs a grouped permutation of vector x using constant indices:
- go: PermuteConstantLo
  commutative: false
  documentation: !string |- # Detailed documentation will rely on the specific ops.
    // NAME performs a permutation of vector x using constant indices:
- go: PermuteConstantLoGrouped
  commutative: false
  documentation: !string |- # Detailed documentation will rely on the specific ops.
    // NAME performs a grouped permutation of vector x using constant indices:
- go: PermuteConstantHi
  commutative: false
  documentation: !string |- # Detailed documentation will rely on the specific ops.
    // NAME performs a permutation of vector x using constant indices:
- go: PermuteConstantHiGrouped
  commutative: false
  documentation: !string |- # Detailed documentation will rely on the specific ops.
    // NAME performs a grouped permutation of vector x using constant indices:

View file

@ -432,4 +432,98 @@
go: $t go: $t
name: indices name: indices
out: out:
- *128any - *128any
# PermuteGrouped: VPSHUFB with a vector of per-element indices, applied to
# 256-bit and 512-bit vectors. This entry also defines the &256Or512any anchor
# reused by the grouped constant permutes below.
- go: PermuteGrouped
  asm: VPSHUFB
  addDoc: !string |-
    // result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
    // Only the needed bits to represent the index of a group of x are used in indices' elements.
    // However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
    // Each group is of size 128-bit.
  in:
  - &256Or512any
    bits: "256|512"
    go: $t
  - bits: "256|512"
    go: $t
    name: indices
  out:
  - *256Or512any
# PermuteConstant: VPSHUFD on a 128-bit vector; the immediate carries four
# packed 2-bit indices.
- go: PermuteConstant
  asm: VPSHUFD
  addDoc: !string |-
    // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
    // Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
  in:
  - *128any
  - class: immediate
    immOffset: 0
    name: indices
  out:
  - *128any
# PermuteConstantGrouped: VPSHUFD on 256-bit and 512-bit vectors; the same four
# packed 2-bit indices are applied to every 128-bit group.
- go: PermuteConstantGrouped
  asm: VPSHUFD
  addDoc: !string |-
    // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
    // Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
    // Each group is of size 128-bit.
  in:
  - *256Or512any
  - class: immediate
    immOffset: 0
    name: indices
  out:
  - *256Or512any
# PermuteConstantLo: shuffle of the low four words of a 128-bit vector.
# This must be VPSHUFLW (shuffle packed LOW words). VPSHUFHW is the high-word
# shuffle used by PermuteConstantHi and would make Lo behave identically to Hi.
- go: PermuteConstantLo
  asm: VPSHUFLW
  addDoc: !string |-
    // result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
    // Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
    // The high 64 bits of x are copied to the result unchanged.
  in:
  - *128any
  - class: immediate
    immOffset: 0
    name: indices
  out:
  - *128any
# PermuteConstantLoGrouped: shuffle of the low four words of every 128-bit
# group. This must be VPSHUFLW (shuffle packed LOW words). VPSHUFHW is the
# high-word shuffle used by PermuteConstantHiGrouped.
- go: PermuteConstantLoGrouped
  asm: VPSHUFLW
  addDoc: !string |-
    // result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
    // Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
    // The high 64 bits of each group are copied to the result unchanged.
    // Each group is of size 128-bit.
  in:
  - *256Or512any
  - class: immediate
    immOffset: 0
    name: indices
  out:
  - *256Or512any
# PermuteConstantHi: VPSHUFHW, shuffle of the high four words of a 128-bit
# vector (hence the +4 offset applied to every 2-bit index).
- go: PermuteConstantHi
  asm: VPSHUFHW
  addDoc: !string |-
    // result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
    // Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
    // The low 64 bits of x are copied to the result unchanged.
  in:
  - *128any
  - class: immediate
    immOffset: 0
    name: indices
  out:
  - *128any
# PermuteConstantHiGrouped: VPSHUFHW, shuffle of the high four words of every
# 128-bit group (hence the +4 offset applied to every 2-bit index).
- go: PermuteConstantHiGrouped
  asm: VPSHUFHW
  addDoc: !string |-
    // result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
    // Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
    // The low 64 bits of each group are copied to the result unchanged.
    // Each group is of size 128-bit.
  in:
  - *256Or512any
  - class: immediate
    immOffset: 0
    name: indices
  out:
  - *256Or512any

View file

@ -4564,6 +4564,266 @@ func (x Int64x8) Permute2(y Int64x8, indices Uint64x8) Int64x8
// Asm: VPERMI2Q, CPU Feature: AVX512 // Asm: VPERMI2Q, CPU Feature: AVX512
func (x Uint64x8) Permute2(y Uint64x8, indices Uint64x8) Uint64x8 func (x Uint64x8) Permute2(y Uint64x8, indices Uint64x8) Uint64x8
/* PermuteConstant */

// PermuteConstant performs a permutation of vector x using constant indices:
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Int32x4) PermuteConstant(indices uint8) Int32x4

// PermuteConstant performs a permutation of vector x using constant indices:
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Uint32x4) PermuteConstant(indices uint8) Uint32x4
/* PermuteConstantGrouped */

// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Int32x8) PermuteConstantGrouped(indices uint8) Int32x8

// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Int32x16) PermuteConstantGrouped(indices uint8) Int32x16

// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Uint32x8) PermuteConstantGrouped(indices uint8) Uint32x8

// PermuteConstantGrouped performs a grouped permutation of vector x using constant indices:
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Uint32x16) PermuteConstantGrouped(indices uint8) Uint32x16
/* PermuteConstantHi */

// PermuteConstantHi performs a permutation of vector x using constant indices:
// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x8) PermuteConstantHi(indices uint8) Int16x8

// PermuteConstantHi performs a permutation of vector x using constant indices:
// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX
func (x Int32x4) PermuteConstantHi(indices uint8) Int32x4

// PermuteConstantHi performs a permutation of vector x using constant indices:
// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x8) PermuteConstantHi(indices uint8) Uint16x8

// PermuteConstantHi performs a permutation of vector x using constant indices:
// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX
func (x Uint32x4) PermuteConstantHi(indices uint8) Uint32x4
/* PermuteConstantHiGrouped */

// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Int16x16) PermuteConstantHiGrouped(indices uint8) Int16x16

// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x32) PermuteConstantHiGrouped(indices uint8) Int16x32

// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Uint16x16) PermuteConstantHiGrouped(indices uint8) Uint16x16

// PermuteConstantHiGrouped performs a grouped permutation of vector x using constant indices:
// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x32) PermuteConstantHiGrouped(indices uint8) Uint16x32
/* PermuteConstantLo */

// NOTE(review): the Asm lines in this section name VPSHUFHW, the same
// instruction the PermuteConstantHi methods use, while the documented result
// indexes the low elements (no +4 offset). A low-half shuffle presumably wants
// VPSHUFLW; confirm against the generator input before relying on these.

// PermuteConstantLo performs a permutation of vector x using constant indices:
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x8) PermuteConstantLo(indices uint8) Int16x8

// PermuteConstantLo performs a permutation of vector x using constant indices:
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX
func (x Int32x4) PermuteConstantLo(indices uint8) Int32x4

// PermuteConstantLo performs a permutation of vector x using constant indices:
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x8) PermuteConstantLo(indices uint8) Uint16x8

// PermuteConstantLo performs a permutation of vector x using constant indices:
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX
func (x Uint32x4) PermuteConstantLo(indices uint8) Uint32x4
/* PermuteConstantLoGrouped */

// NOTE(review): the Asm lines in this section name VPSHUFHW, the same
// instruction the PermuteConstantHiGrouped methods use, while the documented
// result indexes the low elements of each group (no +4 offset). A low-half
// shuffle presumably wants VPSHUFLW; confirm against the generator input.

// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Int16x16) PermuteConstantLoGrouped(indices uint8) Int16x16

// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x32) PermuteConstantLoGrouped(indices uint8) Int16x32

// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Uint16x16) PermuteConstantLoGrouped(indices uint8) Uint16x16

// PermuteConstantLoGrouped performs a grouped permutation of vector x using constant indices:
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are 2-bit unsigned index values packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x32) PermuteConstantLoGrouped(indices uint8) Uint16x32
/* PermuteGrouped */

// PermuteGrouped performs a grouped permutation of vector x using indices:
// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
// Only the needed bits to represent the index of a group of x are used in indices' elements.
// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
// Each group is of size 128-bit.
//
// Asm: VPSHUFB, CPU Feature: AVX2
func (x Int8x32) PermuteGrouped(indices Int8x32) Int8x32

// PermuteGrouped performs a grouped permutation of vector x using indices:
// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
// Only the needed bits to represent the index of a group of x are used in indices' elements.
// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
// Each group is of size 128-bit.
//
// Asm: VPSHUFB, CPU Feature: AVX512
func (x Int8x64) PermuteGrouped(indices Int8x64) Int8x64

// PermuteGrouped performs a grouped permutation of vector x using indices:
// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
// Only the needed bits to represent the index of a group of x are used in indices' elements.
// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
// Each group is of size 128-bit.
//
// Asm: VPSHUFB, CPU Feature: AVX2
func (x Uint8x32) PermuteGrouped(indices Uint8x32) Uint8x32

// PermuteGrouped performs a grouped permutation of vector x using indices:
// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
// Only the needed bits to represent the index of a group of x are used in indices' elements.
// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
// Each group is of size 128-bit.
//
// Asm: VPSHUFB, CPU Feature: AVX512
func (x Uint8x64) PermuteGrouped(indices Uint8x64) Uint8x64
/* Reciprocal */ /* Reciprocal */
// Reciprocal computes an approximate reciprocal of each element. // Reciprocal computes an approximate reciprocal of each element.