[dev.simd] cmd/compile, simd: add variable Permute

This CL also adds tests for the new Permute operations. This CL is generated by CL 687919. Change-Id: I9ddd2cd23bb98ecca91bfbeaffd62faa4bd85e0d Reviewed-on: https://go-review.googlesource.com/c/go/+/687939 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2025-12-08 06:10:04 +00:00 · 2025-07-14 19:39:44 +00:00 · 2025-07-14 19:39:44 +00:00 · 01f7f57025
commit 01f7f57025
parent f5f42753ab
11 changed files with 4385 additions and 0 deletions
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@ -233,6 +233,20 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPHSUBW256,
 		ssa.OpAMD64VPHSUBD128,
 		ssa.OpAMD64VPHSUBD256,
 		ssa.OpAMD64VPERMB128,
 		ssa.OpAMD64VPERMB256,
 		ssa.OpAMD64VPERMB512,
 		ssa.OpAMD64VPERMW128,
 		ssa.OpAMD64VPERMW256,
 		ssa.OpAMD64VPERMW512,
 		ssa.OpAMD64VPERMPS256,
 		ssa.OpAMD64VPERMD256,
 		ssa.OpAMD64VPERMPS512,
 		ssa.OpAMD64VPERMD512,
 		ssa.OpAMD64VPERMPD256,
 		ssa.OpAMD64VPERMQ256,
 		ssa.OpAMD64VPERMPD512,
 		ssa.OpAMD64VPERMQ512,
 		ssa.OpAMD64VPROLVD128,
 		ssa.OpAMD64VPROLVD256,
 		ssa.OpAMD64VPROLVD512,
@ -468,6 +482,20 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMADDWDMasked128,
 		ssa.OpAMD64VPMADDWDMasked256,
 		ssa.OpAMD64VPMADDWDMasked512,
 		ssa.OpAMD64VPERMBMasked128,
 		ssa.OpAMD64VPERMBMasked256,
 		ssa.OpAMD64VPERMBMasked512,
 		ssa.OpAMD64VPERMWMasked128,
 		ssa.OpAMD64VPERMWMasked256,
 		ssa.OpAMD64VPERMWMasked512,
 		ssa.OpAMD64VPERMPSMasked256,
 		ssa.OpAMD64VPERMDMasked256,
 		ssa.OpAMD64VPERMPSMasked512,
 		ssa.OpAMD64VPERMDMasked512,
 		ssa.OpAMD64VPERMPDMasked256,
 		ssa.OpAMD64VPERMQMasked256,
 		ssa.OpAMD64VPERMPDMasked512,
 		ssa.OpAMD64VPERMQMasked512,
 		ssa.OpAMD64VPROLVDMasked128,
 		ssa.OpAMD64VPROLVDMasked256,
 		ssa.OpAMD64VPROLVDMasked512,
@ -766,6 +794,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPDPWSSD128,
 		ssa.OpAMD64VPDPWSSD256,
 		ssa.OpAMD64VPDPWSSD512,
 		ssa.OpAMD64VPERMI2B128,
 		ssa.OpAMD64VPERMI2B256,
 		ssa.OpAMD64VPERMI2B512,
 		ssa.OpAMD64VPERMI2W128,
 		ssa.OpAMD64VPERMI2W256,
 		ssa.OpAMD64VPERMI2W512,
 		ssa.OpAMD64VPERMI2PS128,
 		ssa.OpAMD64VPERMI2D128,
 		ssa.OpAMD64VPERMI2PS256,
 		ssa.OpAMD64VPERMI2D256,
 		ssa.OpAMD64VPERMI2PS512,
 		ssa.OpAMD64VPERMI2D512,
 		ssa.OpAMD64VPERMI2PD128,
 		ssa.OpAMD64VPERMI2Q128,
 		ssa.OpAMD64VPERMI2PD256,
 		ssa.OpAMD64VPERMI2Q256,
 		ssa.OpAMD64VPERMI2PD512,
 		ssa.OpAMD64VPERMI2Q512,
 		ssa.OpAMD64VPDPWSSDS128,
 		ssa.OpAMD64VPDPWSSDS256,
 		ssa.OpAMD64VPDPWSSDS512,
@ -816,6 +862,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPDPWSSDMasked128,
 		ssa.OpAMD64VPDPWSSDMasked256,
 		ssa.OpAMD64VPDPWSSDMasked512,
 		ssa.OpAMD64VPERMI2BMasked128,
 		ssa.OpAMD64VPERMI2BMasked256,
 		ssa.OpAMD64VPERMI2BMasked512,
 		ssa.OpAMD64VPERMI2WMasked128,
 		ssa.OpAMD64VPERMI2WMasked256,
 		ssa.OpAMD64VPERMI2WMasked512,
 		ssa.OpAMD64VPERMI2PSMasked128,
 		ssa.OpAMD64VPERMI2DMasked128,
 		ssa.OpAMD64VPERMI2PSMasked256,
 		ssa.OpAMD64VPERMI2DMasked256,
 		ssa.OpAMD64VPERMI2PSMasked512,
 		ssa.OpAMD64VPERMI2DMasked512,
 		ssa.OpAMD64VPERMI2PDMasked128,
 		ssa.OpAMD64VPERMI2QMasked128,
 		ssa.OpAMD64VPERMI2PDMasked256,
 		ssa.OpAMD64VPERMI2QMasked256,
 		ssa.OpAMD64VPERMI2PDMasked512,
 		ssa.OpAMD64VPERMI2QMasked512,
 		ssa.OpAMD64VPDPWSSDSMasked128,
 		ssa.OpAMD64VPDPWSSDSMasked256,
 		ssa.OpAMD64VPDPWSSDSMasked512,
@ -1158,6 +1222,38 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMADDWDMasked128,
 		ssa.OpAMD64VPMADDWDMasked256,
 		ssa.OpAMD64VPMADDWDMasked512,
 		ssa.OpAMD64VPERMI2BMasked128,
 		ssa.OpAMD64VPERMI2BMasked256,
 		ssa.OpAMD64VPERMI2BMasked512,
 		ssa.OpAMD64VPERMI2WMasked128,
 		ssa.OpAMD64VPERMI2WMasked256,
 		ssa.OpAMD64VPERMI2WMasked512,
 		ssa.OpAMD64VPERMI2PSMasked128,
 		ssa.OpAMD64VPERMI2DMasked128,
 		ssa.OpAMD64VPERMI2PSMasked256,
 		ssa.OpAMD64VPERMI2DMasked256,
 		ssa.OpAMD64VPERMI2PSMasked512,
 		ssa.OpAMD64VPERMI2DMasked512,
 		ssa.OpAMD64VPERMI2PDMasked128,
 		ssa.OpAMD64VPERMI2QMasked128,
 		ssa.OpAMD64VPERMI2PDMasked256,
 		ssa.OpAMD64VPERMI2QMasked256,
 		ssa.OpAMD64VPERMI2PDMasked512,
 		ssa.OpAMD64VPERMI2QMasked512,
 		ssa.OpAMD64VPERMBMasked128,
 		ssa.OpAMD64VPERMBMasked256,
 		ssa.OpAMD64VPERMBMasked512,
 		ssa.OpAMD64VPERMWMasked128,
 		ssa.OpAMD64VPERMWMasked256,
 		ssa.OpAMD64VPERMWMasked512,
 		ssa.OpAMD64VPERMPSMasked256,
 		ssa.OpAMD64VPERMDMasked256,
 		ssa.OpAMD64VPERMPSMasked512,
 		ssa.OpAMD64VPERMDMasked512,
 		ssa.OpAMD64VPERMPDMasked256,
 		ssa.OpAMD64VPERMQMasked256,
 		ssa.OpAMD64VPERMPDMasked512,
 		ssa.OpAMD64VPERMQMasked512,
 		ssa.OpAMD64VPOPCNTBMasked128,
 		ssa.OpAMD64VPOPCNTBMasked256,
 		ssa.OpAMD64VPOPCNTBMasked512,
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@ -985,6 +985,114 @@
 (PairwiseSubUint16x16 ...) => (VPHSUBW256 ...)
 (PairwiseSubUint32x4 ...) => (VPHSUBD128 ...)
 (PairwiseSubUint32x8 ...) => (VPHSUBD256 ...)
 (PermuteFloat32x8 ...) => (VPERMPS256 ...)
 (PermuteFloat32x16 ...) => (VPERMPS512 ...)
 (PermuteFloat64x4 ...) => (VPERMPD256 ...)
 (PermuteFloat64x8 ...) => (VPERMPD512 ...)
 (PermuteInt8x16 ...) => (VPERMB128 ...)
 (PermuteInt8x32 ...) => (VPERMB256 ...)
 (PermuteInt8x64 ...) => (VPERMB512 ...)
 (PermuteInt16x8 ...) => (VPERMW128 ...)
 (PermuteInt16x16 ...) => (VPERMW256 ...)
 (PermuteInt16x32 ...) => (VPERMW512 ...)
 (PermuteInt32x8 ...) => (VPERMD256 ...)
 (PermuteInt32x16 ...) => (VPERMD512 ...)
 (PermuteInt64x4 ...) => (VPERMQ256 ...)
 (PermuteInt64x8 ...) => (VPERMQ512 ...)
 (PermuteUint8x16 ...) => (VPERMB128 ...)
 (PermuteUint8x32 ...) => (VPERMB256 ...)
 (PermuteUint8x64 ...) => (VPERMB512 ...)
 (PermuteUint16x8 ...) => (VPERMW128 ...)
 (PermuteUint16x16 ...) => (VPERMW256 ...)
 (PermuteUint16x32 ...) => (VPERMW512 ...)
 (PermuteUint32x8 ...) => (VPERMD256 ...)
 (PermuteUint32x16 ...) => (VPERMD512 ...)
 (PermuteUint64x4 ...) => (VPERMQ256 ...)
 (PermuteUint64x8 ...) => (VPERMQ512 ...)
 (Permute2Float32x4 ...) => (VPERMI2PS128 ...)
 (Permute2Float32x8 ...) => (VPERMI2PS256 ...)
 (Permute2Float32x16 ...) => (VPERMI2PS512 ...)
 (Permute2Float64x2 ...) => (VPERMI2PD128 ...)
 (Permute2Float64x4 ...) => (VPERMI2PD256 ...)
 (Permute2Float64x8 ...) => (VPERMI2PD512 ...)
 (Permute2Int8x16 ...) => (VPERMI2B128 ...)
 (Permute2Int8x32 ...) => (VPERMI2B256 ...)
 (Permute2Int8x64 ...) => (VPERMI2B512 ...)
 (Permute2Int16x8 ...) => (VPERMI2W128 ...)
 (Permute2Int16x16 ...) => (VPERMI2W256 ...)
 (Permute2Int16x32 ...) => (VPERMI2W512 ...)
 (Permute2Int32x4 ...) => (VPERMI2D128 ...)
 (Permute2Int32x8 ...) => (VPERMI2D256 ...)
 (Permute2Int32x16 ...) => (VPERMI2D512 ...)
 (Permute2Int64x2 ...) => (VPERMI2Q128 ...)
 (Permute2Int64x4 ...) => (VPERMI2Q256 ...)
 (Permute2Int64x8 ...) => (VPERMI2Q512 ...)
 (Permute2Uint8x16 ...) => (VPERMI2B128 ...)
 (Permute2Uint8x32 ...) => (VPERMI2B256 ...)
 (Permute2Uint8x64 ...) => (VPERMI2B512 ...)
 (Permute2Uint16x8 ...) => (VPERMI2W128 ...)
 (Permute2Uint16x16 ...) => (VPERMI2W256 ...)
 (Permute2Uint16x32 ...) => (VPERMI2W512 ...)
 (Permute2Uint32x4 ...) => (VPERMI2D128 ...)
 (Permute2Uint32x8 ...) => (VPERMI2D256 ...)
 (Permute2Uint32x16 ...) => (VPERMI2D512 ...)
 (Permute2Uint64x2 ...) => (VPERMI2Q128 ...)
 (Permute2Uint64x4 ...) => (VPERMI2Q256 ...)
 (Permute2Uint64x8 ...) => (VPERMI2Q512 ...)
 (Permute2MaskedFloat32x4 x y z mask) => (VPERMI2PSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
 (Permute2MaskedFloat32x8 x y z mask) => (VPERMI2PSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
 (Permute2MaskedFloat32x16 x y z mask) => (VPERMI2PSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
 (Permute2MaskedFloat64x2 x y z mask) => (VPERMI2PDMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
 (Permute2MaskedFloat64x4 x y z mask) => (VPERMI2PDMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
 (Permute2MaskedFloat64x8 x y z mask) => (VPERMI2PDMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
 (Permute2MaskedInt8x16 x y z mask) => (VPERMI2BMasked128 x y z (VPMOVVec8x16ToM <types.TypeMask> mask))
 (Permute2MaskedInt8x32 x y z mask) => (VPERMI2BMasked256 x y z (VPMOVVec8x32ToM <types.TypeMask> mask))
 (Permute2MaskedInt8x64 x y z mask) => (VPERMI2BMasked512 x y z (VPMOVVec8x64ToM <types.TypeMask> mask))
 (Permute2MaskedInt16x8 x y z mask) => (VPERMI2WMasked128 x y z (VPMOVVec16x8ToM <types.TypeMask> mask))
 (Permute2MaskedInt16x16 x y z mask) => (VPERMI2WMasked256 x y z (VPMOVVec16x16ToM <types.TypeMask> mask))
 (Permute2MaskedInt16x32 x y z mask) => (VPERMI2WMasked512 x y z (VPMOVVec16x32ToM <types.TypeMask> mask))
 (Permute2MaskedInt32x4 x y z mask) => (VPERMI2DMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
 (Permute2MaskedInt32x8 x y z mask) => (VPERMI2DMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
 (Permute2MaskedInt32x16 x y z mask) => (VPERMI2DMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
 (Permute2MaskedInt64x2 x y z mask) => (VPERMI2QMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
 (Permute2MaskedInt64x4 x y z mask) => (VPERMI2QMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
 (Permute2MaskedInt64x8 x y z mask) => (VPERMI2QMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
 (Permute2MaskedUint8x16 x y z mask) => (VPERMI2BMasked128 x y z (VPMOVVec8x16ToM <types.TypeMask> mask))
 (Permute2MaskedUint8x32 x y z mask) => (VPERMI2BMasked256 x y z (VPMOVVec8x32ToM <types.TypeMask> mask))
 (Permute2MaskedUint8x64 x y z mask) => (VPERMI2BMasked512 x y z (VPMOVVec8x64ToM <types.TypeMask> mask))
 (Permute2MaskedUint16x8 x y z mask) => (VPERMI2WMasked128 x y z (VPMOVVec16x8ToM <types.TypeMask> mask))
 (Permute2MaskedUint16x16 x y z mask) => (VPERMI2WMasked256 x y z (VPMOVVec16x16ToM <types.TypeMask> mask))
 (Permute2MaskedUint16x32 x y z mask) => (VPERMI2WMasked512 x y z (VPMOVVec16x32ToM <types.TypeMask> mask))
 (Permute2MaskedUint32x4 x y z mask) => (VPERMI2DMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
 (Permute2MaskedUint32x8 x y z mask) => (VPERMI2DMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
 (Permute2MaskedUint32x16 x y z mask) => (VPERMI2DMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
 (Permute2MaskedUint64x2 x y z mask) => (VPERMI2QMasked128 x y z (VPMOVVec64x2ToM <types.TypeMask> mask))
 (Permute2MaskedUint64x4 x y z mask) => (VPERMI2QMasked256 x y z (VPMOVVec64x4ToM <types.TypeMask> mask))
 (Permute2MaskedUint64x8 x y z mask) => (VPERMI2QMasked512 x y z (VPMOVVec64x8ToM <types.TypeMask> mask))
 (PermuteMaskedFloat32x8 x y mask) => (VPERMPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
 (PermuteMaskedFloat32x16 x y mask) => (VPERMPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
 (PermuteMaskedFloat64x4 x y mask) => (VPERMPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
 (PermuteMaskedFloat64x8 x y mask) => (VPERMPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
 (PermuteMaskedInt8x16 x y mask) => (VPERMBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
 (PermuteMaskedInt8x32 x y mask) => (VPERMBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
 (PermuteMaskedInt8x64 x y mask) => (VPERMBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
 (PermuteMaskedInt16x8 x y mask) => (VPERMWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
 (PermuteMaskedInt16x16 x y mask) => (VPERMWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
 (PermuteMaskedInt16x32 x y mask) => (VPERMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
 (PermuteMaskedInt32x8 x y mask) => (VPERMDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
 (PermuteMaskedInt32x16 x y mask) => (VPERMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
 (PermuteMaskedInt64x4 x y mask) => (VPERMQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
 (PermuteMaskedInt64x8 x y mask) => (VPERMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
 (PermuteMaskedUint8x16 x y mask) => (VPERMBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
 (PermuteMaskedUint8x32 x y mask) => (VPERMBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
 (PermuteMaskedUint8x64 x y mask) => (VPERMBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
 (PermuteMaskedUint16x8 x y mask) => (VPERMWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
 (PermuteMaskedUint16x16 x y mask) => (VPERMWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
 (PermuteMaskedUint16x32 x y mask) => (VPERMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
 (PermuteMaskedUint32x8 x y mask) => (VPERMDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
 (PermuteMaskedUint32x16 x y mask) => (VPERMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
 (PermuteMaskedUint64x4 x y mask) => (VPERMQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
 (PermuteMaskedUint64x8 x y mask) => (VPERMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
 (PopCountInt8x16 ...) => (VPOPCNTB128 ...)
 (PopCountInt8x32 ...) => (VPOPCNTB256 ...)
 (PopCountInt8x64 ...) => (VPOPCNTB512 ...)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@ -613,6 +613,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUWMasked256", argLength: 3, reg: w2kw, asm: "VPMINUW", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMULHUW256", argLength: 2, reg: v21, asm: "VPMULHUW", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMULHUWMasked256", argLength: 3, reg: w2kw, asm: "VPMULHUW", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPERMW256", argLength: 2, reg: w21, asm: "VPERMW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPERMI2W256", argLength: 3, reg: w31, asm: "VPERMI2W", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPERMI2WMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2W", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPERMWMasked256", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLW256", argLength: 2, reg: vfpv, asm: "VPSRLW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLWMasked256", argLength: 3, reg: wfpkw, asm: "VPSRLW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLVW256", argLength: 2, reg: w21, asm: "VPSRLVW", commutative: false, typ: "Vec256", resultInArg0: false},
@ -625,6 +629,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUWMasked512", argLength: 3, reg: w2kw, asm: "VPMINUW", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMULHUW512", argLength: 2, reg: w21, asm: "VPMULHUW", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMULHUWMasked512", argLength: 3, reg: w2kw, asm: "VPMULHUW", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPERMW512", argLength: 2, reg: w21, asm: "VPERMW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPERMI2W512", argLength: 3, reg: w31, asm: "VPERMI2W", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VPERMI2WMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2W", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VPERMWMasked512", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLW512", argLength: 2, reg: wfpw, asm: "VPSRLW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLWMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLVW512", argLength: 2, reg: w21, asm: "VPSRLVW", commutative: false, typ: "Vec512", resultInArg0: false},
@ -637,6 +645,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUWMasked128", argLength: 3, reg: w2kw, asm: "VPMINUW", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMULHUW128", argLength: 2, reg: v21, asm: "VPMULHUW", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMULHUWMasked128", argLength: 3, reg: w2kw, asm: "VPMULHUW", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPERMW128", argLength: 2, reg: w21, asm: "VPERMW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPERMI2W128", argLength: 3, reg: w31, asm: "VPERMI2W", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPERMI2WMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2W", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPERMWMasked128", argLength: 3, reg: w2kw, asm: "VPERMW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLW128", argLength: 2, reg: vfpv, asm: "VPSRLW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLWMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLVW128", argLength: 2, reg: w21, asm: "VPSRLVW", commutative: false, typ: "Vec128", resultInArg0: false},
@ -645,6 +657,14 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMAXUDMasked512", argLength: 3, reg: w2kw, asm: "VPMAXUD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMINUD512", argLength: 2, reg: w21, asm: "VPMINUD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMINUDMasked512", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPERMPS512", argLength: 2, reg: w21, asm: "VPERMPS", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPERMD512", argLength: 2, reg: w21, asm: "VPERMD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPERMI2D512", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VPERMI2PS512", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VPERMI2DMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VPERMI2PSMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VPERMPSMasked512", argLength: 3, reg: w2kw, asm: "VPERMPS", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPERMDMasked512", argLength: 3, reg: w2kw, asm: "VPERMD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLD512", argLength: 2, reg: wfpw, asm: "VPSRLD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLDMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLVD512", argLength: 2, reg: w21, asm: "VPSRLVD", commutative: false, typ: "Vec512", resultInArg0: false},
@ -654,6 +674,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUD128", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMINUDMasked128", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMULUDQ128", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPERMI2D128", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPERMI2PS128", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPERMI2PSMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPERMI2DMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPSRLD128", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLDMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLVD128", argLength: 2, reg: v21, asm: "VPSRLVD", commutative: false, typ: "Vec128", resultInArg0: false},
@ -663,6 +687,14 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUD256", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMINUDMasked256", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMULUDQ256", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPERMPS256", argLength: 2, reg: v21, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPERMI2D256", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPERMI2PS256", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPERMI2PSMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPERMI2DMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPERMPSMasked256", argLength: 3, reg: w2kw, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPERMDMasked256", argLength: 3, reg: w2kw, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLD256", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLDMasked256", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLVD256", argLength: 2, reg: v21, asm: "VPSRLVD", commutative: false, typ: "Vec256", resultInArg0: false},
@ -672,6 +704,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUQ128", argLength: 2, reg: w21, asm: "VPMINUQ", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMINUQMasked128", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMULUDQMasked128", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPERMI2PD128", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPERMI2Q128", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPERMI2QMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPERMI2PDMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPSRLQ128", argLength: 2, reg: vfpv, asm: "VPSRLQ", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLQMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSRLVQ128", argLength: 2, reg: v21, asm: "VPSRLVQ", commutative: false, typ: "Vec128", resultInArg0: false},
@ -681,6 +717,14 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUQ256", argLength: 2, reg: w21, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMINUQMasked256", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMULUDQMasked256", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPERMQ256", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPERMPD256", argLength: 2, reg: w21, asm: "VPERMPD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPERMI2PD256", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPERMI2Q256", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPERMI2PDMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPERMI2QMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPERMPDMasked256", argLength: 3, reg: w2kw, asm: "VPERMPD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPERMQMasked256", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLQ256", argLength: 2, reg: vfpv, asm: "VPSRLQ", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLQMasked256", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSRLVQ256", argLength: 2, reg: v21, asm: "VPSRLVQ", commutative: false, typ: "Vec256", resultInArg0: false},
@ -691,6 +735,14 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMINUQMasked512", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMULUDQ512", argLength: 2, reg: w21, asm: "VPMULUDQ", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMULUDQMasked512", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPERMPD512", argLength: 2, reg: w21, asm: "VPERMPD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPERMQ512", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPERMI2Q512", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VPERMI2PD512", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VPERMI2QMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VPERMI2PDMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VPERMPDMasked512", argLength: 3, reg: w2kw, asm: "VPERMPD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPERMQMasked512", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLQ512", argLength: 2, reg: wfpw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLQMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPSRLVQ512", argLength: 2, reg: w21, asm: "VPSRLVQ", commutative: false, typ: "Vec512", resultInArg0: false},
@ -703,6 +755,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMAXUBMasked128", argLength: 3, reg: w2kw, asm: "VPMAXUB", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMINUB128", argLength: 2, reg: v21, asm: "VPMINUB", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMINUBMasked128", argLength: 3, reg: w2kw, asm: "VPMINUB", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPERMI2B128", argLength: 3, reg: w31, asm: "VPERMI2B", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPERMI2BMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2B", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMADDUBSW128", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMADDUBSWMasked128", argLength: 3, reg: w2kw, asm: "VPMADDUBSW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPAVGB256", argLength: 2, reg: v21, asm: "VPAVGB", commutative: true, typ: "Vec256", resultInArg0: false},
@ -713,6 +769,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMAXUBMasked256", argLength: 3, reg: w2kw, asm: "VPMAXUB", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMINUB256", argLength: 2, reg: v21, asm: "VPMINUB", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMINUBMasked256", argLength: 3, reg: w2kw, asm: "VPMINUB", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPERMI2B256", argLength: 3, reg: w31, asm: "VPERMI2B", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPERMI2BMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2B", commutative: false, typ: "Vec256", resultInArg0: true},
 		{name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPMADDUBSW256", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPMADDUBSWMasked256", argLength: 3, reg: w2kw, asm: "VPMADDUBSW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPAVGB512", argLength: 2, reg: w21, asm: "VPAVGB", commutative: true, typ: "Vec512", resultInArg0: false},
@ -723,6 +783,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPMAXUBMasked512", argLength: 3, reg: w2kw, asm: "VPMAXUB", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMINUB512", argLength: 2, reg: w21, asm: "VPMINUB", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMINUBMasked512", argLength: 3, reg: w2kw, asm: "VPMINUB", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPERMI2B512", argLength: 3, reg: w31, asm: "VPERMI2B", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VPERMI2BMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2B", commutative: false, typ: "Vec512", resultInArg0: true},
 		{name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPMADDUBSW512", argLength: 2, reg: w21, asm: "VPMADDUBSW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPMADDUBSWMasked512", argLength: 3, reg: w2kw, asm: "VPMADDUBSW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VRNDSCALEPS512", argLength: 1, reg: w11, asm: "VRNDSCALEPS", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@ -889,6 +889,14 @@ func simdGenericOps() []opData {
 		{name: "OrUint16x16", argLength: 2, commutative: true},
 		{name: "PairwiseAddUint16x16", argLength: 2, commutative: false},
 		{name: "PairwiseSubUint16x16", argLength: 2, commutative: false},
 		{name: "PermuteInt16x16", argLength: 2, commutative: false},
 		{name: "PermuteUint16x16", argLength: 2, commutative: false},
 		{name: "Permute2Uint16x16", argLength: 3, commutative: false},
 		{name: "Permute2Int16x16", argLength: 3, commutative: false},
 		{name: "Permute2MaskedUint16x16", argLength: 4, commutative: false},
 		{name: "Permute2MaskedInt16x16", argLength: 4, commutative: false},
 		{name: "PermuteMaskedUint16x16", argLength: 3, commutative: false},
 		{name: "PermuteMaskedInt16x16", argLength: 3, commutative: false},
 		{name: "PopCountUint16x16", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint16x16", argLength: 2, commutative: false},
 		{name: "SaturatedAddUint16x16", argLength: 2, commutative: true},
@ -932,6 +940,14 @@ func simdGenericOps() []opData {
 		{name: "MulHighMaskedUint16x32", argLength: 3, commutative: true},
 		{name: "NotEqualUint16x32", argLength: 2, commutative: true},
 		{name: "NotEqualMaskedUint16x32", argLength: 3, commutative: true},
 		{name: "PermuteUint16x32", argLength: 2, commutative: false},
 		{name: "PermuteInt16x32", argLength: 2, commutative: false},
 		{name: "Permute2Int16x32", argLength: 3, commutative: false},
 		{name: "Permute2Uint16x32", argLength: 3, commutative: false},
 		{name: "Permute2MaskedUint16x32", argLength: 4, commutative: false},
 		{name: "Permute2MaskedInt16x32", argLength: 4, commutative: false},
 		{name: "PermuteMaskedUint16x32", argLength: 3, commutative: false},
 		{name: "PermuteMaskedInt16x32", argLength: 3, commutative: false},
 		{name: "PopCountUint16x32", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint16x32", argLength: 2, commutative: false},
 		{name: "SaturatedAddUint16x32", argLength: 2, commutative: true},
@ -979,6 +995,14 @@ func simdGenericOps() []opData {
 		{name: "OrUint16x8", argLength: 2, commutative: true},
 		{name: "PairwiseAddUint16x8", argLength: 2, commutative: false},
 		{name: "PairwiseSubUint16x8", argLength: 2, commutative: false},
 		{name: "PermuteUint16x8", argLength: 2, commutative: false},
 		{name: "PermuteInt16x8", argLength: 2, commutative: false},
 		{name: "Permute2Int16x8", argLength: 3, commutative: false},
 		{name: "Permute2Uint16x8", argLength: 3, commutative: false},
 		{name: "Permute2MaskedUint16x8", argLength: 4, commutative: false},
 		{name: "Permute2MaskedInt16x8", argLength: 4, commutative: false},
 		{name: "PermuteMaskedInt16x8", argLength: 3, commutative: false},
 		{name: "PermuteMaskedUint16x8", argLength: 3, commutative: false},
 		{name: "PopCountUint16x8", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint16x8", argLength: 2, commutative: false},
 		{name: "SaturatedAddUint16x8", argLength: 2, commutative: true},
@ -1024,6 +1048,18 @@ func simdGenericOps() []opData {
 		{name: "NotEqualMaskedUint32x16", argLength: 3, commutative: true},
 		{name: "OrUint32x16", argLength: 2, commutative: true},
 		{name: "OrMaskedUint32x16", argLength: 3, commutative: true},
 		{name: "PermuteInt32x16", argLength: 2, commutative: false},
 		{name: "PermuteUint32x16", argLength: 2, commutative: false},
 		{name: "PermuteFloat32x16", argLength: 2, commutative: false},
 		{name: "Permute2Int32x16", argLength: 3, commutative: false},
 		{name: "Permute2Uint32x16", argLength: 3, commutative: false},
 		{name: "Permute2Float32x16", argLength: 3, commutative: false},
 		{name: "Permute2MaskedUint32x16", argLength: 4, commutative: false},
 		{name: "Permute2MaskedInt32x16", argLength: 4, commutative: false},
 		{name: "Permute2MaskedFloat32x16", argLength: 4, commutative: false},
 		{name: "PermuteMaskedUint32x16", argLength: 3, commutative: false},
 		{name: "PermuteMaskedInt32x16", argLength: 3, commutative: false},
 		{name: "PermuteMaskedFloat32x16", argLength: 3, commutative: false},
 		{name: "PopCountUint32x16", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint32x16", argLength: 2, commutative: false},
 		{name: "RotateLeftUint32x16", argLength: 2, commutative: false},
@ -1077,6 +1113,12 @@ func simdGenericOps() []opData {
 		{name: "OrMaskedUint32x4", argLength: 3, commutative: true},
 		{name: "PairwiseAddUint32x4", argLength: 2, commutative: false},
 		{name: "PairwiseSubUint32x4", argLength: 2, commutative: false},
 		{name: "Permute2Uint32x4", argLength: 3, commutative: false},
 		{name: "Permute2Float32x4", argLength: 3, commutative: false},
 		{name: "Permute2Int32x4", argLength: 3, commutative: false},
 		{name: "Permute2MaskedUint32x4", argLength: 4, commutative: false},
 		{name: "Permute2MaskedInt32x4", argLength: 4, commutative: false},
 		{name: "Permute2MaskedFloat32x4", argLength: 4, commutative: false},
 		{name: "PopCountUint32x4", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint32x4", argLength: 2, commutative: false},
 		{name: "RotateLeftUint32x4", argLength: 2, commutative: false},
@ -1130,6 +1172,18 @@ func simdGenericOps() []opData {
 		{name: "OrMaskedUint32x8", argLength: 3, commutative: true},
 		{name: "PairwiseAddUint32x8", argLength: 2, commutative: false},
 		{name: "PairwiseSubUint32x8", argLength: 2, commutative: false},
 		{name: "PermuteInt32x8", argLength: 2, commutative: false},
 		{name: "PermuteFloat32x8", argLength: 2, commutative: false},
 		{name: "PermuteUint32x8", argLength: 2, commutative: false},
 		{name: "Permute2Uint32x8", argLength: 3, commutative: false},
 		{name: "Permute2Float32x8", argLength: 3, commutative: false},
 		{name: "Permute2Int32x8", argLength: 3, commutative: false},
 		{name: "Permute2MaskedFloat32x8", argLength: 4, commutative: false},
 		{name: "Permute2MaskedUint32x8", argLength: 4, commutative: false},
 		{name: "Permute2MaskedInt32x8", argLength: 4, commutative: false},
 		{name: "PermuteMaskedInt32x8", argLength: 3, commutative: false},
 		{name: "PermuteMaskedFloat32x8", argLength: 3, commutative: false},
 		{name: "PermuteMaskedUint32x8", argLength: 3, commutative: false},
 		{name: "PopCountUint32x8", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint32x8", argLength: 2, commutative: false},
 		{name: "RotateLeftUint32x8", argLength: 2, commutative: false},
@ -1182,6 +1236,12 @@ func simdGenericOps() []opData {
 		{name: "NotEqualMaskedUint64x2", argLength: 3, commutative: true},
 		{name: "OrUint64x2", argLength: 2, commutative: true},
 		{name: "OrMaskedUint64x2", argLength: 3, commutative: true},
 		{name: "Permute2Uint64x2", argLength: 3, commutative: false},
 		{name: "Permute2Int64x2", argLength: 3, commutative: false},
 		{name: "Permute2Float64x2", argLength: 3, commutative: false},
 		{name: "Permute2MaskedUint64x2", argLength: 4, commutative: false},
 		{name: "Permute2MaskedInt64x2", argLength: 4, commutative: false},
 		{name: "Permute2MaskedFloat64x2", argLength: 4, commutative: false},
 		{name: "PopCountUint64x2", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint64x2", argLength: 2, commutative: false},
 		{name: "RotateLeftUint64x2", argLength: 2, commutative: false},
@ -1230,6 +1290,18 @@ func simdGenericOps() []opData {
 		{name: "NotEqualMaskedUint64x4", argLength: 3, commutative: true},
 		{name: "OrUint64x4", argLength: 2, commutative: true},
 		{name: "OrMaskedUint64x4", argLength: 3, commutative: true},
 		{name: "PermuteUint64x4", argLength: 2, commutative: false},
 		{name: "PermuteInt64x4", argLength: 2, commutative: false},
 		{name: "PermuteFloat64x4", argLength: 2, commutative: false},
 		{name: "Permute2Uint64x4", argLength: 3, commutative: false},
 		{name: "Permute2Int64x4", argLength: 3, commutative: false},
 		{name: "Permute2Float64x4", argLength: 3, commutative: false},
 		{name: "Permute2MaskedInt64x4", argLength: 4, commutative: false},
 		{name: "Permute2MaskedUint64x4", argLength: 4, commutative: false},
 		{name: "Permute2MaskedFloat64x4", argLength: 4, commutative: false},
 		{name: "PermuteMaskedFloat64x4", argLength: 3, commutative: false},
 		{name: "PermuteMaskedInt64x4", argLength: 3, commutative: false},
 		{name: "PermuteMaskedUint64x4", argLength: 3, commutative: false},
 		{name: "PopCountUint64x4", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint64x4", argLength: 2, commutative: false},
 		{name: "RotateLeftUint64x4", argLength: 2, commutative: false},
@ -1278,6 +1350,18 @@ func simdGenericOps() []opData {
 		{name: "NotEqualMaskedUint64x8", argLength: 3, commutative: true},
 		{name: "OrUint64x8", argLength: 2, commutative: true},
 		{name: "OrMaskedUint64x8", argLength: 3, commutative: true},
 		{name: "PermuteUint64x8", argLength: 2, commutative: false},
 		{name: "PermuteInt64x8", argLength: 2, commutative: false},
 		{name: "PermuteFloat64x8", argLength: 2, commutative: false},
 		{name: "Permute2Int64x8", argLength: 3, commutative: false},
 		{name: "Permute2Uint64x8", argLength: 3, commutative: false},
 		{name: "Permute2Float64x8", argLength: 3, commutative: false},
 		{name: "Permute2MaskedUint64x8", argLength: 4, commutative: false},
 		{name: "Permute2MaskedInt64x8", argLength: 4, commutative: false},
 		{name: "Permute2MaskedFloat64x8", argLength: 4, commutative: false},
 		{name: "PermuteMaskedFloat64x8", argLength: 3, commutative: false},
 		{name: "PermuteMaskedInt64x8", argLength: 3, commutative: false},
 		{name: "PermuteMaskedUint64x8", argLength: 3, commutative: false},
 		{name: "PopCountUint64x8", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint64x8", argLength: 2, commutative: false},
 		{name: "RotateLeftUint64x8", argLength: 2, commutative: false},
@ -1325,6 +1409,14 @@ func simdGenericOps() []opData {
 		{name: "NotEqualUint8x16", argLength: 2, commutative: true},
 		{name: "NotEqualMaskedUint8x16", argLength: 3, commutative: true},
 		{name: "OrUint8x16", argLength: 2, commutative: true},
 		{name: "PermuteUint8x16", argLength: 2, commutative: false},
 		{name: "PermuteInt8x16", argLength: 2, commutative: false},
 		{name: "Permute2Uint8x16", argLength: 3, commutative: false},
 		{name: "Permute2Int8x16", argLength: 3, commutative: false},
 		{name: "Permute2MaskedInt8x16", argLength: 4, commutative: false},
 		{name: "Permute2MaskedUint8x16", argLength: 4, commutative: false},
 		{name: "PermuteMaskedInt8x16", argLength: 3, commutative: false},
 		{name: "PermuteMaskedUint8x16", argLength: 3, commutative: false},
 		{name: "PopCountUint8x16", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint8x16", argLength: 2, commutative: false},
 		{name: "SaturatedAddUint8x16", argLength: 2, commutative: true},
@ -1361,6 +1453,14 @@ func simdGenericOps() []opData {
 		{name: "NotEqualUint8x32", argLength: 2, commutative: true},
 		{name: "NotEqualMaskedUint8x32", argLength: 3, commutative: true},
 		{name: "OrUint8x32", argLength: 2, commutative: true},
 		{name: "PermuteUint8x32", argLength: 2, commutative: false},
 		{name: "PermuteInt8x32", argLength: 2, commutative: false},
 		{name: "Permute2Int8x32", argLength: 3, commutative: false},
 		{name: "Permute2Uint8x32", argLength: 3, commutative: false},
 		{name: "Permute2MaskedUint8x32", argLength: 4, commutative: false},
 		{name: "Permute2MaskedInt8x32", argLength: 4, commutative: false},
 		{name: "PermuteMaskedUint8x32", argLength: 3, commutative: false},
 		{name: "PermuteMaskedInt8x32", argLength: 3, commutative: false},
 		{name: "PopCountUint8x32", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint8x32", argLength: 2, commutative: false},
 		{name: "SaturatedAddUint8x32", argLength: 2, commutative: true},
@ -1394,6 +1494,14 @@ func simdGenericOps() []opData {
 		{name: "MinMaskedUint8x64", argLength: 3, commutative: true},
 		{name: "NotEqualUint8x64", argLength: 2, commutative: true},
 		{name: "NotEqualMaskedUint8x64", argLength: 3, commutative: true},
 		{name: "PermuteUint8x64", argLength: 2, commutative: false},
 		{name: "PermuteInt8x64", argLength: 2, commutative: false},
 		{name: "Permute2Int8x64", argLength: 3, commutative: false},
 		{name: "Permute2Uint8x64", argLength: 3, commutative: false},
 		{name: "Permute2MaskedUint8x64", argLength: 4, commutative: false},
 		{name: "Permute2MaskedInt8x64", argLength: 4, commutative: false},
 		{name: "PermuteMaskedInt8x64", argLength: 3, commutative: false},
 		{name: "PermuteMaskedUint8x64", argLength: 3, commutative: false},
 		{name: "PopCountUint8x64", argLength: 1, commutative: false},
 		{name: "PopCountMaskedUint8x64", argLength: 2, commutative: false},
 		{name: "SaturatedAddUint8x64", argLength: 2, commutative: true},
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
--- a/src/cmd/compile/internal/ssagen/intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/intrinsics.go
@ -1622,18 +1622,42 @@ func opLen2(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa
 	}
 }
 // opLen2_21 returns a builder closure for a two-argument operation that
 // hands the call's arguments to op in swapped order: args[1] becomes the
 // first SSA argument and args[0] the second. t is passed through to
 // s.newValue2 as the type of the new value.
 func opLen2_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 	build := func(s *state, _ *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 		first, second := args[1], args[0]
 		return s.newValue2(op, t, first, second)
 	}
 	return build
 }
 // opLen3 returns a builder closure for a three-argument operation. The
 // call's arguments are forwarded to op unchanged and in order
 // (args[0], args[1], args[2]); t is passed to s.newValue3 as the type of
 // the new value.
 func opLen3(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 	build := func(s *state, _ *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 		a0, a1, a2 := args[0], args[1], args[2]
 		return s.newValue3(op, t, a0, a1, a2)
 	}
 	return build
 }
 // opLen3_21 returns a builder closure for a three-argument operation in
 // which the first two call arguments are exchanged before being handed to
 // op: the SSA argument order is args[1], args[0], args[2]. t is passed to
 // s.newValue3 as the type of the new value.
 func opLen3_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 	build := func(s *state, _ *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 		first, second, third := args[1], args[0], args[2]
 		return s.newValue3(op, t, first, second, third)
 	}
 	return build
 }
 // opLen3_231 returns a builder closure for a three-argument operation
 // whose SSA argument order is args[2], args[0], args[1]. Put differently,
 // call arguments 1, 2 and 3 land in SSA positions 2, 3 and 1, which is
 // presumably what the "_231" suffix encodes. t is passed to s.newValue3
 // as the type of the new value.
 func opLen3_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 	build := func(s *state, _ *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 		first, second, third := args[2], args[0], args[1]
 		return s.newValue3(op, t, first, second, third)
 	}
 	return build
 }
 // opLen4 returns a builder closure for a four-argument operation. The
 // call's arguments are forwarded to op unchanged and in order
 // (args[0] through args[3]); t is passed to s.newValue4 as the type of
 // the new value.
 func opLen4(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 	build := func(s *state, _ *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 		a0, a1, a2, a3 := args[0], args[1], args[2], args[3]
 		return s.newValue4(op, t, a0, a1, a2, a3)
 	}
 	return build
 }
 // opLen4_231 returns a builder closure for a four-argument operation
 // whose SSA argument order is args[2], args[0], args[1], args[3]: the
 // first three call arguments are rotated (call arguments 1, 2 and 3 land
 // in SSA positions 2, 3 and 1) and the fourth stays last. t is passed to
 // s.newValue4 as the type of the new value.
 func opLen4_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 	build := func(s *state, _ *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 		first, second, third, fourth := args[2], args[0], args[1], args[3]
 		return s.newValue4(op, t, first, second, third, fourth)
 	}
 	return build
 }
 func plainPanicSimdImm(s *state) {
 	cmp := s.newValue0(ssa.OpConstBool, types.Types[types.TBOOL])
 	cmp.AuxInt = 0
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@ -996,6 +996,114 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint16x16.PairwiseSub", opLen2(ssa.OpPairwiseSubUint16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint8x64.Permute", opLen2_21(ssa.OpPermuteUint8x64, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int16x8.Permute", opLen2_21(ssa.OpPermuteInt16x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint16x8.Permute", opLen2_21(ssa.OpPermuteUint16x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int16x16.Permute", opLen2_21(ssa.OpPermuteInt16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint16x16.Permute", opLen2_21(ssa.OpPermuteUint16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int16x32.Permute", opLen2_21(ssa.OpPermuteInt16x32, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint16x32.Permute", opLen2_21(ssa.OpPermuteUint16x32, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float32x8.Permute", opLen2_21(ssa.OpPermuteFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int32x8.Permute", opLen2_21(ssa.OpPermuteInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x8.Permute", opLen2_21(ssa.OpPermuteUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float32x16.Permute", opLen2_21(ssa.OpPermuteFloat32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int32x16.Permute", opLen2_21(ssa.OpPermuteInt32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint32x16.Permute", opLen2_21(ssa.OpPermuteUint32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float64x4.Permute", opLen2_21(ssa.OpPermuteFloat64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int64x4.Permute", opLen2_21(ssa.OpPermuteInt64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x4.Permute", opLen2_21(ssa.OpPermuteUint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x8.Permute", opLen2_21(ssa.OpPermuteFloat64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int64x8.Permute", opLen2_21(ssa.OpPermuteInt64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint64x8.Permute", opLen2_21(ssa.OpPermuteUint64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int8x16.Permute2", opLen3_231(ssa.OpPermute2Int8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint8x16.Permute2", opLen3_231(ssa.OpPermute2Uint8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.Permute2", opLen3_231(ssa.OpPermute2Int8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint8x32.Permute2", opLen3_231(ssa.OpPermute2Uint8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x64.Permute2", opLen3_231(ssa.OpPermute2Int8x64, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint8x64.Permute2", opLen3_231(ssa.OpPermute2Uint8x64, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int16x8.Permute2", opLen3_231(ssa.OpPermute2Int16x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint16x8.Permute2", opLen3_231(ssa.OpPermute2Uint16x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int16x16.Permute2", opLen3_231(ssa.OpPermute2Int16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint16x16.Permute2", opLen3_231(ssa.OpPermute2Uint16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int16x32.Permute2", opLen3_231(ssa.OpPermute2Int16x32, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint16x32.Permute2", opLen3_231(ssa.OpPermute2Uint16x32, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float32x4.Permute2", opLen3_231(ssa.OpPermute2Float32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int32x4.Permute2", opLen3_231(ssa.OpPermute2Int32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint32x4.Permute2", opLen3_231(ssa.OpPermute2Uint32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float32x8.Permute2", opLen3_231(ssa.OpPermute2Float32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int32x8.Permute2", opLen3_231(ssa.OpPermute2Int32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x8.Permute2", opLen3_231(ssa.OpPermute2Uint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float32x16.Permute2", opLen3_231(ssa.OpPermute2Float32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int32x16.Permute2", opLen3_231(ssa.OpPermute2Int32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint32x16.Permute2", opLen3_231(ssa.OpPermute2Uint32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float64x2.Permute2", opLen3_231(ssa.OpPermute2Float64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int64x2.Permute2", opLen3_231(ssa.OpPermute2Int64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint64x2.Permute2", opLen3_231(ssa.OpPermute2Uint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float64x4.Permute2", opLen3_231(ssa.OpPermute2Float64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int64x4.Permute2", opLen3_231(ssa.OpPermute2Int64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x4.Permute2", opLen3_231(ssa.OpPermute2Uint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int8x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint8x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint8x32.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x64.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt8x64, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint8x64.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint8x64, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int16x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt16x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint16x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint16x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int16x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint16x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int16x32.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt16x32, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint16x32.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint16x32, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float32x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int32x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint32x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float32x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int32x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float32x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int32x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint32x16.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float64x2.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int64x2.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint64x2.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float64x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int64x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x4.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedFloat64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int64x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedInt64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint64x8.Permute2Masked", opLen4_231(ssa.OpPermute2MaskedUint64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int8x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint8x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint8x32.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x64.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt8x64, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint8x64.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint8x64, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int16x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt16x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint16x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint16x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int16x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint16x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int16x32.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt16x32, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint16x32.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint16x32, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float32x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int32x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float32x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedFloat32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int32x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint32x16.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float64x4.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedFloat64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int64x4.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x4.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedFloat64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int64x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedInt64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint64x8.PermuteMasked", opLen3_21(ssa.OpPermuteMaskedUint64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int8x16.PopCount", opLen1(ssa.OpPopCountInt8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.PopCount", opLen1(ssa.OpPopCountInt8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x64.PopCount", opLen1(ssa.OpPopCountInt8x64, types.TypeVec512), sys.AMD64)
--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
@ -5391,6 +5391,830 @@ func (x Uint32x4) PairwiseSub(y Uint32x4) Uint32x4
 // Asm: VPHSUBD, CPU Feature: AVX2
 func (x Uint32x8) PairwiseSub(y Uint32x8) Uint32x8
 /* Permute */
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Int8x16) Permute(indices Uint8x16) Int8x16
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Uint8x16) Permute(indices Uint8x16) Uint8x16
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Int8x32) Permute(indices Uint8x32) Int8x32
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Uint8x32) Permute(indices Uint8x32) Uint8x32
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Int8x64) Permute(indices Uint8x64) Int8x64
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Uint8x64) Permute(indices Uint8x64) Uint8x64
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Int16x8) Permute(indices Uint16x8) Int16x8
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Uint16x8) Permute(indices Uint16x8) Uint16x8
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Int16x16) Permute(indices Uint16x16) Int16x16
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Uint16x16) Permute(indices Uint16x16) Uint16x16
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Int16x32) Permute(indices Uint16x32) Int16x32
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Uint16x32) Permute(indices Uint16x32) Uint16x32
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMPS, CPU Feature: AVX2
 func (x Float32x8) Permute(indices Uint32x8) Float32x8
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMD, CPU Feature: AVX2
 func (x Int32x8) Permute(indices Uint32x8) Int32x8
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMD, CPU Feature: AVX2
 func (x Uint32x8) Permute(indices Uint32x8) Uint32x8
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMPS, CPU Feature: AVX512F
 func (x Float32x16) Permute(indices Uint32x16) Float32x16
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMD, CPU Feature: AVX512F
 func (x Int32x16) Permute(indices Uint32x16) Int32x16
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMD, CPU Feature: AVX512F
 func (x Uint32x16) Permute(indices Uint32x16) Uint32x16
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMPD, CPU Feature: AVX512F
 func (x Float64x4) Permute(indices Uint64x4) Float64x4
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMQ, CPU Feature: AVX512F
 func (x Int64x4) Permute(indices Uint64x4) Int64x4
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMQ, CPU Feature: AVX512F
 func (x Uint64x4) Permute(indices Uint64x4) Uint64x4
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMPD, CPU Feature: AVX512F
 func (x Float64x8) Permute(indices Uint64x8) Float64x8
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMQ, CPU Feature: AVX512F
 func (x Int64x8) Permute(indices Uint64x8) Int64x8
 // Permute performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMQ, CPU Feature: AVX512F
 func (x Uint64x8) Permute(indices Uint64x8) Uint64x8
 /* Permute2 */
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2B, CPU Feature: AVX512VBMI
 func (x Int8x16) Permute2(y Int8x16, indices Uint8x16) Int8x16
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2B, CPU Feature: AVX512VBMI
 func (x Uint8x16) Permute2(y Uint8x16, indices Uint8x16) Uint8x16
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2B, CPU Feature: AVX512VBMI
 func (x Int8x32) Permute2(y Int8x32, indices Uint8x32) Int8x32
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2B, CPU Feature: AVX512VBMI
 func (x Uint8x32) Permute2(y Uint8x32, indices Uint8x32) Uint8x32
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2B, CPU Feature: AVX512VBMI
 func (x Int8x64) Permute2(y Int8x64, indices Uint8x64) Int8x64
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2B, CPU Feature: AVX512VBMI
 func (x Uint8x64) Permute2(y Uint8x64, indices Uint8x64) Uint8x64
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2W, CPU Feature: AVX512BW
 func (x Int16x8) Permute2(y Int16x8, indices Uint16x8) Int16x8
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2W, CPU Feature: AVX512BW
 func (x Uint16x8) Permute2(y Uint16x8, indices Uint16x8) Uint16x8
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2W, CPU Feature: AVX512BW
 func (x Int16x16) Permute2(y Int16x16, indices Uint16x16) Int16x16
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2W, CPU Feature: AVX512BW
 func (x Uint16x16) Permute2(y Uint16x16, indices Uint16x16) Uint16x16
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2W, CPU Feature: AVX512BW
 func (x Int16x32) Permute2(y Int16x32, indices Uint16x32) Int16x32
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2W, CPU Feature: AVX512BW
 func (x Uint16x32) Permute2(y Uint16x32, indices Uint16x32) Uint16x32
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2PS, CPU Feature: AVX512F
 func (x Float32x4) Permute2(y Float32x4, indices Uint32x4) Float32x4
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2D, CPU Feature: AVX512F
 func (x Int32x4) Permute2(y Int32x4, indices Uint32x4) Int32x4
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2D, CPU Feature: AVX512F
 func (x Uint32x4) Permute2(y Uint32x4, indices Uint32x4) Uint32x4
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2PS, CPU Feature: AVX512F
 func (x Float32x8) Permute2(y Float32x8, indices Uint32x8) Float32x8
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2D, CPU Feature: AVX512F
 func (x Int32x8) Permute2(y Int32x8, indices Uint32x8) Int32x8
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2D, CPU Feature: AVX512F
 func (x Uint32x8) Permute2(y Uint32x8, indices Uint32x8) Uint32x8
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2PS, CPU Feature: AVX512F
 func (x Float32x16) Permute2(y Float32x16, indices Uint32x16) Float32x16
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2D, CPU Feature: AVX512F
 func (x Int32x16) Permute2(y Int32x16, indices Uint32x16) Int32x16
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2D, CPU Feature: AVX512F
 func (x Uint32x16) Permute2(y Uint32x16, indices Uint32x16) Uint32x16
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2PD, CPU Feature: AVX512F
 func (x Float64x2) Permute2(y Float64x2, indices Uint64x2) Float64x2
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2Q, CPU Feature: AVX512F
 func (x Int64x2) Permute2(y Int64x2, indices Uint64x2) Int64x2
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2Q, CPU Feature: AVX512F
 func (x Uint64x2) Permute2(y Uint64x2, indices Uint64x2) Uint64x2
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2PD, CPU Feature: AVX512F
 func (x Float64x4) Permute2(y Float64x4, indices Uint64x4) Float64x4
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2Q, CPU Feature: AVX512F
 func (x Int64x4) Permute2(y Int64x4, indices Uint64x4) Int64x4
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2Q, CPU Feature: AVX512F
 func (x Uint64x4) Permute2(y Uint64x4, indices Uint64x4) Uint64x4
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2PD, CPU Feature: AVX512F
 func (x Float64x8) Permute2(y Float64x8, indices Uint64x8) Float64x8
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2Q, CPU Feature: AVX512F
 func (x Int64x8) Permute2(y Int64x8, indices Uint64x8) Int64x8
 // Permute2 performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2Q, CPU Feature: AVX512F
 func (x Uint64x8) Permute2(y Uint64x8, indices Uint64x8) Uint64x8
 /* Permute2Masked */
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2B, CPU Feature: AVX512VBMI
 func (x Int8x16) Permute2Masked(y Int8x16, indices Uint8x16, u Mask8x16) Int8x16
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2B, CPU Feature: AVX512VBMI
 func (x Uint8x16) Permute2Masked(y Uint8x16, indices Uint8x16, u Mask8x16) Uint8x16
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2B, CPU Feature: AVX512VBMI
 func (x Int8x32) Permute2Masked(y Int8x32, indices Uint8x32, u Mask8x32) Int8x32
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2B, CPU Feature: AVX512VBMI
 func (x Uint8x32) Permute2Masked(y Uint8x32, indices Uint8x32, u Mask8x32) Uint8x32
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2B, CPU Feature: AVX512VBMI
 func (x Int8x64) Permute2Masked(y Int8x64, indices Uint8x64, u Mask8x64) Int8x64
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2B, CPU Feature: AVX512VBMI
 func (x Uint8x64) Permute2Masked(y Uint8x64, indices Uint8x64, u Mask8x64) Uint8x64
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2W, CPU Feature: AVX512BW
 func (x Int16x8) Permute2Masked(y Int16x8, indices Uint16x8, u Mask16x8) Int16x8
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2W, CPU Feature: AVX512BW
 func (x Uint16x8) Permute2Masked(y Uint16x8, indices Uint16x8, u Mask16x8) Uint16x8
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2W, CPU Feature: AVX512BW
 func (x Int16x16) Permute2Masked(y Int16x16, indices Uint16x16, u Mask16x16) Int16x16
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2W, CPU Feature: AVX512BW
 func (x Uint16x16) Permute2Masked(y Uint16x16, indices Uint16x16, u Mask16x16) Uint16x16
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2W, CPU Feature: AVX512BW
 func (x Int16x32) Permute2Masked(y Int16x32, indices Uint16x32, u Mask16x32) Int16x32
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2W, CPU Feature: AVX512BW
 func (x Uint16x32) Permute2Masked(y Uint16x32, indices Uint16x32, u Mask16x32) Uint16x32
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2PS, CPU Feature: AVX512F
 func (x Float32x4) Permute2Masked(y Float32x4, indices Uint32x4, u Mask32x4) Float32x4
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2D, CPU Feature: AVX512F
 func (x Int32x4) Permute2Masked(y Int32x4, indices Uint32x4, u Mask32x4) Int32x4
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2D, CPU Feature: AVX512F
 func (x Uint32x4) Permute2Masked(y Uint32x4, indices Uint32x4, u Mask32x4) Uint32x4
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2PS, CPU Feature: AVX512F
 func (x Float32x8) Permute2Masked(y Float32x8, indices Uint32x8, u Mask32x8) Float32x8
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2D, CPU Feature: AVX512F
 func (x Int32x8) Permute2Masked(y Int32x8, indices Uint32x8, u Mask32x8) Int32x8
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2D, CPU Feature: AVX512F
 func (x Uint32x8) Permute2Masked(y Uint32x8, indices Uint32x8, u Mask32x8) Uint32x8
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2PS, CPU Feature: AVX512F
 func (x Float32x16) Permute2Masked(y Float32x16, indices Uint32x16, u Mask32x16) Float32x16
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2D, CPU Feature: AVX512F
 func (x Int32x16) Permute2Masked(y Int32x16, indices Uint32x16, u Mask32x16) Int32x16
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2D, CPU Feature: AVX512F
 func (x Uint32x16) Permute2Masked(y Uint32x16, indices Uint32x16, u Mask32x16) Uint32x16
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2PD, CPU Feature: AVX512F
 func (x Float64x2) Permute2Masked(y Float64x2, indices Uint64x2, u Mask64x2) Float64x2
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2Q, CPU Feature: AVX512F
 func (x Int64x2) Permute2Masked(y Int64x2, indices Uint64x2, u Mask64x2) Int64x2
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2Q, CPU Feature: AVX512F
 func (x Uint64x2) Permute2Masked(y Uint64x2, indices Uint64x2, u Mask64x2) Uint64x2
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2PD, CPU Feature: AVX512F
 func (x Float64x4) Permute2Masked(y Float64x4, indices Uint64x4, u Mask64x4) Float64x4
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2Q, CPU Feature: AVX512F
 func (x Int64x4) Permute2Masked(y Int64x4, indices Uint64x4, u Mask64x4) Int64x4
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2Q, CPU Feature: AVX512F
 func (x Uint64x4) Permute2Masked(y Uint64x4, indices Uint64x4, u Mask64x4) Uint64x4
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2PD, CPU Feature: AVX512F
 func (x Float64x8) Permute2Masked(y Float64x8, indices Uint64x8, u Mask64x8) Float64x8
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2Q, CPU Feature: AVX512F
 func (x Int64x8) Permute2Masked(y Int64x8, indices Uint64x8, u Mask64x8) Int64x8
 // Permute2Masked performs a full permutation of vector x, y using indices:
 // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
 // where xy is x appending y.
 // Only the needed bits to represent xy's index are used in indices' elements.
 //
 // Asm: VPERMI2Q, CPU Feature: AVX512F
 func (x Uint64x8) Permute2Masked(y Uint64x8, indices Uint64x8, u Mask64x8) Uint64x8
 /* PermuteMasked */
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Int8x16) PermuteMasked(indices Uint8x16, z Mask8x16) Int8x16
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Uint8x16) PermuteMasked(indices Uint8x16, z Mask8x16) Uint8x16
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Int8x32) PermuteMasked(indices Uint8x32, z Mask8x32) Int8x32
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Uint8x32) PermuteMasked(indices Uint8x32, z Mask8x32) Uint8x32
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Int8x64) PermuteMasked(indices Uint8x64, z Mask8x64) Int8x64
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the needed bits to represent x's index are used in indices' elements.
 //
 // Asm: VPERMB, CPU Feature: AVX512VBMI
 func (x Uint8x64) PermuteMasked(indices Uint8x64, z Mask8x64) Uint8x64
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Int16x8) PermuteMasked(indices Uint16x8, z Mask16x8) Int16x8
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Uint16x8) PermuteMasked(indices Uint16x8, z Mask16x8) Uint16x8
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Int16x16) PermuteMasked(indices Uint16x16, z Mask16x16) Int16x16
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Uint16x16) PermuteMasked(indices Uint16x16, z Mask16x16) Uint16x16
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Int16x32) PermuteMasked(indices Uint16x32, z Mask16x32) Int16x32
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMW, CPU Feature: AVX512BW
 func (x Uint16x32) PermuteMasked(indices Uint16x32, z Mask16x32) Uint16x32
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMPS, CPU Feature: AVX512F
 func (x Float32x8) PermuteMasked(indices Uint32x8, z Mask32x8) Float32x8
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMD, CPU Feature: AVX512F
 func (x Int32x8) PermuteMasked(indices Uint32x8, z Mask32x8) Int32x8
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMD, CPU Feature: AVX512F
 func (x Uint32x8) PermuteMasked(indices Uint32x8, z Mask32x8) Uint32x8
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMPS, CPU Feature: AVX512F
 func (x Float32x16) PermuteMasked(indices Uint32x16, z Mask32x16) Float32x16
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMD, CPU Feature: AVX512F
 func (x Int32x16) PermuteMasked(indices Uint32x16, z Mask32x16) Int32x16
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMD, CPU Feature: AVX512F
 func (x Uint32x16) PermuteMasked(indices Uint32x16, z Mask32x16) Uint32x16
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMPD, CPU Feature: AVX512F
 func (x Float64x4) PermuteMasked(indices Uint64x4, z Mask64x4) Float64x4
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMQ, CPU Feature: AVX512F
 func (x Int64x4) PermuteMasked(indices Uint64x4, z Mask64x4) Int64x4
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMQ, CPU Feature: AVX512F
 func (x Uint64x4) PermuteMasked(indices Uint64x4, z Mask64x4) Uint64x4
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMPD, CPU Feature: AVX512F
 func (x Float64x8) PermuteMasked(indices Uint64x8, z Mask64x8) Float64x8
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMQ, CPU Feature: AVX512F
 func (x Int64x8) PermuteMasked(indices Uint64x8, z Mask64x8) Int64x8
 // PermuteMasked performs a full permutation of vector x using indices:
 // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
 // Only the bits needed to represent an index into x are used from each element of indices.
 // z is the mask operand for the masked form of the instruction.
 //
 // Asm: VPERMQ, CPU Feature: AVX512F
 func (x Uint64x8) PermuteMasked(indices Uint64x8, z Mask64x8) Uint64x8
 /* PopCount */
 // PopCount counts the number of set bits in each element.
--- a/src/simd/simd_test.go
+++ b/src/simd/simd_test.go
@ -151,6 +151,41 @@ func TestMaskedAdd(t *testing.T) {
 	testInt32x4BinaryMasked(t, []int32{1, 2, 3, 4}, []int32{5, 6, 7, 8}, []int32{-1, -1, 0, 0}, []int32{6, 8, 0, 0}, "AddMasked")
 }
 // TestPermute checks Int64x8.Permute with a descending index vector:
 // the expected output is the input with its elements in reverse order.
 // The test is skipped when simd.HasAVX512 reports false.
 func TestPermute(t *testing.T) {
 	if !simd.HasAVX512() {
 		t.Skip("Test requires HasAVX512, not available on this hardware")
 		return
 	}
 	src := []int64{1, 2, 3, 4, 5, 6, 7, 8}
 	idx := []uint64{7, 6, 5, 4, 3, 2, 1, 0}
 	expected := []int64{8, 7, 6, 5, 4, 3, 2, 1}
 	actual := make([]int64, 8)
 	// Load the data first, then the indices, and write the permuted vector out.
 	vec := simd.LoadInt64x8Slice(src)
 	sel := simd.LoadUint64x8Slice(idx)
 	vec.Permute(sel).StoreSlice(actual)
 	for i, w := range expected {
 		if g := actual[i]; w != g {
 			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, w, g)
 		}
 	}
 }
 // TestPermute2 checks Int64x8.Permute2 on two source vectors.
 // Per the expected values below, indices 0..7 pick elements of x and
 // indices 8..15 pick elements of y, so the result alternates between them.
 // The test is skipped when simd.HasAVX512 reports false.
 func TestPermute2(t *testing.T) {
 	if !simd.HasAVX512() {
 		t.Skip("Test requires HasAVX512, not available on this hardware")
 		return
 	}
 	first := []int64{1, 2, 3, 4, 5, 6, 7, 8}
 	second := []int64{-1, -2, -3, -4, -5, -6, -7, -8}
 	// Entries written as k + 8 are the ones expected to read from the second vector.
 	idx := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
 	expected := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
 	actual := make([]int64, 8)
 	// Load in the same order as the call's operands: x, then y, then indices.
 	vx := simd.LoadInt64x8Slice(first)
 	vy := simd.LoadInt64x8Slice(second)
 	sel := simd.LoadUint64x8Slice(idx)
 	vx.Permute2(vy, sel).StoreSlice(actual)
 	for i, w := range expected {
 		if g := actual[i]; w != g {
 			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, w, g)
 		}
 	}
 }
 // checkInt8Slices ensures that b and a are equal, to the end of b.
 // also serves to use the slices, to prevent accidental optimization.
 func checkInt8Slices(t *testing.T, a, b []int8) {
--- a/src/simd/simd_wrapped_test.go
+++ b/src/simd/simd_wrapped_test.go
@ -7800,6 +7800,10 @@ func testUint64x8UnaryMasked(t *testing.T, v0 []uint64, v1 []int64, want []uint6
 // GaloisFieldAffineTransformMasked
 // Get128
 // GetElem
 // Permute
 // Permute2
 // Permute2Masked
 // PermuteMasked
 // RotateAllLeft
 // RotateAllLeftMasked
 // RotateAllRight