mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
[dev.simd] simd: fix signatures for PermuteConstant* methods
This moves the packed-immediate methods to package-private, and adds exported versions with four parameters. Rename PermuteConstant to PermuteScalars Rename VPSHUFB Permute to PermuteOrZero Rename Permute2 to ConcatPermute Comments were repaired/enhanced. Modified the generator to support an additional tag "hideMaskMethods : true" to suppress method, intrinsic, generic, and generic translation generation for said mask-modified versions of such methods (this is already true for exported methods). Change-Id: I91e208c1fff1f28ebce4edb4e73d26003715018c Reviewed-on: https://go-review.googlesource.com/c/go/+/721342 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Junyang Shao <shaojunyang@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
parent
e3d4645693
commit
4d26d66a49
18 changed files with 2614 additions and 1820 deletions
|
|
@ -396,7 +396,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPOR256,
|
ssa.OpAMD64VPOR256,
|
||||||
ssa.OpAMD64VPORD512,
|
ssa.OpAMD64VPORD512,
|
||||||
ssa.OpAMD64VPORQ512,
|
ssa.OpAMD64VPORQ512,
|
||||||
ssa.OpAMD64VPSHUFB128,
|
ssa.OpAMD64VPERMB128,
|
||||||
ssa.OpAMD64VPERMB256,
|
ssa.OpAMD64VPERMB256,
|
||||||
ssa.OpAMD64VPERMB512,
|
ssa.OpAMD64VPERMB512,
|
||||||
ssa.OpAMD64VPERMW128,
|
ssa.OpAMD64VPERMW128,
|
||||||
|
|
@ -410,6 +410,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPERMQ256,
|
ssa.OpAMD64VPERMQ256,
|
||||||
ssa.OpAMD64VPERMPD512,
|
ssa.OpAMD64VPERMPD512,
|
||||||
ssa.OpAMD64VPERMQ512,
|
ssa.OpAMD64VPERMQ512,
|
||||||
|
ssa.OpAMD64VPSHUFB128,
|
||||||
ssa.OpAMD64VPSHUFB256,
|
ssa.OpAMD64VPSHUFB256,
|
||||||
ssa.OpAMD64VPSHUFB512,
|
ssa.OpAMD64VPSHUFB512,
|
||||||
ssa.OpAMD64VPROLVD128,
|
ssa.OpAMD64VPROLVD128,
|
||||||
|
|
@ -672,9 +673,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPORQMasked128,
|
ssa.OpAMD64VPORQMasked128,
|
||||||
ssa.OpAMD64VPORQMasked256,
|
ssa.OpAMD64VPORQMasked256,
|
||||||
ssa.OpAMD64VPORQMasked512,
|
ssa.OpAMD64VPORQMasked512,
|
||||||
ssa.OpAMD64VPSHUFBMasked256,
|
ssa.OpAMD64VPERMBMasked128,
|
||||||
ssa.OpAMD64VPSHUFBMasked512,
|
|
||||||
ssa.OpAMD64VPSHUFBMasked128,
|
|
||||||
ssa.OpAMD64VPERMBMasked256,
|
ssa.OpAMD64VPERMBMasked256,
|
||||||
ssa.OpAMD64VPERMBMasked512,
|
ssa.OpAMD64VPERMBMasked512,
|
||||||
ssa.OpAMD64VPERMWMasked128,
|
ssa.OpAMD64VPERMWMasked128,
|
||||||
|
|
@ -688,6 +687,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPERMQMasked256,
|
ssa.OpAMD64VPERMQMasked256,
|
||||||
ssa.OpAMD64VPERMPDMasked512,
|
ssa.OpAMD64VPERMPDMasked512,
|
||||||
ssa.OpAMD64VPERMQMasked512,
|
ssa.OpAMD64VPERMQMasked512,
|
||||||
|
ssa.OpAMD64VPSHUFBMasked256,
|
||||||
|
ssa.OpAMD64VPSHUFBMasked512,
|
||||||
|
ssa.OpAMD64VPSHUFBMasked128,
|
||||||
ssa.OpAMD64VPROLVDMasked128,
|
ssa.OpAMD64VPROLVDMasked128,
|
||||||
ssa.OpAMD64VPROLVDMasked256,
|
ssa.OpAMD64VPROLVDMasked256,
|
||||||
ssa.OpAMD64VPROLVDMasked512,
|
ssa.OpAMD64VPROLVDMasked512,
|
||||||
|
|
@ -1011,12 +1013,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VEXTRACTF64X4256,
|
ssa.OpAMD64VEXTRACTF64X4256,
|
||||||
ssa.OpAMD64VEXTRACTI128128,
|
ssa.OpAMD64VEXTRACTI128128,
|
||||||
ssa.OpAMD64VEXTRACTI64X4256,
|
ssa.OpAMD64VEXTRACTI64X4256,
|
||||||
ssa.OpAMD64VPSHUFD128,
|
|
||||||
ssa.OpAMD64VPSHUFD256,
|
|
||||||
ssa.OpAMD64VPSHUFD512,
|
|
||||||
ssa.OpAMD64VPSHUFHW128,
|
|
||||||
ssa.OpAMD64VPSHUFHW256,
|
|
||||||
ssa.OpAMD64VPSHUFHW512,
|
|
||||||
ssa.OpAMD64VPROLD128,
|
ssa.OpAMD64VPROLD128,
|
||||||
ssa.OpAMD64VPROLD256,
|
ssa.OpAMD64VPROLD256,
|
||||||
ssa.OpAMD64VPROLD512,
|
ssa.OpAMD64VPROLD512,
|
||||||
|
|
@ -1029,6 +1025,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPRORQ128,
|
ssa.OpAMD64VPRORQ128,
|
||||||
ssa.OpAMD64VPRORQ256,
|
ssa.OpAMD64VPRORQ256,
|
||||||
ssa.OpAMD64VPRORQ512,
|
ssa.OpAMD64VPRORQ512,
|
||||||
|
ssa.OpAMD64VPSHUFD128,
|
||||||
|
ssa.OpAMD64VPSHUFD256,
|
||||||
|
ssa.OpAMD64VPSHUFD512,
|
||||||
|
ssa.OpAMD64VPSHUFHW128,
|
||||||
|
ssa.OpAMD64VPSHUFHW256,
|
||||||
|
ssa.OpAMD64VPSHUFHW512,
|
||||||
|
ssa.OpAMD64VPSHUFLW128,
|
||||||
|
ssa.OpAMD64VPSHUFLW256,
|
||||||
|
ssa.OpAMD64VPSHUFLW512,
|
||||||
ssa.OpAMD64VPSLLW128const,
|
ssa.OpAMD64VPSLLW128const,
|
||||||
ssa.OpAMD64VPSLLW256const,
|
ssa.OpAMD64VPSLLW256const,
|
||||||
ssa.OpAMD64VPSLLW512const,
|
ssa.OpAMD64VPSLLW512const,
|
||||||
|
|
@ -1070,12 +1075,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VREDUCEPDMasked128,
|
ssa.OpAMD64VREDUCEPDMasked128,
|
||||||
ssa.OpAMD64VREDUCEPDMasked256,
|
ssa.OpAMD64VREDUCEPDMasked256,
|
||||||
ssa.OpAMD64VREDUCEPDMasked512,
|
ssa.OpAMD64VREDUCEPDMasked512,
|
||||||
ssa.OpAMD64VPSHUFDMasked256,
|
|
||||||
ssa.OpAMD64VPSHUFDMasked512,
|
|
||||||
ssa.OpAMD64VPSHUFHWMasked256,
|
|
||||||
ssa.OpAMD64VPSHUFHWMasked512,
|
|
||||||
ssa.OpAMD64VPSHUFHWMasked128,
|
|
||||||
ssa.OpAMD64VPSHUFDMasked128,
|
|
||||||
ssa.OpAMD64VPROLDMasked128,
|
ssa.OpAMD64VPROLDMasked128,
|
||||||
ssa.OpAMD64VPROLDMasked256,
|
ssa.OpAMD64VPROLDMasked256,
|
||||||
ssa.OpAMD64VPROLDMasked512,
|
ssa.OpAMD64VPROLDMasked512,
|
||||||
|
|
@ -1088,6 +1087,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPRORQMasked128,
|
ssa.OpAMD64VPRORQMasked128,
|
||||||
ssa.OpAMD64VPRORQMasked256,
|
ssa.OpAMD64VPRORQMasked256,
|
||||||
ssa.OpAMD64VPRORQMasked512,
|
ssa.OpAMD64VPRORQMasked512,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked256,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked512,
|
||||||
|
ssa.OpAMD64VPSHUFHWMasked256,
|
||||||
|
ssa.OpAMD64VPSHUFHWMasked512,
|
||||||
|
ssa.OpAMD64VPSHUFHWMasked128,
|
||||||
|
ssa.OpAMD64VPSHUFLWMasked256,
|
||||||
|
ssa.OpAMD64VPSHUFLWMasked512,
|
||||||
|
ssa.OpAMD64VPSHUFLWMasked128,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked128,
|
||||||
ssa.OpAMD64VPSLLWMasked128const,
|
ssa.OpAMD64VPSLLWMasked128const,
|
||||||
ssa.OpAMD64VPSLLWMasked256const,
|
ssa.OpAMD64VPSLLWMasked256const,
|
||||||
ssa.OpAMD64VPSLLWMasked512const,
|
ssa.OpAMD64VPSLLWMasked512const,
|
||||||
|
|
@ -1209,6 +1217,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
case ssa.OpAMD64VPDPWSSD128,
|
case ssa.OpAMD64VPDPWSSD128,
|
||||||
ssa.OpAMD64VPDPWSSD256,
|
ssa.OpAMD64VPDPWSSD256,
|
||||||
ssa.OpAMD64VPDPWSSD512,
|
ssa.OpAMD64VPDPWSSD512,
|
||||||
|
ssa.OpAMD64VPERMI2B128,
|
||||||
|
ssa.OpAMD64VPERMI2B256,
|
||||||
|
ssa.OpAMD64VPERMI2B512,
|
||||||
|
ssa.OpAMD64VPERMI2W128,
|
||||||
|
ssa.OpAMD64VPERMI2W256,
|
||||||
|
ssa.OpAMD64VPERMI2W512,
|
||||||
|
ssa.OpAMD64VPERMI2PS128,
|
||||||
|
ssa.OpAMD64VPERMI2D128,
|
||||||
|
ssa.OpAMD64VPERMI2PS256,
|
||||||
|
ssa.OpAMD64VPERMI2D256,
|
||||||
|
ssa.OpAMD64VPERMI2PS512,
|
||||||
|
ssa.OpAMD64VPERMI2D512,
|
||||||
|
ssa.OpAMD64VPERMI2PD128,
|
||||||
|
ssa.OpAMD64VPERMI2Q128,
|
||||||
|
ssa.OpAMD64VPERMI2PD256,
|
||||||
|
ssa.OpAMD64VPERMI2Q256,
|
||||||
|
ssa.OpAMD64VPERMI2PD512,
|
||||||
|
ssa.OpAMD64VPERMI2Q512,
|
||||||
ssa.OpAMD64VPDPBUSD128,
|
ssa.OpAMD64VPDPBUSD128,
|
||||||
ssa.OpAMD64VPDPBUSD256,
|
ssa.OpAMD64VPDPBUSD256,
|
||||||
ssa.OpAMD64VPDPBUSD512,
|
ssa.OpAMD64VPDPBUSD512,
|
||||||
|
|
@ -1233,24 +1259,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VFMSUBADD213PD128,
|
ssa.OpAMD64VFMSUBADD213PD128,
|
||||||
ssa.OpAMD64VFMSUBADD213PD256,
|
ssa.OpAMD64VFMSUBADD213PD256,
|
||||||
ssa.OpAMD64VFMSUBADD213PD512,
|
ssa.OpAMD64VFMSUBADD213PD512,
|
||||||
ssa.OpAMD64VPERMI2B128,
|
|
||||||
ssa.OpAMD64VPERMI2B256,
|
|
||||||
ssa.OpAMD64VPERMI2B512,
|
|
||||||
ssa.OpAMD64VPERMI2W128,
|
|
||||||
ssa.OpAMD64VPERMI2W256,
|
|
||||||
ssa.OpAMD64VPERMI2W512,
|
|
||||||
ssa.OpAMD64VPERMI2PS128,
|
|
||||||
ssa.OpAMD64VPERMI2D128,
|
|
||||||
ssa.OpAMD64VPERMI2PS256,
|
|
||||||
ssa.OpAMD64VPERMI2D256,
|
|
||||||
ssa.OpAMD64VPERMI2PS512,
|
|
||||||
ssa.OpAMD64VPERMI2D512,
|
|
||||||
ssa.OpAMD64VPERMI2PD128,
|
|
||||||
ssa.OpAMD64VPERMI2Q128,
|
|
||||||
ssa.OpAMD64VPERMI2PD256,
|
|
||||||
ssa.OpAMD64VPERMI2Q256,
|
|
||||||
ssa.OpAMD64VPERMI2PD512,
|
|
||||||
ssa.OpAMD64VPERMI2Q512,
|
|
||||||
ssa.OpAMD64VPSHLDVW128,
|
ssa.OpAMD64VPSHLDVW128,
|
||||||
ssa.OpAMD64VPSHLDVW256,
|
ssa.OpAMD64VPSHLDVW256,
|
||||||
ssa.OpAMD64VPSHLDVW512,
|
ssa.OpAMD64VPSHLDVW512,
|
||||||
|
|
@ -1316,6 +1324,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPAVGWMasked128Merging,
|
ssa.OpAMD64VPAVGWMasked128Merging,
|
||||||
ssa.OpAMD64VPAVGWMasked256Merging,
|
ssa.OpAMD64VPAVGWMasked256Merging,
|
||||||
ssa.OpAMD64VPAVGWMasked512Merging,
|
ssa.OpAMD64VPAVGWMasked512Merging,
|
||||||
|
ssa.OpAMD64VPERMI2BMasked128,
|
||||||
|
ssa.OpAMD64VPERMI2BMasked256,
|
||||||
|
ssa.OpAMD64VPERMI2BMasked512,
|
||||||
|
ssa.OpAMD64VPERMI2WMasked128,
|
||||||
|
ssa.OpAMD64VPERMI2WMasked256,
|
||||||
|
ssa.OpAMD64VPERMI2WMasked512,
|
||||||
|
ssa.OpAMD64VPERMI2PSMasked128,
|
||||||
|
ssa.OpAMD64VPERMI2DMasked128,
|
||||||
|
ssa.OpAMD64VPERMI2PSMasked256,
|
||||||
|
ssa.OpAMD64VPERMI2DMasked256,
|
||||||
|
ssa.OpAMD64VPERMI2PSMasked512,
|
||||||
|
ssa.OpAMD64VPERMI2DMasked512,
|
||||||
|
ssa.OpAMD64VPERMI2PDMasked128,
|
||||||
|
ssa.OpAMD64VPERMI2QMasked128,
|
||||||
|
ssa.OpAMD64VPERMI2PDMasked256,
|
||||||
|
ssa.OpAMD64VPERMI2QMasked256,
|
||||||
|
ssa.OpAMD64VPERMI2PDMasked512,
|
||||||
|
ssa.OpAMD64VPERMI2QMasked512,
|
||||||
ssa.OpAMD64VPALIGNRMasked256Merging,
|
ssa.OpAMD64VPALIGNRMasked256Merging,
|
||||||
ssa.OpAMD64VPALIGNRMasked512Merging,
|
ssa.OpAMD64VPALIGNRMasked512Merging,
|
||||||
ssa.OpAMD64VPALIGNRMasked128Merging,
|
ssa.OpAMD64VPALIGNRMasked128Merging,
|
||||||
|
|
@ -1451,24 +1477,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPORQMasked128Merging,
|
ssa.OpAMD64VPORQMasked128Merging,
|
||||||
ssa.OpAMD64VPORQMasked256Merging,
|
ssa.OpAMD64VPORQMasked256Merging,
|
||||||
ssa.OpAMD64VPORQMasked512Merging,
|
ssa.OpAMD64VPORQMasked512Merging,
|
||||||
ssa.OpAMD64VPERMI2BMasked128,
|
|
||||||
ssa.OpAMD64VPERMI2BMasked256,
|
|
||||||
ssa.OpAMD64VPERMI2BMasked512,
|
|
||||||
ssa.OpAMD64VPERMI2WMasked128,
|
|
||||||
ssa.OpAMD64VPERMI2WMasked256,
|
|
||||||
ssa.OpAMD64VPERMI2WMasked512,
|
|
||||||
ssa.OpAMD64VPERMI2PSMasked128,
|
|
||||||
ssa.OpAMD64VPERMI2DMasked128,
|
|
||||||
ssa.OpAMD64VPERMI2PSMasked256,
|
|
||||||
ssa.OpAMD64VPERMI2DMasked256,
|
|
||||||
ssa.OpAMD64VPERMI2PSMasked512,
|
|
||||||
ssa.OpAMD64VPERMI2DMasked512,
|
|
||||||
ssa.OpAMD64VPERMI2PDMasked128,
|
|
||||||
ssa.OpAMD64VPERMI2QMasked128,
|
|
||||||
ssa.OpAMD64VPERMI2PDMasked256,
|
|
||||||
ssa.OpAMD64VPERMI2QMasked256,
|
|
||||||
ssa.OpAMD64VPERMI2PDMasked512,
|
|
||||||
ssa.OpAMD64VPERMI2QMasked512,
|
|
||||||
ssa.OpAMD64VPSHUFBMasked256Merging,
|
ssa.OpAMD64VPSHUFBMasked256Merging,
|
||||||
ssa.OpAMD64VPSHUFBMasked512Merging,
|
ssa.OpAMD64VPSHUFBMasked512Merging,
|
||||||
ssa.OpAMD64VPSHUFBMasked128Merging,
|
ssa.OpAMD64VPSHUFBMasked128Merging,
|
||||||
|
|
@ -1819,6 +1827,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
p = simdV21load(s, v)
|
p = simdV21load(s, v)
|
||||||
|
|
||||||
case ssa.OpAMD64VPDPWSSD512load,
|
case ssa.OpAMD64VPDPWSSD512load,
|
||||||
|
ssa.OpAMD64VPERMI2PS128load,
|
||||||
|
ssa.OpAMD64VPERMI2D128load,
|
||||||
|
ssa.OpAMD64VPERMI2PS256load,
|
||||||
|
ssa.OpAMD64VPERMI2D256load,
|
||||||
|
ssa.OpAMD64VPERMI2PS512load,
|
||||||
|
ssa.OpAMD64VPERMI2D512load,
|
||||||
|
ssa.OpAMD64VPERMI2PD128load,
|
||||||
|
ssa.OpAMD64VPERMI2Q128load,
|
||||||
|
ssa.OpAMD64VPERMI2PD256load,
|
||||||
|
ssa.OpAMD64VPERMI2Q256load,
|
||||||
|
ssa.OpAMD64VPERMI2PD512load,
|
||||||
|
ssa.OpAMD64VPERMI2Q512load,
|
||||||
ssa.OpAMD64VPDPBUSD512load,
|
ssa.OpAMD64VPDPBUSD512load,
|
||||||
ssa.OpAMD64VPDPBUSDS512load,
|
ssa.OpAMD64VPDPBUSDS512load,
|
||||||
ssa.OpAMD64VFMADD213PS128load,
|
ssa.OpAMD64VFMADD213PS128load,
|
||||||
|
|
@ -1839,18 +1859,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VFMSUBADD213PD128load,
|
ssa.OpAMD64VFMSUBADD213PD128load,
|
||||||
ssa.OpAMD64VFMSUBADD213PD256load,
|
ssa.OpAMD64VFMSUBADD213PD256load,
|
||||||
ssa.OpAMD64VFMSUBADD213PD512load,
|
ssa.OpAMD64VFMSUBADD213PD512load,
|
||||||
ssa.OpAMD64VPERMI2PS128load,
|
|
||||||
ssa.OpAMD64VPERMI2D128load,
|
|
||||||
ssa.OpAMD64VPERMI2PS256load,
|
|
||||||
ssa.OpAMD64VPERMI2D256load,
|
|
||||||
ssa.OpAMD64VPERMI2PS512load,
|
|
||||||
ssa.OpAMD64VPERMI2D512load,
|
|
||||||
ssa.OpAMD64VPERMI2PD128load,
|
|
||||||
ssa.OpAMD64VPERMI2Q128load,
|
|
||||||
ssa.OpAMD64VPERMI2PD256load,
|
|
||||||
ssa.OpAMD64VPERMI2Q256load,
|
|
||||||
ssa.OpAMD64VPERMI2PD512load,
|
|
||||||
ssa.OpAMD64VPERMI2Q512load,
|
|
||||||
ssa.OpAMD64VPSHLDVD128load,
|
ssa.OpAMD64VPSHLDVD128load,
|
||||||
ssa.OpAMD64VPSHLDVD256load,
|
ssa.OpAMD64VPSHLDVD256load,
|
||||||
ssa.OpAMD64VPSHLDVD512load,
|
ssa.OpAMD64VPSHLDVD512load,
|
||||||
|
|
@ -1868,6 +1876,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
case ssa.OpAMD64VPDPWSSDMasked128load,
|
case ssa.OpAMD64VPDPWSSDMasked128load,
|
||||||
ssa.OpAMD64VPDPWSSDMasked256load,
|
ssa.OpAMD64VPDPWSSDMasked256load,
|
||||||
ssa.OpAMD64VPDPWSSDMasked512load,
|
ssa.OpAMD64VPDPWSSDMasked512load,
|
||||||
|
ssa.OpAMD64VPERMI2PSMasked128load,
|
||||||
|
ssa.OpAMD64VPERMI2DMasked128load,
|
||||||
|
ssa.OpAMD64VPERMI2PSMasked256load,
|
||||||
|
ssa.OpAMD64VPERMI2DMasked256load,
|
||||||
|
ssa.OpAMD64VPERMI2PSMasked512load,
|
||||||
|
ssa.OpAMD64VPERMI2DMasked512load,
|
||||||
|
ssa.OpAMD64VPERMI2PDMasked128load,
|
||||||
|
ssa.OpAMD64VPERMI2QMasked128load,
|
||||||
|
ssa.OpAMD64VPERMI2PDMasked256load,
|
||||||
|
ssa.OpAMD64VPERMI2QMasked256load,
|
||||||
|
ssa.OpAMD64VPERMI2PDMasked512load,
|
||||||
|
ssa.OpAMD64VPERMI2QMasked512load,
|
||||||
ssa.OpAMD64VPDPBUSDMasked128load,
|
ssa.OpAMD64VPDPBUSDMasked128load,
|
||||||
ssa.OpAMD64VPDPBUSDMasked256load,
|
ssa.OpAMD64VPDPBUSDMasked256load,
|
||||||
ssa.OpAMD64VPDPBUSDMasked512load,
|
ssa.OpAMD64VPDPBUSDMasked512load,
|
||||||
|
|
@ -1892,18 +1912,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VFMSUBADD213PDMasked128load,
|
ssa.OpAMD64VFMSUBADD213PDMasked128load,
|
||||||
ssa.OpAMD64VFMSUBADD213PDMasked256load,
|
ssa.OpAMD64VFMSUBADD213PDMasked256load,
|
||||||
ssa.OpAMD64VFMSUBADD213PDMasked512load,
|
ssa.OpAMD64VFMSUBADD213PDMasked512load,
|
||||||
ssa.OpAMD64VPERMI2PSMasked128load,
|
|
||||||
ssa.OpAMD64VPERMI2DMasked128load,
|
|
||||||
ssa.OpAMD64VPERMI2PSMasked256load,
|
|
||||||
ssa.OpAMD64VPERMI2DMasked256load,
|
|
||||||
ssa.OpAMD64VPERMI2PSMasked512load,
|
|
||||||
ssa.OpAMD64VPERMI2DMasked512load,
|
|
||||||
ssa.OpAMD64VPERMI2PDMasked128load,
|
|
||||||
ssa.OpAMD64VPERMI2QMasked128load,
|
|
||||||
ssa.OpAMD64VPERMI2PDMasked256load,
|
|
||||||
ssa.OpAMD64VPERMI2QMasked256load,
|
|
||||||
ssa.OpAMD64VPERMI2PDMasked512load,
|
|
||||||
ssa.OpAMD64VPERMI2QMasked512load,
|
|
||||||
ssa.OpAMD64VPSHLDVDMasked128load,
|
ssa.OpAMD64VPSHLDVDMasked128load,
|
||||||
ssa.OpAMD64VPSHLDVDMasked256load,
|
ssa.OpAMD64VPSHLDVDMasked256load,
|
||||||
ssa.OpAMD64VPSHLDVDMasked512load,
|
ssa.OpAMD64VPSHLDVDMasked512load,
|
||||||
|
|
@ -2124,7 +2132,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VREDUCEPD128load,
|
ssa.OpAMD64VREDUCEPD128load,
|
||||||
ssa.OpAMD64VREDUCEPD256load,
|
ssa.OpAMD64VREDUCEPD256load,
|
||||||
ssa.OpAMD64VREDUCEPD512load,
|
ssa.OpAMD64VREDUCEPD512load,
|
||||||
ssa.OpAMD64VPSHUFD512load,
|
|
||||||
ssa.OpAMD64VPROLD128load,
|
ssa.OpAMD64VPROLD128load,
|
||||||
ssa.OpAMD64VPROLD256load,
|
ssa.OpAMD64VPROLD256load,
|
||||||
ssa.OpAMD64VPROLD512load,
|
ssa.OpAMD64VPROLD512load,
|
||||||
|
|
@ -2137,6 +2144,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPRORQ128load,
|
ssa.OpAMD64VPRORQ128load,
|
||||||
ssa.OpAMD64VPRORQ256load,
|
ssa.OpAMD64VPRORQ256load,
|
||||||
ssa.OpAMD64VPRORQ512load,
|
ssa.OpAMD64VPRORQ512load,
|
||||||
|
ssa.OpAMD64VPSHUFD512load,
|
||||||
ssa.OpAMD64VPSLLD512constload,
|
ssa.OpAMD64VPSLLD512constload,
|
||||||
ssa.OpAMD64VPSLLQ512constload,
|
ssa.OpAMD64VPSLLQ512constload,
|
||||||
ssa.OpAMD64VPSRLD512constload,
|
ssa.OpAMD64VPSRLD512constload,
|
||||||
|
|
@ -2159,9 +2167,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VREDUCEPDMasked128load,
|
ssa.OpAMD64VREDUCEPDMasked128load,
|
||||||
ssa.OpAMD64VREDUCEPDMasked256load,
|
ssa.OpAMD64VREDUCEPDMasked256load,
|
||||||
ssa.OpAMD64VREDUCEPDMasked512load,
|
ssa.OpAMD64VREDUCEPDMasked512load,
|
||||||
ssa.OpAMD64VPSHUFDMasked256load,
|
|
||||||
ssa.OpAMD64VPSHUFDMasked512load,
|
|
||||||
ssa.OpAMD64VPSHUFDMasked128load,
|
|
||||||
ssa.OpAMD64VPROLDMasked128load,
|
ssa.OpAMD64VPROLDMasked128load,
|
||||||
ssa.OpAMD64VPROLDMasked256load,
|
ssa.OpAMD64VPROLDMasked256load,
|
||||||
ssa.OpAMD64VPROLDMasked512load,
|
ssa.OpAMD64VPROLDMasked512load,
|
||||||
|
|
@ -2174,6 +2179,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPRORQMasked128load,
|
ssa.OpAMD64VPRORQMasked128load,
|
||||||
ssa.OpAMD64VPRORQMasked256load,
|
ssa.OpAMD64VPRORQMasked256load,
|
||||||
ssa.OpAMD64VPRORQMasked512load,
|
ssa.OpAMD64VPRORQMasked512load,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked256load,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked512load,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked128load,
|
||||||
ssa.OpAMD64VPSLLDMasked128constload,
|
ssa.OpAMD64VPSLLDMasked128constload,
|
||||||
ssa.OpAMD64VPSLLDMasked256constload,
|
ssa.OpAMD64VPSLLDMasked256constload,
|
||||||
ssa.OpAMD64VPSLLDMasked512constload,
|
ssa.OpAMD64VPSLLDMasked512constload,
|
||||||
|
|
@ -2447,12 +2455,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPOPCNTQMasked128Merging,
|
ssa.OpAMD64VPOPCNTQMasked128Merging,
|
||||||
ssa.OpAMD64VPOPCNTQMasked256Merging,
|
ssa.OpAMD64VPOPCNTQMasked256Merging,
|
||||||
ssa.OpAMD64VPOPCNTQMasked512Merging,
|
ssa.OpAMD64VPOPCNTQMasked512Merging,
|
||||||
ssa.OpAMD64VPSHUFDMasked256Merging,
|
|
||||||
ssa.OpAMD64VPSHUFDMasked512Merging,
|
|
||||||
ssa.OpAMD64VPSHUFHWMasked256Merging,
|
|
||||||
ssa.OpAMD64VPSHUFHWMasked512Merging,
|
|
||||||
ssa.OpAMD64VPSHUFHWMasked128Merging,
|
|
||||||
ssa.OpAMD64VPSHUFDMasked128Merging,
|
|
||||||
ssa.OpAMD64VRCP14PSMasked128Merging,
|
ssa.OpAMD64VRCP14PSMasked128Merging,
|
||||||
ssa.OpAMD64VRCP14PSMasked256Merging,
|
ssa.OpAMD64VRCP14PSMasked256Merging,
|
||||||
ssa.OpAMD64VRCP14PSMasked512Merging,
|
ssa.OpAMD64VRCP14PSMasked512Merging,
|
||||||
|
|
@ -2483,6 +2485,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VSQRTPDMasked128Merging,
|
ssa.OpAMD64VSQRTPDMasked128Merging,
|
||||||
ssa.OpAMD64VSQRTPDMasked256Merging,
|
ssa.OpAMD64VSQRTPDMasked256Merging,
|
||||||
ssa.OpAMD64VSQRTPDMasked512Merging,
|
ssa.OpAMD64VSQRTPDMasked512Merging,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked256Merging,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked512Merging,
|
||||||
|
ssa.OpAMD64VPSHUFHWMasked256Merging,
|
||||||
|
ssa.OpAMD64VPSHUFHWMasked512Merging,
|
||||||
|
ssa.OpAMD64VPSHUFHWMasked128Merging,
|
||||||
|
ssa.OpAMD64VPSHUFLWMasked256Merging,
|
||||||
|
ssa.OpAMD64VPSHUFLWMasked512Merging,
|
||||||
|
ssa.OpAMD64VPSHUFLWMasked128Merging,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked128Merging,
|
||||||
ssa.OpAMD64VPSLLWMasked128constMerging,
|
ssa.OpAMD64VPSLLWMasked128constMerging,
|
||||||
ssa.OpAMD64VPSLLWMasked256constMerging,
|
ssa.OpAMD64VPSLLWMasked256constMerging,
|
||||||
ssa.OpAMD64VPSLLWMasked512constMerging,
|
ssa.OpAMD64VPSLLWMasked512constMerging,
|
||||||
|
|
@ -2674,6 +2685,36 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPCOMPRESSQMasked128,
|
ssa.OpAMD64VPCOMPRESSQMasked128,
|
||||||
ssa.OpAMD64VPCOMPRESSQMasked256,
|
ssa.OpAMD64VPCOMPRESSQMasked256,
|
||||||
ssa.OpAMD64VPCOMPRESSQMasked512,
|
ssa.OpAMD64VPCOMPRESSQMasked512,
|
||||||
|
ssa.OpAMD64VPERMI2BMasked128,
|
||||||
|
ssa.OpAMD64VPERMI2BMasked256,
|
||||||
|
ssa.OpAMD64VPERMI2BMasked512,
|
||||||
|
ssa.OpAMD64VPERMI2WMasked128,
|
||||||
|
ssa.OpAMD64VPERMI2WMasked256,
|
||||||
|
ssa.OpAMD64VPERMI2WMasked512,
|
||||||
|
ssa.OpAMD64VPERMI2PSMasked128,
|
||||||
|
ssa.OpAMD64VPERMI2PSMasked128load,
|
||||||
|
ssa.OpAMD64VPERMI2DMasked128,
|
||||||
|
ssa.OpAMD64VPERMI2DMasked128load,
|
||||||
|
ssa.OpAMD64VPERMI2PSMasked256,
|
||||||
|
ssa.OpAMD64VPERMI2PSMasked256load,
|
||||||
|
ssa.OpAMD64VPERMI2DMasked256,
|
||||||
|
ssa.OpAMD64VPERMI2DMasked256load,
|
||||||
|
ssa.OpAMD64VPERMI2PSMasked512,
|
||||||
|
ssa.OpAMD64VPERMI2PSMasked512load,
|
||||||
|
ssa.OpAMD64VPERMI2DMasked512,
|
||||||
|
ssa.OpAMD64VPERMI2DMasked512load,
|
||||||
|
ssa.OpAMD64VPERMI2PDMasked128,
|
||||||
|
ssa.OpAMD64VPERMI2PDMasked128load,
|
||||||
|
ssa.OpAMD64VPERMI2QMasked128,
|
||||||
|
ssa.OpAMD64VPERMI2QMasked128load,
|
||||||
|
ssa.OpAMD64VPERMI2PDMasked256,
|
||||||
|
ssa.OpAMD64VPERMI2PDMasked256load,
|
||||||
|
ssa.OpAMD64VPERMI2QMasked256,
|
||||||
|
ssa.OpAMD64VPERMI2QMasked256load,
|
||||||
|
ssa.OpAMD64VPERMI2PDMasked512,
|
||||||
|
ssa.OpAMD64VPERMI2PDMasked512load,
|
||||||
|
ssa.OpAMD64VPERMI2QMasked512,
|
||||||
|
ssa.OpAMD64VPERMI2QMasked512load,
|
||||||
ssa.OpAMD64VPALIGNRMasked256,
|
ssa.OpAMD64VPALIGNRMasked256,
|
||||||
ssa.OpAMD64VPALIGNRMasked512,
|
ssa.OpAMD64VPALIGNRMasked512,
|
||||||
ssa.OpAMD64VPALIGNRMasked128,
|
ssa.OpAMD64VPALIGNRMasked128,
|
||||||
|
|
@ -3061,48 +3102,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPORQMasked256load,
|
ssa.OpAMD64VPORQMasked256load,
|
||||||
ssa.OpAMD64VPORQMasked512,
|
ssa.OpAMD64VPORQMasked512,
|
||||||
ssa.OpAMD64VPORQMasked512load,
|
ssa.OpAMD64VPORQMasked512load,
|
||||||
ssa.OpAMD64VPERMI2BMasked128,
|
ssa.OpAMD64VPERMBMasked128,
|
||||||
ssa.OpAMD64VPERMI2BMasked256,
|
|
||||||
ssa.OpAMD64VPERMI2BMasked512,
|
|
||||||
ssa.OpAMD64VPERMI2WMasked128,
|
|
||||||
ssa.OpAMD64VPERMI2WMasked256,
|
|
||||||
ssa.OpAMD64VPERMI2WMasked512,
|
|
||||||
ssa.OpAMD64VPERMI2PSMasked128,
|
|
||||||
ssa.OpAMD64VPERMI2PSMasked128load,
|
|
||||||
ssa.OpAMD64VPERMI2DMasked128,
|
|
||||||
ssa.OpAMD64VPERMI2DMasked128load,
|
|
||||||
ssa.OpAMD64VPERMI2PSMasked256,
|
|
||||||
ssa.OpAMD64VPERMI2PSMasked256load,
|
|
||||||
ssa.OpAMD64VPERMI2DMasked256,
|
|
||||||
ssa.OpAMD64VPERMI2DMasked256load,
|
|
||||||
ssa.OpAMD64VPERMI2PSMasked512,
|
|
||||||
ssa.OpAMD64VPERMI2PSMasked512load,
|
|
||||||
ssa.OpAMD64VPERMI2DMasked512,
|
|
||||||
ssa.OpAMD64VPERMI2DMasked512load,
|
|
||||||
ssa.OpAMD64VPERMI2PDMasked128,
|
|
||||||
ssa.OpAMD64VPERMI2PDMasked128load,
|
|
||||||
ssa.OpAMD64VPERMI2QMasked128,
|
|
||||||
ssa.OpAMD64VPERMI2QMasked128load,
|
|
||||||
ssa.OpAMD64VPERMI2PDMasked256,
|
|
||||||
ssa.OpAMD64VPERMI2PDMasked256load,
|
|
||||||
ssa.OpAMD64VPERMI2QMasked256,
|
|
||||||
ssa.OpAMD64VPERMI2QMasked256load,
|
|
||||||
ssa.OpAMD64VPERMI2PDMasked512,
|
|
||||||
ssa.OpAMD64VPERMI2PDMasked512load,
|
|
||||||
ssa.OpAMD64VPERMI2QMasked512,
|
|
||||||
ssa.OpAMD64VPERMI2QMasked512load,
|
|
||||||
ssa.OpAMD64VPSHUFDMasked256,
|
|
||||||
ssa.OpAMD64VPSHUFDMasked256load,
|
|
||||||
ssa.OpAMD64VPSHUFDMasked512,
|
|
||||||
ssa.OpAMD64VPSHUFDMasked512load,
|
|
||||||
ssa.OpAMD64VPSHUFHWMasked256,
|
|
||||||
ssa.OpAMD64VPSHUFHWMasked512,
|
|
||||||
ssa.OpAMD64VPSHUFHWMasked128,
|
|
||||||
ssa.OpAMD64VPSHUFDMasked128,
|
|
||||||
ssa.OpAMD64VPSHUFDMasked128load,
|
|
||||||
ssa.OpAMD64VPSHUFBMasked256,
|
|
||||||
ssa.OpAMD64VPSHUFBMasked512,
|
|
||||||
ssa.OpAMD64VPSHUFBMasked128,
|
|
||||||
ssa.OpAMD64VPERMBMasked256,
|
ssa.OpAMD64VPERMBMasked256,
|
||||||
ssa.OpAMD64VPERMBMasked512,
|
ssa.OpAMD64VPERMBMasked512,
|
||||||
ssa.OpAMD64VPERMWMasked128,
|
ssa.OpAMD64VPERMWMasked128,
|
||||||
|
|
@ -3124,6 +3124,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VPERMPDMasked512load,
|
ssa.OpAMD64VPERMPDMasked512load,
|
||||||
ssa.OpAMD64VPERMQMasked512,
|
ssa.OpAMD64VPERMQMasked512,
|
||||||
ssa.OpAMD64VPERMQMasked512load,
|
ssa.OpAMD64VPERMQMasked512load,
|
||||||
|
ssa.OpAMD64VPSHUFBMasked256,
|
||||||
|
ssa.OpAMD64VPSHUFBMasked512,
|
||||||
|
ssa.OpAMD64VPSHUFBMasked128,
|
||||||
ssa.OpAMD64VRCP14PSMasked128,
|
ssa.OpAMD64VRCP14PSMasked128,
|
||||||
ssa.OpAMD64VRCP14PSMasked128load,
|
ssa.OpAMD64VRCP14PSMasked128load,
|
||||||
ssa.OpAMD64VRCP14PSMasked256,
|
ssa.OpAMD64VRCP14PSMasked256,
|
||||||
|
|
@ -3418,6 +3421,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
||||||
ssa.OpAMD64VMOVDQU64Masked128,
|
ssa.OpAMD64VMOVDQU64Masked128,
|
||||||
ssa.OpAMD64VMOVDQU64Masked256,
|
ssa.OpAMD64VMOVDQU64Masked256,
|
||||||
ssa.OpAMD64VMOVDQU64Masked512,
|
ssa.OpAMD64VMOVDQU64Masked512,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked256,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked256load,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked512,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked512load,
|
||||||
|
ssa.OpAMD64VPSHUFHWMasked256,
|
||||||
|
ssa.OpAMD64VPSHUFHWMasked512,
|
||||||
|
ssa.OpAMD64VPSHUFHWMasked128,
|
||||||
|
ssa.OpAMD64VPSHUFLWMasked256,
|
||||||
|
ssa.OpAMD64VPSHUFLWMasked512,
|
||||||
|
ssa.OpAMD64VPSHUFLWMasked128,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked128,
|
||||||
|
ssa.OpAMD64VPSHUFDMasked128load,
|
||||||
ssa.OpAMD64VPSLLWMasked128const,
|
ssa.OpAMD64VPSLLWMasked128const,
|
||||||
ssa.OpAMD64VPSLLWMasked256const,
|
ssa.OpAMD64VPSLLWMasked256const,
|
||||||
ssa.OpAMD64VPSLLWMasked512const,
|
ssa.OpAMD64VPSLLWMasked512const,
|
||||||
|
|
|
||||||
|
|
@ -216,6 +216,36 @@
|
||||||
(CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
|
(CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
|
||||||
(CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
|
(CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
|
||||||
(CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
|
(CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
|
||||||
|
(ConcatPermuteFloat32x4 ...) => (VPERMI2PS128 ...)
|
||||||
|
(ConcatPermuteFloat32x8 ...) => (VPERMI2PS256 ...)
|
||||||
|
(ConcatPermuteFloat32x16 ...) => (VPERMI2PS512 ...)
|
||||||
|
(ConcatPermuteFloat64x2 ...) => (VPERMI2PD128 ...)
|
||||||
|
(ConcatPermuteFloat64x4 ...) => (VPERMI2PD256 ...)
|
||||||
|
(ConcatPermuteFloat64x8 ...) => (VPERMI2PD512 ...)
|
||||||
|
(ConcatPermuteInt8x16 ...) => (VPERMI2B128 ...)
|
||||||
|
(ConcatPermuteInt8x32 ...) => (VPERMI2B256 ...)
|
||||||
|
(ConcatPermuteInt8x64 ...) => (VPERMI2B512 ...)
|
||||||
|
(ConcatPermuteInt16x8 ...) => (VPERMI2W128 ...)
|
||||||
|
(ConcatPermuteInt16x16 ...) => (VPERMI2W256 ...)
|
||||||
|
(ConcatPermuteInt16x32 ...) => (VPERMI2W512 ...)
|
||||||
|
(ConcatPermuteInt32x4 ...) => (VPERMI2D128 ...)
|
||||||
|
(ConcatPermuteInt32x8 ...) => (VPERMI2D256 ...)
|
||||||
|
(ConcatPermuteInt32x16 ...) => (VPERMI2D512 ...)
|
||||||
|
(ConcatPermuteInt64x2 ...) => (VPERMI2Q128 ...)
|
||||||
|
(ConcatPermuteInt64x4 ...) => (VPERMI2Q256 ...)
|
||||||
|
(ConcatPermuteInt64x8 ...) => (VPERMI2Q512 ...)
|
||||||
|
(ConcatPermuteUint8x16 ...) => (VPERMI2B128 ...)
|
||||||
|
(ConcatPermuteUint8x32 ...) => (VPERMI2B256 ...)
|
||||||
|
(ConcatPermuteUint8x64 ...) => (VPERMI2B512 ...)
|
||||||
|
(ConcatPermuteUint16x8 ...) => (VPERMI2W128 ...)
|
||||||
|
(ConcatPermuteUint16x16 ...) => (VPERMI2W256 ...)
|
||||||
|
(ConcatPermuteUint16x32 ...) => (VPERMI2W512 ...)
|
||||||
|
(ConcatPermuteUint32x4 ...) => (VPERMI2D128 ...)
|
||||||
|
(ConcatPermuteUint32x8 ...) => (VPERMI2D256 ...)
|
||||||
|
(ConcatPermuteUint32x16 ...) => (VPERMI2D512 ...)
|
||||||
|
(ConcatPermuteUint64x2 ...) => (VPERMI2Q128 ...)
|
||||||
|
(ConcatPermuteUint64x4 ...) => (VPERMI2Q256 ...)
|
||||||
|
(ConcatPermuteUint64x8 ...) => (VPERMI2Q512 ...)
|
||||||
(ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...)
|
(ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...)
|
||||||
(ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...)
|
(ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...)
|
||||||
(ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...)
|
(ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...)
|
||||||
|
|
@ -794,7 +824,7 @@
|
||||||
(PermuteFloat32x16 ...) => (VPERMPS512 ...)
|
(PermuteFloat32x16 ...) => (VPERMPS512 ...)
|
||||||
(PermuteFloat64x4 ...) => (VPERMPD256 ...)
|
(PermuteFloat64x4 ...) => (VPERMPD256 ...)
|
||||||
(PermuteFloat64x8 ...) => (VPERMPD512 ...)
|
(PermuteFloat64x8 ...) => (VPERMPD512 ...)
|
||||||
(PermuteInt8x16 ...) => (VPSHUFB128 ...)
|
(PermuteInt8x16 ...) => (VPERMB128 ...)
|
||||||
(PermuteInt8x32 ...) => (VPERMB256 ...)
|
(PermuteInt8x32 ...) => (VPERMB256 ...)
|
||||||
(PermuteInt8x64 ...) => (VPERMB512 ...)
|
(PermuteInt8x64 ...) => (VPERMB512 ...)
|
||||||
(PermuteInt16x8 ...) => (VPERMW128 ...)
|
(PermuteInt16x8 ...) => (VPERMW128 ...)
|
||||||
|
|
@ -804,7 +834,7 @@
|
||||||
(PermuteInt32x16 ...) => (VPERMD512 ...)
|
(PermuteInt32x16 ...) => (VPERMD512 ...)
|
||||||
(PermuteInt64x4 ...) => (VPERMQ256 ...)
|
(PermuteInt64x4 ...) => (VPERMQ256 ...)
|
||||||
(PermuteInt64x8 ...) => (VPERMQ512 ...)
|
(PermuteInt64x8 ...) => (VPERMQ512 ...)
|
||||||
(PermuteUint8x16 ...) => (VPSHUFB128 ...)
|
(PermuteUint8x16 ...) => (VPERMB128 ...)
|
||||||
(PermuteUint8x32 ...) => (VPERMB256 ...)
|
(PermuteUint8x32 ...) => (VPERMB256 ...)
|
||||||
(PermuteUint8x64 ...) => (VPERMB512 ...)
|
(PermuteUint8x64 ...) => (VPERMB512 ...)
|
||||||
(PermuteUint16x8 ...) => (VPERMW128 ...)
|
(PermuteUint16x8 ...) => (VPERMW128 ...)
|
||||||
|
|
@ -814,62 +844,12 @@
|
||||||
(PermuteUint32x16 ...) => (VPERMD512 ...)
|
(PermuteUint32x16 ...) => (VPERMD512 ...)
|
||||||
(PermuteUint64x4 ...) => (VPERMQ256 ...)
|
(PermuteUint64x4 ...) => (VPERMQ256 ...)
|
||||||
(PermuteUint64x8 ...) => (VPERMQ512 ...)
|
(PermuteUint64x8 ...) => (VPERMQ512 ...)
|
||||||
(Permute2Float32x4 ...) => (VPERMI2PS128 ...)
|
(PermuteOrZeroInt8x16 ...) => (VPSHUFB128 ...)
|
||||||
(Permute2Float32x8 ...) => (VPERMI2PS256 ...)
|
(PermuteOrZeroUint8x16 ...) => (VPSHUFB128 ...)
|
||||||
(Permute2Float32x16 ...) => (VPERMI2PS512 ...)
|
(PermuteOrZeroGroupedInt8x32 ...) => (VPSHUFB256 ...)
|
||||||
(Permute2Float64x2 ...) => (VPERMI2PD128 ...)
|
(PermuteOrZeroGroupedInt8x64 ...) => (VPSHUFB512 ...)
|
||||||
(Permute2Float64x4 ...) => (VPERMI2PD256 ...)
|
(PermuteOrZeroGroupedUint8x32 ...) => (VPSHUFB256 ...)
|
||||||
(Permute2Float64x8 ...) => (VPERMI2PD512 ...)
|
(PermuteOrZeroGroupedUint8x64 ...) => (VPSHUFB512 ...)
|
||||||
(Permute2Int8x16 ...) => (VPERMI2B128 ...)
|
|
||||||
(Permute2Int8x32 ...) => (VPERMI2B256 ...)
|
|
||||||
(Permute2Int8x64 ...) => (VPERMI2B512 ...)
|
|
||||||
(Permute2Int16x8 ...) => (VPERMI2W128 ...)
|
|
||||||
(Permute2Int16x16 ...) => (VPERMI2W256 ...)
|
|
||||||
(Permute2Int16x32 ...) => (VPERMI2W512 ...)
|
|
||||||
(Permute2Int32x4 ...) => (VPERMI2D128 ...)
|
|
||||||
(Permute2Int32x8 ...) => (VPERMI2D256 ...)
|
|
||||||
(Permute2Int32x16 ...) => (VPERMI2D512 ...)
|
|
||||||
(Permute2Int64x2 ...) => (VPERMI2Q128 ...)
|
|
||||||
(Permute2Int64x4 ...) => (VPERMI2Q256 ...)
|
|
||||||
(Permute2Int64x8 ...) => (VPERMI2Q512 ...)
|
|
||||||
(Permute2Uint8x16 ...) => (VPERMI2B128 ...)
|
|
||||||
(Permute2Uint8x32 ...) => (VPERMI2B256 ...)
|
|
||||||
(Permute2Uint8x64 ...) => (VPERMI2B512 ...)
|
|
||||||
(Permute2Uint16x8 ...) => (VPERMI2W128 ...)
|
|
||||||
(Permute2Uint16x16 ...) => (VPERMI2W256 ...)
|
|
||||||
(Permute2Uint16x32 ...) => (VPERMI2W512 ...)
|
|
||||||
(Permute2Uint32x4 ...) => (VPERMI2D128 ...)
|
|
||||||
(Permute2Uint32x8 ...) => (VPERMI2D256 ...)
|
|
||||||
(Permute2Uint32x16 ...) => (VPERMI2D512 ...)
|
|
||||||
(Permute2Uint64x2 ...) => (VPERMI2Q128 ...)
|
|
||||||
(Permute2Uint64x4 ...) => (VPERMI2Q256 ...)
|
|
||||||
(Permute2Uint64x8 ...) => (VPERMI2Q512 ...)
|
|
||||||
(PermuteConstantInt32x4 ...) => (VPSHUFD128 ...)
|
|
||||||
(PermuteConstantUint32x4 ...) => (VPSHUFD128 ...)
|
|
||||||
(PermuteConstantGroupedInt32x8 ...) => (VPSHUFD256 ...)
|
|
||||||
(PermuteConstantGroupedInt32x16 ...) => (VPSHUFD512 ...)
|
|
||||||
(PermuteConstantGroupedUint32x8 ...) => (VPSHUFD256 ...)
|
|
||||||
(PermuteConstantGroupedUint32x16 ...) => (VPSHUFD512 ...)
|
|
||||||
(PermuteConstantHiInt16x8 ...) => (VPSHUFHW128 ...)
|
|
||||||
(PermuteConstantHiInt32x4 ...) => (VPSHUFHW128 ...)
|
|
||||||
(PermuteConstantHiUint16x8 ...) => (VPSHUFHW128 ...)
|
|
||||||
(PermuteConstantHiUint32x4 ...) => (VPSHUFHW128 ...)
|
|
||||||
(PermuteConstantHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
|
|
||||||
(PermuteConstantHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
|
|
||||||
(PermuteConstantHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
|
|
||||||
(PermuteConstantHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
|
|
||||||
(PermuteConstantLoInt16x8 ...) => (VPSHUFHW128 ...)
|
|
||||||
(PermuteConstantLoInt32x4 ...) => (VPSHUFHW128 ...)
|
|
||||||
(PermuteConstantLoUint16x8 ...) => (VPSHUFHW128 ...)
|
|
||||||
(PermuteConstantLoUint32x4 ...) => (VPSHUFHW128 ...)
|
|
||||||
(PermuteConstantLoGroupedInt16x16 ...) => (VPSHUFHW256 ...)
|
|
||||||
(PermuteConstantLoGroupedInt16x32 ...) => (VPSHUFHW512 ...)
|
|
||||||
(PermuteConstantLoGroupedUint16x16 ...) => (VPSHUFHW256 ...)
|
|
||||||
(PermuteConstantLoGroupedUint16x32 ...) => (VPSHUFHW512 ...)
|
|
||||||
(PermuteGroupedInt8x32 ...) => (VPSHUFB256 ...)
|
|
||||||
(PermuteGroupedInt8x64 ...) => (VPSHUFB512 ...)
|
|
||||||
(PermuteGroupedUint8x32 ...) => (VPSHUFB256 ...)
|
|
||||||
(PermuteGroupedUint8x64 ...) => (VPSHUFB512 ...)
|
|
||||||
(ReciprocalFloat32x4 ...) => (VRCPPS128 ...)
|
(ReciprocalFloat32x4 ...) => (VRCPPS128 ...)
|
||||||
(ReciprocalFloat32x8 ...) => (VRCPPS256 ...)
|
(ReciprocalFloat32x8 ...) => (VRCPPS256 ...)
|
||||||
(ReciprocalFloat32x16 ...) => (VRCP14PS512 ...)
|
(ReciprocalFloat32x16 ...) => (VRCP14PS512 ...)
|
||||||
|
|
@ -1324,6 +1304,24 @@
|
||||||
(concatSelectedConstantGroupedUint32x16 ...) => (VSHUFPS512 ...)
|
(concatSelectedConstantGroupedUint32x16 ...) => (VSHUFPS512 ...)
|
||||||
(concatSelectedConstantGroupedUint64x4 ...) => (VSHUFPD256 ...)
|
(concatSelectedConstantGroupedUint64x4 ...) => (VSHUFPD256 ...)
|
||||||
(concatSelectedConstantGroupedUint64x8 ...) => (VSHUFPD512 ...)
|
(concatSelectedConstantGroupedUint64x8 ...) => (VSHUFPD512 ...)
|
||||||
|
(permuteScalarsInt32x4 ...) => (VPSHUFD128 ...)
|
||||||
|
(permuteScalarsUint32x4 ...) => (VPSHUFD128 ...)
|
||||||
|
(permuteScalarsGroupedInt32x8 ...) => (VPSHUFD256 ...)
|
||||||
|
(permuteScalarsGroupedInt32x16 ...) => (VPSHUFD512 ...)
|
||||||
|
(permuteScalarsGroupedUint32x8 ...) => (VPSHUFD256 ...)
|
||||||
|
(permuteScalarsGroupedUint32x16 ...) => (VPSHUFD512 ...)
|
||||||
|
(permuteScalarsHiInt16x8 ...) => (VPSHUFHW128 ...)
|
||||||
|
(permuteScalarsHiUint16x8 ...) => (VPSHUFHW128 ...)
|
||||||
|
(permuteScalarsHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
|
||||||
|
(permuteScalarsHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
|
||||||
|
(permuteScalarsHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
|
||||||
|
(permuteScalarsHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
|
||||||
|
(permuteScalarsLoInt16x8 ...) => (VPSHUFLW128 ...)
|
||||||
|
(permuteScalarsLoUint16x8 ...) => (VPSHUFLW128 ...)
|
||||||
|
(permuteScalarsLoGroupedInt16x16 ...) => (VPSHUFLW256 ...)
|
||||||
|
(permuteScalarsLoGroupedInt16x32 ...) => (VPSHUFLW512 ...)
|
||||||
|
(permuteScalarsLoGroupedUint16x16 ...) => (VPSHUFLW256 ...)
|
||||||
|
(permuteScalarsLoGroupedUint16x32 ...) => (VPSHUFLW512 ...)
|
||||||
(ternInt32x4 ...) => (VPTERNLOGD128 ...)
|
(ternInt32x4 ...) => (VPTERNLOGD128 ...)
|
||||||
(ternInt32x8 ...) => (VPTERNLOGD256 ...)
|
(ternInt32x8 ...) => (VPTERNLOGD256 ...)
|
||||||
(ternInt32x16 ...) => (VPTERNLOGD512 ...)
|
(ternInt32x16 ...) => (VPTERNLOGD512 ...)
|
||||||
|
|
@ -1417,6 +1415,24 @@
|
||||||
(VMOVDQU64Masked128 (VREDUCEPD128 [a] x) mask) => (VREDUCEPDMasked128 [a] x mask)
|
(VMOVDQU64Masked128 (VREDUCEPD128 [a] x) mask) => (VREDUCEPDMasked128 [a] x mask)
|
||||||
(VMOVDQU64Masked256 (VREDUCEPD256 [a] x) mask) => (VREDUCEPDMasked256 [a] x mask)
|
(VMOVDQU64Masked256 (VREDUCEPD256 [a] x) mask) => (VREDUCEPDMasked256 [a] x mask)
|
||||||
(VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask)
|
(VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask)
|
||||||
|
(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask)
|
||||||
|
(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask)
|
||||||
|
(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask)
|
||||||
|
(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask)
|
||||||
|
(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask)
|
||||||
|
(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask)
|
||||||
|
(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask)
|
||||||
|
(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask)
|
||||||
|
(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask)
|
||||||
|
(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask)
|
||||||
|
(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask)
|
||||||
|
(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
|
||||||
|
(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask)
|
||||||
|
(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask)
|
||||||
|
(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask)
|
||||||
|
(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask)
|
||||||
|
(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
|
||||||
|
(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
|
||||||
(VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) => (VPALIGNRMasked256 [a] x y mask)
|
(VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) => (VPALIGNRMasked256 [a] x y mask)
|
||||||
(VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512 [a] x y mask)
|
(VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512 [a] x y mask)
|
||||||
(VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) => (VPALIGNRMasked128 [a] x y mask)
|
(VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) => (VPALIGNRMasked128 [a] x y mask)
|
||||||
|
|
@ -1668,33 +1684,7 @@
|
||||||
(VMOVDQU64Masked512 (VPOPCNTQ512 x) mask) => (VPOPCNTQMasked512 x mask)
|
(VMOVDQU64Masked512 (VPOPCNTQ512 x) mask) => (VPOPCNTQMasked512 x mask)
|
||||||
(VMOVDQU32Masked512 (VPORD512 x y) mask) => (VPORDMasked512 x y mask)
|
(VMOVDQU32Masked512 (VPORD512 x y) mask) => (VPORDMasked512 x y mask)
|
||||||
(VMOVDQU64Masked512 (VPORQ512 x y) mask) => (VPORQMasked512 x y mask)
|
(VMOVDQU64Masked512 (VPORQ512 x y) mask) => (VPORQMasked512 x y mask)
|
||||||
(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask)
|
(VMOVDQU8Masked128 (VPERMB128 x y) mask) => (VPERMBMasked128 x y mask)
|
||||||
(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask)
|
|
||||||
(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask)
|
|
||||||
(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask)
|
|
||||||
(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask)
|
|
||||||
(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask)
|
|
||||||
(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask)
|
|
||||||
(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask)
|
|
||||||
(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask)
|
|
||||||
(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask)
|
|
||||||
(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask)
|
|
||||||
(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
|
|
||||||
(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask)
|
|
||||||
(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask)
|
|
||||||
(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask)
|
|
||||||
(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask)
|
|
||||||
(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
|
|
||||||
(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
|
|
||||||
(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask)
|
|
||||||
(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
|
|
||||||
(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask)
|
|
||||||
(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
|
|
||||||
(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask)
|
|
||||||
(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask)
|
|
||||||
(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask)
|
|
||||||
(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
|
|
||||||
(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask)
|
|
||||||
(VMOVDQU8Masked256 (VPERMB256 x y) mask) => (VPERMBMasked256 x y mask)
|
(VMOVDQU8Masked256 (VPERMB256 x y) mask) => (VPERMBMasked256 x y mask)
|
||||||
(VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask)
|
(VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask)
|
||||||
(VMOVDQU16Masked128 (VPERMW128 x y) mask) => (VPERMWMasked128 x y mask)
|
(VMOVDQU16Masked128 (VPERMW128 x y) mask) => (VPERMWMasked128 x y mask)
|
||||||
|
|
@ -1708,6 +1698,9 @@
|
||||||
(VMOVDQU64Masked256 (VPERMQ256 x y) mask) => (VPERMQMasked256 x y mask)
|
(VMOVDQU64Masked256 (VPERMQ256 x y) mask) => (VPERMQMasked256 x y mask)
|
||||||
(VMOVDQU64Masked512 (VPERMPD512 x y) mask) => (VPERMPDMasked512 x y mask)
|
(VMOVDQU64Masked512 (VPERMPD512 x y) mask) => (VPERMPDMasked512 x y mask)
|
||||||
(VMOVDQU64Masked512 (VPERMQ512 x y) mask) => (VPERMQMasked512 x y mask)
|
(VMOVDQU64Masked512 (VPERMQ512 x y) mask) => (VPERMQMasked512 x y mask)
|
||||||
|
(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask)
|
||||||
|
(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
|
||||||
|
(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask)
|
||||||
(VMOVDQU32Masked512 (VRCP14PS512 x) mask) => (VRCP14PSMasked512 x mask)
|
(VMOVDQU32Masked512 (VRCP14PS512 x) mask) => (VRCP14PSMasked512 x mask)
|
||||||
(VMOVDQU64Masked128 (VRCP14PD128 x) mask) => (VRCP14PDMasked128 x mask)
|
(VMOVDQU64Masked128 (VRCP14PD128 x) mask) => (VRCP14PDMasked128 x mask)
|
||||||
(VMOVDQU64Masked256 (VRCP14PD256 x) mask) => (VRCP14PDMasked256 x mask)
|
(VMOVDQU64Masked256 (VRCP14PD256 x) mask) => (VRCP14PDMasked256 x mask)
|
||||||
|
|
@ -1874,6 +1867,15 @@
|
||||||
(VMOVDQU16Masked512 (VPSUBUSW512 x y) mask) => (VPSUBUSWMasked512 x y mask)
|
(VMOVDQU16Masked512 (VPSUBUSW512 x y) mask) => (VPSUBUSWMasked512 x y mask)
|
||||||
(VMOVDQU32Masked512 (VPXORD512 x y) mask) => (VPXORDMasked512 x y mask)
|
(VMOVDQU32Masked512 (VPXORD512 x y) mask) => (VPXORDMasked512 x y mask)
|
||||||
(VMOVDQU64Masked512 (VPXORQ512 x y) mask) => (VPXORQMasked512 x y mask)
|
(VMOVDQU64Masked512 (VPXORQ512 x y) mask) => (VPXORQMasked512 x y mask)
|
||||||
|
(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask)
|
||||||
|
(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
|
||||||
|
(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask)
|
||||||
|
(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
|
||||||
|
(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask)
|
||||||
|
(VMOVDQU16Masked256 (VPSHUFLW256 [a] x) mask) => (VPSHUFLWMasked256 [a] x mask)
|
||||||
|
(VMOVDQU16Masked512 (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512 [a] x mask)
|
||||||
|
(VMOVDQU16Masked128 (VPSHUFLW128 [a] x) mask) => (VPSHUFLWMasked128 [a] x mask)
|
||||||
|
(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask)
|
||||||
(VMOVDQU16Masked128 (VPSLLW128const [a] x) mask) => (VPSLLWMasked128const [a] x mask)
|
(VMOVDQU16Masked128 (VPSLLW128const [a] x) mask) => (VPSLLWMasked128const [a] x mask)
|
||||||
(VMOVDQU16Masked256 (VPSLLW256const [a] x) mask) => (VPSLLWMasked256const [a] x mask)
|
(VMOVDQU16Masked256 (VPSLLW256const [a] x) mask) => (VPSLLWMasked256const [a] x mask)
|
||||||
(VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) => (VPSLLWMasked512const [a] x mask)
|
(VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) => (VPSLLWMasked512const [a] x mask)
|
||||||
|
|
@ -2021,6 +2023,7 @@
|
||||||
(VPBLENDMWMasked512 dst (VPSHLDW512 [a] x y) mask) => (VPSHLDWMasked512Merging dst [a] x y mask)
|
(VPBLENDMWMasked512 dst (VPSHLDW512 [a] x y) mask) => (VPSHLDWMasked512Merging dst [a] x y mask)
|
||||||
(VPBLENDMWMasked512 dst (VPSHRDW512 [a] x y) mask) => (VPSHRDWMasked512Merging dst [a] x y mask)
|
(VPBLENDMWMasked512 dst (VPSHRDW512 [a] x y) mask) => (VPSHRDWMasked512Merging dst [a] x y mask)
|
||||||
(VPBLENDMWMasked512 dst (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512Merging dst [a] x mask)
|
(VPBLENDMWMasked512 dst (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512Merging dst [a] x mask)
|
||||||
|
(VPBLENDMWMasked512 dst (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512Merging dst [a] x mask)
|
||||||
(VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask) => (VPSLLVWMasked512Merging dst x y mask)
|
(VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask) => (VPSLLVWMasked512Merging dst x y mask)
|
||||||
(VPBLENDMWMasked512 dst (VPSLLW512const [a] x) mask) => (VPSLLWMasked512constMerging dst [a] x mask)
|
(VPBLENDMWMasked512 dst (VPSLLW512const [a] x) mask) => (VPSLLWMasked512constMerging dst [a] x mask)
|
||||||
(VPBLENDMWMasked512 dst (VPSRAVW512 x y) mask) => (VPSRAVWMasked512Merging dst x y mask)
|
(VPBLENDMWMasked512 dst (VPSRAVW512 x y) mask) => (VPSRAVWMasked512Merging dst x y mask)
|
||||||
|
|
@ -2170,6 +2173,7 @@
|
||||||
(VPBLENDVB128 dst (VPSHUFB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
|
(VPBLENDVB128 dst (VPSHUFB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
|
||||||
(VPBLENDVB128 dst (VPSHUFD128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked128Merging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
|
(VPBLENDVB128 dst (VPSHUFD128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked128Merging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
|
||||||
(VPBLENDVB128 dst (VPSHUFHW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
|
(VPBLENDVB128 dst (VPSHUFHW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
|
||||||
|
(VPBLENDVB128 dst (VPSHUFLW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
|
||||||
(VPBLENDVB128 dst (VPSLLD128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
|
(VPBLENDVB128 dst (VPSLLD128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
|
||||||
(VPBLENDVB128 dst (VPSLLQ128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked128constMerging dst [a] x (VPMOVVec64x2ToM <types.TypeMask> mask))
|
(VPBLENDVB128 dst (VPSLLQ128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked128constMerging dst [a] x (VPMOVVec64x2ToM <types.TypeMask> mask))
|
||||||
(VPBLENDVB128 dst (VPSLLVD128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked128Merging dst x y (VPMOVVec32x4ToM <types.TypeMask> mask))
|
(VPBLENDVB128 dst (VPSLLVD128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked128Merging dst x y (VPMOVVec32x4ToM <types.TypeMask> mask))
|
||||||
|
|
@ -2305,6 +2309,7 @@
|
||||||
(VPBLENDVB256 dst (VPSHUFB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
|
(VPBLENDVB256 dst (VPSHUFB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
|
||||||
(VPBLENDVB256 dst (VPSHUFD256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked256Merging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
|
(VPBLENDVB256 dst (VPSHUFD256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked256Merging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
|
||||||
(VPBLENDVB256 dst (VPSHUFHW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
|
(VPBLENDVB256 dst (VPSHUFHW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
|
||||||
|
(VPBLENDVB256 dst (VPSHUFLW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
|
||||||
(VPBLENDVB256 dst (VPSLLD256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
|
(VPBLENDVB256 dst (VPSLLD256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
|
||||||
(VPBLENDVB256 dst (VPSLLQ256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked256constMerging dst [a] x (VPMOVVec64x4ToM <types.TypeMask> mask))
|
(VPBLENDVB256 dst (VPSLLQ256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked256constMerging dst [a] x (VPMOVVec64x4ToM <types.TypeMask> mask))
|
||||||
(VPBLENDVB256 dst (VPSLLVD256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked256Merging dst x y (VPMOVVec32x8ToM <types.TypeMask> mask))
|
(VPBLENDVB256 dst (VPSLLVD256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked256Merging dst x y (VPMOVVec32x8ToM <types.TypeMask> mask))
|
||||||
|
|
@ -2410,6 +2415,30 @@
|
||||||
(VREDUCEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
(VREDUCEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
||||||
(VREDUCEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
(VREDUCEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
||||||
(VREDUCEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
(VREDUCEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
||||||
|
(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem)
|
||||||
|
(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem)
|
||||||
|
(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem)
|
||||||
|
(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem)
|
||||||
|
(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem)
|
||||||
|
(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem)
|
||||||
|
(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem)
|
||||||
|
(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem)
|
||||||
|
(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem)
|
||||||
|
(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem)
|
||||||
|
(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem)
|
||||||
|
(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem)
|
||||||
|
(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem)
|
||||||
|
(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem)
|
||||||
|
(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem)
|
||||||
|
(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem)
|
||||||
|
(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem)
|
||||||
|
(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem)
|
||||||
|
(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem)
|
||||||
|
(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem)
|
||||||
|
(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem)
|
||||||
|
(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem)
|
||||||
|
(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem)
|
||||||
|
(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem)
|
||||||
(VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem)
|
(VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem)
|
||||||
(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
|
(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
|
||||||
(VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem)
|
(VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem)
|
||||||
|
|
@ -2636,34 +2665,6 @@
|
||||||
(VPERMQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ256load {sym} [off] x ptr mem)
|
(VPERMQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ256load {sym} [off] x ptr mem)
|
||||||
(VPERMPD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMPD512load {sym} [off] x ptr mem)
|
(VPERMPD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMPD512load {sym} [off] x ptr mem)
|
||||||
(VPERMQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ512load {sym} [off] x ptr mem)
|
(VPERMQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ512load {sym} [off] x ptr mem)
|
||||||
(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem)
|
|
||||||
(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem)
|
|
||||||
(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem)
|
|
||||||
(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem)
|
|
||||||
(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem)
|
|
||||||
(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem)
|
|
||||||
(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem)
|
|
||||||
(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem)
|
|
||||||
(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem)
|
|
||||||
(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem)
|
|
||||||
(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem)
|
|
||||||
(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem)
|
|
||||||
(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem)
|
|
||||||
(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem)
|
|
||||||
(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem)
|
|
||||||
(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem)
|
|
||||||
(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem)
|
|
||||||
(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem)
|
|
||||||
(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem)
|
|
||||||
(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem)
|
|
||||||
(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem)
|
|
||||||
(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem)
|
|
||||||
(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem)
|
|
||||||
(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem)
|
|
||||||
(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
|
|
||||||
(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
|
||||||
(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
|
||||||
(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
|
||||||
(VPERMPSMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked256load {sym} [off] x ptr mask mem)
|
(VPERMPSMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked256load {sym} [off] x ptr mask mem)
|
||||||
(VPERMDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMDMasked256load {sym} [off] x ptr mask mem)
|
(VPERMDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMDMasked256load {sym} [off] x ptr mask mem)
|
||||||
(VPERMPSMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked512load {sym} [off] x ptr mask mem)
|
(VPERMPSMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked512load {sym} [off] x ptr mask mem)
|
||||||
|
|
@ -2862,6 +2863,10 @@
|
||||||
(VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem)
|
(VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem)
|
||||||
(VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
|
(VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
|
||||||
(VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
|
(VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
|
||||||
|
(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
|
||||||
|
(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
||||||
|
(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
||||||
|
(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
||||||
(VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
|
(VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
|
||||||
(VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
|
(VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
|
||||||
(VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
(VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
|
||||||
|
|
|
||||||
|
|
@ -383,8 +383,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
|
||||||
{name: "VPDPWSSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true},
|
{name: "VPDPWSSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true},
|
||||||
{name: "VPDPWSSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true},
|
{name: "VPDPWSSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true},
|
||||||
{name: "VPDPWSSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true},
|
{name: "VPDPWSSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true},
|
||||||
|
{name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||||
{name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
|
{name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||||
{name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
|
{name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||||
|
{name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||||
{name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
|
{name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||||
{name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
|
{name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||||
{name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
|
{name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||||
|
|
@ -1310,6 +1312,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
|
||||||
{name: "VPSHUFHWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
|
{name: "VPSHUFHWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||||
{name: "VPSHUFHWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
|
{name: "VPSHUFHWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||||
{name: "VPSHUFHWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
|
{name: "VPSHUFHWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||||
|
{name: "VPSHUFLW128", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||||
|
{name: "VPSHUFLW256", argLength: 1, reg: v11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||||
|
{name: "VPSHUFLW512", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||||
|
{name: "VPSHUFLWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||||
|
{name: "VPSHUFLWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||||
|
{name: "VPSHUFLWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||||
{name: "VPSLLD128const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
|
{name: "VPSLLD128const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||||
{name: "VPSLLD256const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
|
{name: "VPSLLD256const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||||
{name: "VPSLLD512const", argLength: 1, reg: w11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
|
{name: "VPSLLD512const", argLength: 1, reg: w11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||||
|
|
@ -2392,6 +2400,9 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
|
||||||
{name: "VPSHUFHWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
|
{name: "VPSHUFHWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
|
||||||
{name: "VPSHUFHWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
|
{name: "VPSHUFHWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
|
||||||
{name: "VPSHUFHWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
|
{name: "VPSHUFHWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
|
||||||
|
{name: "VPSHUFLWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
|
||||||
|
{name: "VPSHUFLWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
|
||||||
|
{name: "VPSHUFLWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
|
||||||
{name: "VPSLLDMasked128constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
|
{name: "VPSLLDMasked128constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
|
||||||
{name: "VPSLLDMasked256constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
|
{name: "VPSLLDMasked256constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
|
||||||
{name: "VPSLLDMasked512constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
|
{name: "VPSLLDMasked512constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
|
||||||
|
|
|
||||||
|
|
@ -207,6 +207,36 @@ func simdGenericOps() []opData {
|
||||||
{name: "CompressUint64x2", argLength: 2, commutative: false},
|
{name: "CompressUint64x2", argLength: 2, commutative: false},
|
||||||
{name: "CompressUint64x4", argLength: 2, commutative: false},
|
{name: "CompressUint64x4", argLength: 2, commutative: false},
|
||||||
{name: "CompressUint64x8", argLength: 2, commutative: false},
|
{name: "CompressUint64x8", argLength: 2, commutative: false},
|
||||||
|
{name: "ConcatPermuteFloat32x4", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteFloat32x8", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteFloat32x16", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteFloat64x2", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteFloat64x4", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteFloat64x8", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteInt8x16", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteInt8x32", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteInt8x64", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteInt16x8", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteInt16x16", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteInt16x32", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteInt32x4", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteInt32x8", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteInt32x16", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteInt64x2", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteInt64x4", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteInt64x8", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteUint8x16", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteUint8x32", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteUint8x64", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteUint16x8", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteUint16x16", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteUint16x32", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteUint32x4", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteUint32x8", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteUint32x16", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteUint64x2", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteUint64x4", argLength: 3, commutative: false},
|
||||||
|
{name: "ConcatPermuteUint64x8", argLength: 3, commutative: false},
|
||||||
{name: "ConvertToInt8Int16x8", argLength: 1, commutative: false},
|
{name: "ConvertToInt8Int16x8", argLength: 1, commutative: false},
|
||||||
{name: "ConvertToInt8Int16x16", argLength: 1, commutative: false},
|
{name: "ConvertToInt8Int16x16", argLength: 1, commutative: false},
|
||||||
{name: "ConvertToInt8Int16x32", argLength: 1, commutative: false},
|
{name: "ConvertToInt8Int16x32", argLength: 1, commutative: false},
|
||||||
|
|
@ -750,44 +780,10 @@ func simdGenericOps() []opData {
|
||||||
{name: "OrUint64x2", argLength: 2, commutative: true},
|
{name: "OrUint64x2", argLength: 2, commutative: true},
|
||||||
{name: "OrUint64x4", argLength: 2, commutative: true},
|
{name: "OrUint64x4", argLength: 2, commutative: true},
|
||||||
{name: "OrUint64x8", argLength: 2, commutative: true},
|
{name: "OrUint64x8", argLength: 2, commutative: true},
|
||||||
{name: "Permute2Float32x4", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Float32x8", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Float32x16", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Float64x2", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Float64x4", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Float64x8", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Int8x16", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Int8x32", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Int8x64", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Int16x8", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Int16x16", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Int16x32", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Int32x4", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Int32x8", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Int32x16", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Int64x2", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Int64x4", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Int64x8", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Uint8x16", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Uint8x32", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Uint8x64", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Uint16x8", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Uint16x16", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Uint16x32", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Uint32x4", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Uint32x8", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Uint32x16", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Uint64x2", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Uint64x4", argLength: 3, commutative: false},
|
|
||||||
{name: "Permute2Uint64x8", argLength: 3, commutative: false},
|
|
||||||
{name: "PermuteFloat32x8", argLength: 2, commutative: false},
|
{name: "PermuteFloat32x8", argLength: 2, commutative: false},
|
||||||
{name: "PermuteFloat32x16", argLength: 2, commutative: false},
|
{name: "PermuteFloat32x16", argLength: 2, commutative: false},
|
||||||
{name: "PermuteFloat64x4", argLength: 2, commutative: false},
|
{name: "PermuteFloat64x4", argLength: 2, commutative: false},
|
||||||
{name: "PermuteFloat64x8", argLength: 2, commutative: false},
|
{name: "PermuteFloat64x8", argLength: 2, commutative: false},
|
||||||
{name: "PermuteGroupedInt8x32", argLength: 2, commutative: false},
|
|
||||||
{name: "PermuteGroupedInt8x64", argLength: 2, commutative: false},
|
|
||||||
{name: "PermuteGroupedUint8x32", argLength: 2, commutative: false},
|
|
||||||
{name: "PermuteGroupedUint8x64", argLength: 2, commutative: false},
|
|
||||||
{name: "PermuteInt8x16", argLength: 2, commutative: false},
|
{name: "PermuteInt8x16", argLength: 2, commutative: false},
|
||||||
{name: "PermuteInt8x32", argLength: 2, commutative: false},
|
{name: "PermuteInt8x32", argLength: 2, commutative: false},
|
||||||
{name: "PermuteInt8x64", argLength: 2, commutative: false},
|
{name: "PermuteInt8x64", argLength: 2, commutative: false},
|
||||||
|
|
@ -798,6 +794,12 @@ func simdGenericOps() []opData {
|
||||||
{name: "PermuteInt32x16", argLength: 2, commutative: false},
|
{name: "PermuteInt32x16", argLength: 2, commutative: false},
|
||||||
{name: "PermuteInt64x4", argLength: 2, commutative: false},
|
{name: "PermuteInt64x4", argLength: 2, commutative: false},
|
||||||
{name: "PermuteInt64x8", argLength: 2, commutative: false},
|
{name: "PermuteInt64x8", argLength: 2, commutative: false},
|
||||||
|
{name: "PermuteOrZeroGroupedInt8x32", argLength: 2, commutative: false},
|
||||||
|
{name: "PermuteOrZeroGroupedInt8x64", argLength: 2, commutative: false},
|
||||||
|
{name: "PermuteOrZeroGroupedUint8x32", argLength: 2, commutative: false},
|
||||||
|
{name: "PermuteOrZeroGroupedUint8x64", argLength: 2, commutative: false},
|
||||||
|
{name: "PermuteOrZeroInt8x16", argLength: 2, commutative: false},
|
||||||
|
{name: "PermuteOrZeroUint8x16", argLength: 2, commutative: false},
|
||||||
{name: "PermuteUint8x16", argLength: 2, commutative: false},
|
{name: "PermuteUint8x16", argLength: 2, commutative: false},
|
||||||
{name: "PermuteUint8x32", argLength: 2, commutative: false},
|
{name: "PermuteUint8x32", argLength: 2, commutative: false},
|
||||||
{name: "PermuteUint8x64", argLength: 2, commutative: false},
|
{name: "PermuteUint8x64", argLength: 2, commutative: false},
|
||||||
|
|
@ -1151,28 +1153,6 @@ func simdGenericOps() []opData {
|
||||||
{name: "GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
|
{name: "GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
{name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
|
{name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
{name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"},
|
{name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
{name: "PermuteConstantGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantHiInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantHiUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantLoInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantLoUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "PermuteConstantUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
|
|
||||||
{name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
|
{name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
{name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
|
{name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
{name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
|
{name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
|
@ -1292,6 +1272,24 @@ func simdGenericOps() []opData {
|
||||||
{name: "concatSelectedConstantInt64x2", argLength: 2, commutative: false, aux: "UInt8"},
|
{name: "concatSelectedConstantInt64x2", argLength: 2, commutative: false, aux: "UInt8"},
|
||||||
{name: "concatSelectedConstantUint32x4", argLength: 2, commutative: false, aux: "UInt8"},
|
{name: "concatSelectedConstantUint32x4", argLength: 2, commutative: false, aux: "UInt8"},
|
||||||
{name: "concatSelectedConstantUint64x2", argLength: 2, commutative: false, aux: "UInt8"},
|
{name: "concatSelectedConstantUint64x2", argLength: 2, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
|
{name: "permuteScalarsUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
|
||||||
{name: "ternInt32x4", argLength: 3, commutative: false, aux: "UInt8"},
|
{name: "ternInt32x4", argLength: 3, commutative: false, aux: "UInt8"},
|
||||||
{name: "ternInt32x8", argLength: 3, commutative: false, aux: "UInt8"},
|
{name: "ternInt32x8", argLength: 3, commutative: false, aux: "UInt8"},
|
||||||
{name: "ternInt32x16", argLength: 3, commutative: false, aux: "UInt8"},
|
{name: "ternInt32x16", argLength: 3, commutative: false, aux: "UInt8"},
|
||||||
|
|
|
||||||
|
|
@ -1624,8 +1624,10 @@ const (
|
||||||
OpAMD64VPDPWSSDMasked128
|
OpAMD64VPDPWSSDMasked128
|
||||||
OpAMD64VPDPWSSDMasked256
|
OpAMD64VPDPWSSDMasked256
|
||||||
OpAMD64VPDPWSSDMasked512
|
OpAMD64VPDPWSSDMasked512
|
||||||
|
OpAMD64VPERMB128
|
||||||
OpAMD64VPERMB256
|
OpAMD64VPERMB256
|
||||||
OpAMD64VPERMB512
|
OpAMD64VPERMB512
|
||||||
|
OpAMD64VPERMBMasked128
|
||||||
OpAMD64VPERMBMasked256
|
OpAMD64VPERMBMasked256
|
||||||
OpAMD64VPERMBMasked512
|
OpAMD64VPERMBMasked512
|
||||||
OpAMD64VPERMD256
|
OpAMD64VPERMD256
|
||||||
|
|
@ -2551,6 +2553,12 @@ const (
|
||||||
OpAMD64VPSHUFHWMasked128
|
OpAMD64VPSHUFHWMasked128
|
||||||
OpAMD64VPSHUFHWMasked256
|
OpAMD64VPSHUFHWMasked256
|
||||||
OpAMD64VPSHUFHWMasked512
|
OpAMD64VPSHUFHWMasked512
|
||||||
|
OpAMD64VPSHUFLW128
|
||||||
|
OpAMD64VPSHUFLW256
|
||||||
|
OpAMD64VPSHUFLW512
|
||||||
|
OpAMD64VPSHUFLWMasked128
|
||||||
|
OpAMD64VPSHUFLWMasked256
|
||||||
|
OpAMD64VPSHUFLWMasked512
|
||||||
OpAMD64VPSLLD128const
|
OpAMD64VPSLLD128const
|
||||||
OpAMD64VPSLLD256const
|
OpAMD64VPSLLD256const
|
||||||
OpAMD64VPSLLD512const
|
OpAMD64VPSLLD512const
|
||||||
|
|
@ -3633,6 +3641,9 @@ const (
|
||||||
OpAMD64VPSHUFHWMasked128Merging
|
OpAMD64VPSHUFHWMasked128Merging
|
||||||
OpAMD64VPSHUFHWMasked256Merging
|
OpAMD64VPSHUFHWMasked256Merging
|
||||||
OpAMD64VPSHUFHWMasked512Merging
|
OpAMD64VPSHUFHWMasked512Merging
|
||||||
|
OpAMD64VPSHUFLWMasked128Merging
|
||||||
|
OpAMD64VPSHUFLWMasked256Merging
|
||||||
|
OpAMD64VPSHUFLWMasked512Merging
|
||||||
OpAMD64VPSLLDMasked128constMerging
|
OpAMD64VPSLLDMasked128constMerging
|
||||||
OpAMD64VPSLLDMasked256constMerging
|
OpAMD64VPSLLDMasked256constMerging
|
||||||
OpAMD64VPSLLDMasked512constMerging
|
OpAMD64VPSLLDMasked512constMerging
|
||||||
|
|
@ -6155,6 +6166,36 @@ const (
|
||||||
OpCompressUint64x2
|
OpCompressUint64x2
|
||||||
OpCompressUint64x4
|
OpCompressUint64x4
|
||||||
OpCompressUint64x8
|
OpCompressUint64x8
|
||||||
|
OpConcatPermuteFloat32x4
|
||||||
|
OpConcatPermuteFloat32x8
|
||||||
|
OpConcatPermuteFloat32x16
|
||||||
|
OpConcatPermuteFloat64x2
|
||||||
|
OpConcatPermuteFloat64x4
|
||||||
|
OpConcatPermuteFloat64x8
|
||||||
|
OpConcatPermuteInt8x16
|
||||||
|
OpConcatPermuteInt8x32
|
||||||
|
OpConcatPermuteInt8x64
|
||||||
|
OpConcatPermuteInt16x8
|
||||||
|
OpConcatPermuteInt16x16
|
||||||
|
OpConcatPermuteInt16x32
|
||||||
|
OpConcatPermuteInt32x4
|
||||||
|
OpConcatPermuteInt32x8
|
||||||
|
OpConcatPermuteInt32x16
|
||||||
|
OpConcatPermuteInt64x2
|
||||||
|
OpConcatPermuteInt64x4
|
||||||
|
OpConcatPermuteInt64x8
|
||||||
|
OpConcatPermuteUint8x16
|
||||||
|
OpConcatPermuteUint8x32
|
||||||
|
OpConcatPermuteUint8x64
|
||||||
|
OpConcatPermuteUint16x8
|
||||||
|
OpConcatPermuteUint16x16
|
||||||
|
OpConcatPermuteUint16x32
|
||||||
|
OpConcatPermuteUint32x4
|
||||||
|
OpConcatPermuteUint32x8
|
||||||
|
OpConcatPermuteUint32x16
|
||||||
|
OpConcatPermuteUint64x2
|
||||||
|
OpConcatPermuteUint64x4
|
||||||
|
OpConcatPermuteUint64x8
|
||||||
OpConvertToInt8Int16x8
|
OpConvertToInt8Int16x8
|
||||||
OpConvertToInt8Int16x16
|
OpConvertToInt8Int16x16
|
||||||
OpConvertToInt8Int16x32
|
OpConvertToInt8Int16x32
|
||||||
|
|
@ -6698,44 +6739,10 @@ const (
|
||||||
OpOrUint64x2
|
OpOrUint64x2
|
||||||
OpOrUint64x4
|
OpOrUint64x4
|
||||||
OpOrUint64x8
|
OpOrUint64x8
|
||||||
OpPermute2Float32x4
|
|
||||||
OpPermute2Float32x8
|
|
||||||
OpPermute2Float32x16
|
|
||||||
OpPermute2Float64x2
|
|
||||||
OpPermute2Float64x4
|
|
||||||
OpPermute2Float64x8
|
|
||||||
OpPermute2Int8x16
|
|
||||||
OpPermute2Int8x32
|
|
||||||
OpPermute2Int8x64
|
|
||||||
OpPermute2Int16x8
|
|
||||||
OpPermute2Int16x16
|
|
||||||
OpPermute2Int16x32
|
|
||||||
OpPermute2Int32x4
|
|
||||||
OpPermute2Int32x8
|
|
||||||
OpPermute2Int32x16
|
|
||||||
OpPermute2Int64x2
|
|
||||||
OpPermute2Int64x4
|
|
||||||
OpPermute2Int64x8
|
|
||||||
OpPermute2Uint8x16
|
|
||||||
OpPermute2Uint8x32
|
|
||||||
OpPermute2Uint8x64
|
|
||||||
OpPermute2Uint16x8
|
|
||||||
OpPermute2Uint16x16
|
|
||||||
OpPermute2Uint16x32
|
|
||||||
OpPermute2Uint32x4
|
|
||||||
OpPermute2Uint32x8
|
|
||||||
OpPermute2Uint32x16
|
|
||||||
OpPermute2Uint64x2
|
|
||||||
OpPermute2Uint64x4
|
|
||||||
OpPermute2Uint64x8
|
|
||||||
OpPermuteFloat32x8
|
OpPermuteFloat32x8
|
||||||
OpPermuteFloat32x16
|
OpPermuteFloat32x16
|
||||||
OpPermuteFloat64x4
|
OpPermuteFloat64x4
|
||||||
OpPermuteFloat64x8
|
OpPermuteFloat64x8
|
||||||
OpPermuteGroupedInt8x32
|
|
||||||
OpPermuteGroupedInt8x64
|
|
||||||
OpPermuteGroupedUint8x32
|
|
||||||
OpPermuteGroupedUint8x64
|
|
||||||
OpPermuteInt8x16
|
OpPermuteInt8x16
|
||||||
OpPermuteInt8x32
|
OpPermuteInt8x32
|
||||||
OpPermuteInt8x64
|
OpPermuteInt8x64
|
||||||
|
|
@ -6746,6 +6753,12 @@ const (
|
||||||
OpPermuteInt32x16
|
OpPermuteInt32x16
|
||||||
OpPermuteInt64x4
|
OpPermuteInt64x4
|
||||||
OpPermuteInt64x8
|
OpPermuteInt64x8
|
||||||
|
OpPermuteOrZeroGroupedInt8x32
|
||||||
|
OpPermuteOrZeroGroupedInt8x64
|
||||||
|
OpPermuteOrZeroGroupedUint8x32
|
||||||
|
OpPermuteOrZeroGroupedUint8x64
|
||||||
|
OpPermuteOrZeroInt8x16
|
||||||
|
OpPermuteOrZeroUint8x16
|
||||||
OpPermuteUint8x16
|
OpPermuteUint8x16
|
||||||
OpPermuteUint8x32
|
OpPermuteUint8x32
|
||||||
OpPermuteUint8x64
|
OpPermuteUint8x64
|
||||||
|
|
@ -7099,28 +7112,6 @@ const (
|
||||||
OpGetElemUint16x8
|
OpGetElemUint16x8
|
||||||
OpGetElemUint32x4
|
OpGetElemUint32x4
|
||||||
OpGetElemUint64x2
|
OpGetElemUint64x2
|
||||||
OpPermuteConstantGroupedInt32x8
|
|
||||||
OpPermuteConstantGroupedInt32x16
|
|
||||||
OpPermuteConstantGroupedUint32x8
|
|
||||||
OpPermuteConstantGroupedUint32x16
|
|
||||||
OpPermuteConstantHiGroupedInt16x16
|
|
||||||
OpPermuteConstantHiGroupedInt16x32
|
|
||||||
OpPermuteConstantHiGroupedUint16x16
|
|
||||||
OpPermuteConstantHiGroupedUint16x32
|
|
||||||
OpPermuteConstantHiInt16x8
|
|
||||||
OpPermuteConstantHiInt32x4
|
|
||||||
OpPermuteConstantHiUint16x8
|
|
||||||
OpPermuteConstantHiUint32x4
|
|
||||||
OpPermuteConstantInt32x4
|
|
||||||
OpPermuteConstantLoGroupedInt16x16
|
|
||||||
OpPermuteConstantLoGroupedInt16x32
|
|
||||||
OpPermuteConstantLoGroupedUint16x16
|
|
||||||
OpPermuteConstantLoGroupedUint16x32
|
|
||||||
OpPermuteConstantLoInt16x8
|
|
||||||
OpPermuteConstantLoInt32x4
|
|
||||||
OpPermuteConstantLoUint16x8
|
|
||||||
OpPermuteConstantLoUint32x4
|
|
||||||
OpPermuteConstantUint32x4
|
|
||||||
OpRotateAllLeftInt32x4
|
OpRotateAllLeftInt32x4
|
||||||
OpRotateAllLeftInt32x8
|
OpRotateAllLeftInt32x8
|
||||||
OpRotateAllLeftInt32x16
|
OpRotateAllLeftInt32x16
|
||||||
|
|
@ -7240,6 +7231,24 @@ const (
|
||||||
OpconcatSelectedConstantInt64x2
|
OpconcatSelectedConstantInt64x2
|
||||||
OpconcatSelectedConstantUint32x4
|
OpconcatSelectedConstantUint32x4
|
||||||
OpconcatSelectedConstantUint64x2
|
OpconcatSelectedConstantUint64x2
|
||||||
|
OppermuteScalarsGroupedInt32x8
|
||||||
|
OppermuteScalarsGroupedInt32x16
|
||||||
|
OppermuteScalarsGroupedUint32x8
|
||||||
|
OppermuteScalarsGroupedUint32x16
|
||||||
|
OppermuteScalarsHiGroupedInt16x16
|
||||||
|
OppermuteScalarsHiGroupedInt16x32
|
||||||
|
OppermuteScalarsHiGroupedUint16x16
|
||||||
|
OppermuteScalarsHiGroupedUint16x32
|
||||||
|
OppermuteScalarsHiInt16x8
|
||||||
|
OppermuteScalarsHiUint16x8
|
||||||
|
OppermuteScalarsInt32x4
|
||||||
|
OppermuteScalarsLoGroupedInt16x16
|
||||||
|
OppermuteScalarsLoGroupedInt16x32
|
||||||
|
OppermuteScalarsLoGroupedUint16x16
|
||||||
|
OppermuteScalarsLoGroupedUint16x32
|
||||||
|
OppermuteScalarsLoInt16x8
|
||||||
|
OppermuteScalarsLoUint16x8
|
||||||
|
OppermuteScalarsUint32x4
|
||||||
OpternInt32x4
|
OpternInt32x4
|
||||||
OpternInt32x8
|
OpternInt32x8
|
||||||
OpternInt32x16
|
OpternInt32x16
|
||||||
|
|
@ -26142,6 +26151,20 @@ var opcodeTable = [...]opInfo{
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "VPERMB128",
|
||||||
|
argLen: 2,
|
||||||
|
asm: x86.AVPERMB,
|
||||||
|
reg: regInfo{
|
||||||
|
inputs: []inputInfo{
|
||||||
|
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
outputs: []outputInfo{
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "VPERMB256",
|
name: "VPERMB256",
|
||||||
argLen: 2,
|
argLen: 2,
|
||||||
|
|
@ -26170,6 +26193,21 @@ var opcodeTable = [...]opInfo{
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "VPERMBMasked128",
|
||||||
|
argLen: 3,
|
||||||
|
asm: x86.AVPERMB,
|
||||||
|
reg: regInfo{
|
||||||
|
inputs: []inputInfo{
|
||||||
|
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
outputs: []outputInfo{
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "VPERMBMasked256",
|
name: "VPERMBMasked256",
|
||||||
argLen: 3,
|
argLen: 3,
|
||||||
|
|
@ -39744,6 +39782,93 @@ var opcodeTable = [...]opInfo{
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "VPSHUFLW128",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
asm: x86.AVPSHUFLW,
|
||||||
|
reg: regInfo{
|
||||||
|
inputs: []inputInfo{
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
outputs: []outputInfo{
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "VPSHUFLW256",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
asm: x86.AVPSHUFLW,
|
||||||
|
reg: regInfo{
|
||||||
|
inputs: []inputInfo{
|
||||||
|
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||||
|
},
|
||||||
|
outputs: []outputInfo{
|
||||||
|
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "VPSHUFLW512",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
asm: x86.AVPSHUFLW,
|
||||||
|
reg: regInfo{
|
||||||
|
inputs: []inputInfo{
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
outputs: []outputInfo{
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "VPSHUFLWMasked128",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 2,
|
||||||
|
asm: x86.AVPSHUFLW,
|
||||||
|
reg: regInfo{
|
||||||
|
inputs: []inputInfo{
|
||||||
|
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
|
||||||
|
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
outputs: []outputInfo{
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "VPSHUFLWMasked256",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 2,
|
||||||
|
asm: x86.AVPSHUFLW,
|
||||||
|
reg: regInfo{
|
||||||
|
inputs: []inputInfo{
|
||||||
|
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
|
||||||
|
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
outputs: []outputInfo{
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "VPSHUFLWMasked512",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 2,
|
||||||
|
asm: x86.AVPSHUFLW,
|
||||||
|
reg: regInfo{
|
||||||
|
inputs: []inputInfo{
|
||||||
|
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
|
||||||
|
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
outputs: []outputInfo{
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "VPSLLD128const",
|
name: "VPSLLD128const",
|
||||||
auxType: auxUInt8,
|
auxType: auxUInt8,
|
||||||
|
|
@ -57607,6 +57732,57 @@ var opcodeTable = [...]opInfo{
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "VPSHUFLWMasked128Merging",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 3,
|
||||||
|
resultInArg0: true,
|
||||||
|
asm: x86.AVPSHUFLW,
|
||||||
|
reg: regInfo{
|
||||||
|
inputs: []inputInfo{
|
||||||
|
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
outputs: []outputInfo{
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "VPSHUFLWMasked256Merging",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 3,
|
||||||
|
resultInArg0: true,
|
||||||
|
asm: x86.AVPSHUFLW,
|
||||||
|
reg: regInfo{
|
||||||
|
inputs: []inputInfo{
|
||||||
|
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
outputs: []outputInfo{
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "VPSHUFLWMasked512Merging",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 3,
|
||||||
|
resultInArg0: true,
|
||||||
|
asm: x86.AVPSHUFLW,
|
||||||
|
reg: regInfo{
|
||||||
|
inputs: []inputInfo{
|
||||||
|
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
outputs: []outputInfo{
|
||||||
|
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "VPSLLDMasked128constMerging",
|
name: "VPSLLDMasked128constMerging",
|
||||||
auxType: auxUInt8,
|
auxType: auxUInt8,
|
||||||
|
|
@ -86874,6 +87050,156 @@ var opcodeTable = [...]opInfo{
|
||||||
argLen: 2,
|
argLen: 2,
|
||||||
generic: true,
|
generic: true,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteFloat32x4",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteFloat32x8",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteFloat32x16",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteFloat64x2",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteFloat64x4",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteFloat64x8",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteInt8x16",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteInt8x32",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteInt8x64",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteInt16x8",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteInt16x16",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteInt16x32",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteInt32x4",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteInt32x8",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteInt32x16",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteInt64x2",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteInt64x4",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteInt64x8",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteUint8x16",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteUint8x32",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteUint8x64",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteUint16x8",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteUint16x16",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteUint16x32",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteUint32x4",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteUint32x8",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteUint32x16",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteUint64x2",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteUint64x4",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "ConcatPermuteUint64x8",
|
||||||
|
argLen: 3,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "ConvertToInt8Int16x8",
|
name: "ConvertToInt8Int16x8",
|
||||||
argLen: 1,
|
argLen: 1,
|
||||||
|
|
@ -89757,156 +90083,6 @@ var opcodeTable = [...]opInfo{
|
||||||
commutative: true,
|
commutative: true,
|
||||||
generic: true,
|
generic: true,
|
||||||
},
|
},
|
||||||
{
|
|
||||||
name: "Permute2Float32x4",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Float32x8",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Float32x16",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Float64x2",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Float64x4",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Float64x8",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Int8x16",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Int8x32",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Int8x64",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Int16x8",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Int16x16",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Int16x32",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Int32x4",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Int32x8",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Int32x16",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Int64x2",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Int64x4",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Int64x8",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Uint8x16",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Uint8x32",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Uint8x64",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Uint16x8",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Uint16x16",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Uint16x32",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Uint32x4",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Uint32x8",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Uint32x16",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Uint64x2",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Uint64x4",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "Permute2Uint64x8",
|
|
||||||
argLen: 3,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
name: "PermuteFloat32x8",
|
name: "PermuteFloat32x8",
|
||||||
argLen: 2,
|
argLen: 2,
|
||||||
|
|
@ -89927,26 +90103,6 @@ var opcodeTable = [...]opInfo{
|
||||||
argLen: 2,
|
argLen: 2,
|
||||||
generic: true,
|
generic: true,
|
||||||
},
|
},
|
||||||
{
|
|
||||||
name: "PermuteGroupedInt8x32",
|
|
||||||
argLen: 2,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteGroupedInt8x64",
|
|
||||||
argLen: 2,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteGroupedUint8x32",
|
|
||||||
argLen: 2,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteGroupedUint8x64",
|
|
||||||
argLen: 2,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
name: "PermuteInt8x16",
|
name: "PermuteInt8x16",
|
||||||
argLen: 2,
|
argLen: 2,
|
||||||
|
|
@ -89997,6 +90153,36 @@ var opcodeTable = [...]opInfo{
|
||||||
argLen: 2,
|
argLen: 2,
|
||||||
generic: true,
|
generic: true,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "PermuteOrZeroGroupedInt8x32",
|
||||||
|
argLen: 2,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "PermuteOrZeroGroupedInt8x64",
|
||||||
|
argLen: 2,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "PermuteOrZeroGroupedUint8x32",
|
||||||
|
argLen: 2,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "PermuteOrZeroGroupedUint8x64",
|
||||||
|
argLen: 2,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "PermuteOrZeroInt8x16",
|
||||||
|
argLen: 2,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "PermuteOrZeroUint8x16",
|
||||||
|
argLen: 2,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "PermuteUint8x16",
|
name: "PermuteUint8x16",
|
||||||
argLen: 2,
|
argLen: 2,
|
||||||
|
|
@ -91830,138 +92016,6 @@ var opcodeTable = [...]opInfo{
|
||||||
argLen: 1,
|
argLen: 1,
|
||||||
generic: true,
|
generic: true,
|
||||||
},
|
},
|
||||||
{
|
|
||||||
name: "PermuteConstantGroupedInt32x8",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantGroupedInt32x16",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantGroupedUint32x8",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantGroupedUint32x16",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantHiGroupedInt16x16",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantHiGroupedInt16x32",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantHiGroupedUint16x16",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantHiGroupedUint16x32",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantHiInt16x8",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantHiInt32x4",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantHiUint16x8",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantHiUint32x4",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantInt32x4",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantLoGroupedInt16x16",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantLoGroupedInt16x32",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantLoGroupedUint16x16",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantLoGroupedUint16x32",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantLoInt16x8",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantLoInt32x4",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantLoUint16x8",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantLoUint32x4",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "PermuteConstantUint32x4",
|
|
||||||
auxType: auxUInt8,
|
|
||||||
argLen: 1,
|
|
||||||
generic: true,
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
name: "RotateAllLeftInt32x4",
|
name: "RotateAllLeftInt32x4",
|
||||||
auxType: auxUInt8,
|
auxType: auxUInt8,
|
||||||
|
|
@ -92676,6 +92730,114 @@ var opcodeTable = [...]opInfo{
|
||||||
argLen: 2,
|
argLen: 2,
|
||||||
generic: true,
|
generic: true,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsGroupedInt32x8",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsGroupedInt32x16",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsGroupedUint32x8",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsGroupedUint32x16",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsHiGroupedInt16x16",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsHiGroupedInt16x32",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsHiGroupedUint16x16",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsHiGroupedUint16x32",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsHiInt16x8",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsHiUint16x8",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsInt32x4",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsLoGroupedInt16x16",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsLoGroupedInt16x32",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsLoGroupedUint16x16",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsLoGroupedUint16x32",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsLoInt16x8",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsLoUint16x8",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "permuteScalarsUint32x4",
|
||||||
|
auxType: auxUInt8,
|
||||||
|
argLen: 1,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "ternInt32x4",
|
name: "ternInt32x4",
|
||||||
auxType: auxUInt8,
|
auxType: auxUInt8,
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
|
|
@ -228,6 +228,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
|
||||||
addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64)
|
addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64)
|
||||||
addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64)
|
addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64)
|
||||||
addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64)
|
addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x16, types.TypeVec128), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x16, types.TypeVec128), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x32, types.TypeVec256), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x32, types.TypeVec256), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x64, types.TypeVec512), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x64, types.TypeVec512), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x8, types.TypeVec128), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x8, types.TypeVec128), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x16, types.TypeVec256), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x16, types.TypeVec256), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x32, types.TypeVec512), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x32, types.TypeVec512), sys.AMD64)
|
||||||
|
addF(simdPackage, "Float32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x4, types.TypeVec128), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x4, types.TypeVec128), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x4, types.TypeVec128), sys.AMD64)
|
||||||
|
addF(simdPackage, "Float32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x8, types.TypeVec256), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x8, types.TypeVec256), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x8, types.TypeVec256), sys.AMD64)
|
||||||
|
addF(simdPackage, "Float32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x16, types.TypeVec512), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x16, types.TypeVec512), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x16, types.TypeVec512), sys.AMD64)
|
||||||
|
addF(simdPackage, "Float64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x2, types.TypeVec128), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x2, types.TypeVec128), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x2, types.TypeVec128), sys.AMD64)
|
||||||
|
addF(simdPackage, "Float64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x4, types.TypeVec256), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x4, types.TypeVec256), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x4, types.TypeVec256), sys.AMD64)
|
||||||
|
addF(simdPackage, "Float64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x8, types.TypeVec512), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x8, types.TypeVec512), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x8, types.TypeVec512), sys.AMD64)
|
||||||
addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
|
addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
|
||||||
addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
|
addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
|
||||||
addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
|
addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
|
||||||
|
|
@ -802,8 +832,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
|
||||||
addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64)
|
addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64)
|
||||||
addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64)
|
addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64)
|
||||||
addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64)
|
addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64)
|
||||||
addF(simdPackage, "Int8x16.Permute", opLen2(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
|
addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
|
||||||
addF(simdPackage, "Uint8x16.Permute", opLen2(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
|
addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
|
||||||
addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
|
addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
|
||||||
addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64)
|
addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64)
|
||||||
addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64)
|
addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64)
|
||||||
|
|
@ -826,62 +856,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
|
||||||
addF(simdPackage, "Float64x8.Permute", opLen2_21(ssa.OpPermuteFloat64x8, types.TypeVec512), sys.AMD64)
|
addF(simdPackage, "Float64x8.Permute", opLen2_21(ssa.OpPermuteFloat64x8, types.TypeVec512), sys.AMD64)
|
||||||
addF(simdPackage, "Int64x8.Permute", opLen2_21(ssa.OpPermuteInt64x8, types.TypeVec512), sys.AMD64)
|
addF(simdPackage, "Int64x8.Permute", opLen2_21(ssa.OpPermuteInt64x8, types.TypeVec512), sys.AMD64)
|
||||||
addF(simdPackage, "Uint64x8.Permute", opLen2_21(ssa.OpPermuteUint64x8, types.TypeVec512), sys.AMD64)
|
addF(simdPackage, "Uint64x8.Permute", opLen2_21(ssa.OpPermuteUint64x8, types.TypeVec512), sys.AMD64)
|
||||||
addF(simdPackage, "Int8x16.Permute2", opLen3_231(ssa.OpPermute2Int8x16, types.TypeVec128), sys.AMD64)
|
addF(simdPackage, "Int8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroInt8x16, types.TypeVec128), sys.AMD64)
|
||||||
addF(simdPackage, "Uint8x16.Permute2", opLen3_231(ssa.OpPermute2Uint8x16, types.TypeVec128), sys.AMD64)
|
addF(simdPackage, "Uint8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroUint8x16, types.TypeVec128), sys.AMD64)
|
||||||
addF(simdPackage, "Int8x32.Permute2", opLen3_231(ssa.OpPermute2Int8x32, types.TypeVec256), sys.AMD64)
|
addF(simdPackage, "Int8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x32, types.TypeVec256), sys.AMD64)
|
||||||
addF(simdPackage, "Uint8x32.Permute2", opLen3_231(ssa.OpPermute2Uint8x32, types.TypeVec256), sys.AMD64)
|
addF(simdPackage, "Int8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x64, types.TypeVec512), sys.AMD64)
|
||||||
addF(simdPackage, "Int8x64.Permute2", opLen3_231(ssa.OpPermute2Int8x64, types.TypeVec512), sys.AMD64)
|
addF(simdPackage, "Uint8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x32, types.TypeVec256), sys.AMD64)
|
||||||
addF(simdPackage, "Uint8x64.Permute2", opLen3_231(ssa.OpPermute2Uint8x64, types.TypeVec512), sys.AMD64)
|
addF(simdPackage, "Uint8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x64, types.TypeVec512), sys.AMD64)
|
||||||
addF(simdPackage, "Int16x8.Permute2", opLen3_231(ssa.OpPermute2Int16x8, types.TypeVec128), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint16x8.Permute2", opLen3_231(ssa.OpPermute2Uint16x8, types.TypeVec128), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int16x16.Permute2", opLen3_231(ssa.OpPermute2Int16x16, types.TypeVec256), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint16x16.Permute2", opLen3_231(ssa.OpPermute2Uint16x16, types.TypeVec256), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int16x32.Permute2", opLen3_231(ssa.OpPermute2Int16x32, types.TypeVec512), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint16x32.Permute2", opLen3_231(ssa.OpPermute2Uint16x32, types.TypeVec512), sys.AMD64)
|
|
||||||
addF(simdPackage, "Float32x4.Permute2", opLen3_231(ssa.OpPermute2Float32x4, types.TypeVec128), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int32x4.Permute2", opLen3_231(ssa.OpPermute2Int32x4, types.TypeVec128), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint32x4.Permute2", opLen3_231(ssa.OpPermute2Uint32x4, types.TypeVec128), sys.AMD64)
|
|
||||||
addF(simdPackage, "Float32x8.Permute2", opLen3_231(ssa.OpPermute2Float32x8, types.TypeVec256), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int32x8.Permute2", opLen3_231(ssa.OpPermute2Int32x8, types.TypeVec256), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint32x8.Permute2", opLen3_231(ssa.OpPermute2Uint32x8, types.TypeVec256), sys.AMD64)
|
|
||||||
addF(simdPackage, "Float32x16.Permute2", opLen3_231(ssa.OpPermute2Float32x16, types.TypeVec512), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int32x16.Permute2", opLen3_231(ssa.OpPermute2Int32x16, types.TypeVec512), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint32x16.Permute2", opLen3_231(ssa.OpPermute2Uint32x16, types.TypeVec512), sys.AMD64)
|
|
||||||
addF(simdPackage, "Float64x2.Permute2", opLen3_231(ssa.OpPermute2Float64x2, types.TypeVec128), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int64x2.Permute2", opLen3_231(ssa.OpPermute2Int64x2, types.TypeVec128), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint64x2.Permute2", opLen3_231(ssa.OpPermute2Uint64x2, types.TypeVec128), sys.AMD64)
|
|
||||||
addF(simdPackage, "Float64x4.Permute2", opLen3_231(ssa.OpPermute2Float64x4, types.TypeVec256), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int64x4.Permute2", opLen3_231(ssa.OpPermute2Int64x4, types.TypeVec256), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint64x4.Permute2", opLen3_231(ssa.OpPermute2Uint64x4, types.TypeVec256), sys.AMD64)
|
|
||||||
addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantInt32x4, types.TypeVec128, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantUint32x4, types.TypeVec128, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt16x8, types.TypeVec128, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt32x4, types.TypeVec128, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint16x8, types.TypeVec128, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint32x4, types.TypeVec128, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt16x8, types.TypeVec128, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt32x4, types.TypeVec128, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint16x8, types.TypeVec128, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint32x4, types.TypeVec128, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x32, types.TypeVec256), sys.AMD64)
|
|
||||||
addF(simdPackage, "Int8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x64, types.TypeVec512), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x32, types.TypeVec256), sys.AMD64)
|
|
||||||
addF(simdPackage, "Uint8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x64, types.TypeVec512), sys.AMD64)
|
|
||||||
addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64)
|
addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64)
|
||||||
addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64)
|
addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64)
|
||||||
addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64)
|
addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64)
|
||||||
|
|
@ -1300,6 +1280,24 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
|
||||||
addF(simdPackage, "Uint32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
|
addF(simdPackage, "Uint32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
|
||||||
addF(simdPackage, "Uint64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, 0), sys.AMD64)
|
addF(simdPackage, "Uint64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, 0), sys.AMD64)
|
||||||
addF(simdPackage, "Uint64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, 0), sys.AMD64)
|
addF(simdPackage, "Uint64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsInt32x4, types.TypeVec128, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsUint32x4, types.TypeVec128, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiInt16x8, types.TypeVec128, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiUint16x8, types.TypeVec128, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoInt16x8, types.TypeVec128, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoUint16x8, types.TypeVec128, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Int16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
|
||||||
|
addF(simdPackage, "Uint16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
|
||||||
addF(simdPackage, "Int32x4.tern", opLen3Imm8(ssa.OpternInt32x4, types.TypeVec128, 0), sys.AMD64)
|
addF(simdPackage, "Int32x4.tern", opLen3Imm8(ssa.OpternInt32x4, types.TypeVec128, 0), sys.AMD64)
|
||||||
addF(simdPackage, "Int32x8.tern", opLen3Imm8(ssa.OpternInt32x8, types.TypeVec256, 0), sys.AMD64)
|
addF(simdPackage, "Int32x8.tern", opLen3Imm8(ssa.OpternInt32x8, types.TypeVec256, 0), sys.AMD64)
|
||||||
addF(simdPackage, "Int32x16.tern", opLen3Imm8(ssa.OpternInt32x16, types.TypeVec512, 0), sys.AMD64)
|
addF(simdPackage, "Int32x16.tern", opLen3Imm8(ssa.OpternInt32x16, types.TypeVec512, 0), sys.AMD64)
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,9 @@ func writeSIMDGenericOps(ops []Operation) *bytes.Buffer {
|
||||||
if op.NoGenericOps != nil && *op.NoGenericOps == "true" {
|
if op.NoGenericOps != nil && *op.NoGenericOps == "true" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if op.SkipMaskedMethod() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
_, _, _, immType, gOp := op.shape()
|
_, _, _, immType, gOp := op.shape()
|
||||||
gOpData := genericOpsData{gOp.GenericName(), len(gOp.In), op.Commutative}
|
gOpData := genericOpsData{gOp.GenericName(), len(gOp.In), op.Commutative}
|
||||||
if immType == VarImm || immType == ConstVarImm {
|
if immType == VarImm || immType == ConstVarImm {
|
||||||
|
|
|
||||||
|
|
@ -107,6 +107,9 @@ func writeSIMDIntrinsics(ops []Operation, typeMap simdTypeMap) *bytes.Buffer {
|
||||||
if op.NoTypes != nil && *op.NoTypes == "true" {
|
if op.NoTypes != nil && *op.NoTypes == "true" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if op.SkipMaskedMethod() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if s, op, err := classifyOp(op); err == nil {
|
if s, op, err := classifyOp(op); err == nil {
|
||||||
if err := t.ExecuteTemplate(buffer, s, op); err != nil {
|
if err := t.ExecuteTemplate(buffer, s, op); err != nil {
|
||||||
panic(fmt.Errorf("failed to execute template %s for op %s: %w", s, op.Go, err))
|
panic(fmt.Errorf("failed to execute template %s for op %s: %w", s, op.Go, err))
|
||||||
|
|
|
||||||
|
|
@ -604,6 +604,9 @@ func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) (f, fI *bytes.Buffer)
|
||||||
if op.NoTypes != nil && *op.NoTypes == "true" {
|
if op.NoTypes != nil && *op.NoTypes == "true" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if op.SkipMaskedMethod() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
idxVecAsScalar, err := checkVecAsScalar(op)
|
idxVecAsScalar, err := checkVecAsScalar(op)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
|
|
|
||||||
|
|
@ -345,7 +345,8 @@ func writeSIMDRules(ops []Operation) *bytes.Buffer {
|
||||||
data.ArgsOut = "..."
|
data.ArgsOut = "..."
|
||||||
}
|
}
|
||||||
data.tplName = tplName
|
data.tplName = tplName
|
||||||
if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" {
|
if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" ||
|
||||||
|
opr.SkipMaskedMethod() {
|
||||||
optData = append(optData, data)
|
optData = append(optData, data)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -73,6 +73,29 @@ type rawOperation struct {
|
||||||
NoGenericOps *string
|
NoGenericOps *string
|
||||||
// If non-nil, this string will be attached to the machine ssa op name. E.g. "const"
|
// If non-nil, this string will be attached to the machine ssa op name. E.g. "const"
|
||||||
SSAVariant *string
|
SSAVariant *string
|
||||||
|
// If true, do not emit method declarations, generic ops, or intrinsics for masked variants
|
||||||
|
// DO emit the architecture-specific opcodes and optimizations.
|
||||||
|
HideMaskMethods *bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Operation) IsMasked() bool {
|
||||||
|
if len(o.InVariant) == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
panic(fmt.Errorf("unknown inVariant"))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *Operation) SkipMaskedMethod() bool {
|
||||||
|
if o.HideMaskMethods == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if *o.HideMaskMethods && o.IsMasked() {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func (o *Operation) DecodeUnified(v *unify.Value) error {
|
func (o *Operation) DecodeUnified(v *unify.Value) error {
|
||||||
|
|
@ -80,14 +103,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
isMasked := false
|
isMasked := o.IsMasked()
|
||||||
if len(o.InVariant) == 0 {
|
|
||||||
// No variant
|
|
||||||
} else if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" {
|
|
||||||
isMasked = true
|
|
||||||
} else {
|
|
||||||
return fmt.Errorf("unknown inVariant")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute full Go method name.
|
// Compute full Go method name.
|
||||||
o.Go = o.rawOperation.Go
|
o.Go = o.rawOperation.Go
|
||||||
|
|
@ -104,6 +120,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {
|
||||||
o.Documentation = regexp.MustCompile(`\bNAME\b`).ReplaceAllString(o.Documentation, o.Go)
|
o.Documentation = regexp.MustCompile(`\bNAME\b`).ReplaceAllString(o.Documentation, o.Go)
|
||||||
if isMasked {
|
if isMasked {
|
||||||
o.Documentation += "\n//\n// This operation is applied selectively under a write mask."
|
o.Documentation += "\n//\n// This operation is applied selectively under a write mask."
|
||||||
|
// Suppress generic op and method declaration for exported methods, if a mask is present.
|
||||||
if unicode.IsUpper([]rune(o.Go)[0]) {
|
if unicode.IsUpper([]rune(o.Go)[0]) {
|
||||||
trueVal := "true"
|
trueVal := "true"
|
||||||
o.NoGenericOps = &trueVal
|
o.NoGenericOps = &trueVal
|
||||||
|
|
|
||||||
|
|
@ -27,18 +27,22 @@
|
||||||
constImm: 1
|
constImm: 1
|
||||||
documentation: !string |-
|
documentation: !string |-
|
||||||
// NAME returns the upper half of x.
|
// NAME returns the upper half of x.
|
||||||
|
- go: PermuteOrZero
|
||||||
|
commutative: false
|
||||||
|
documentation: !string |-
|
||||||
|
// NAME performs a full permutation of vector x using indices:
|
||||||
|
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
|
||||||
- go: Permute
|
- go: Permute
|
||||||
commutative: false
|
commutative: false
|
||||||
documentation: !string |-
|
documentation: !string |-
|
||||||
// NAME performs a full permutation of vector x using indices:
|
// NAME performs a full permutation of vector x using indices:
|
||||||
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
|
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
|
||||||
// Only the needed bits to represent x's index are used in indices' elements.
|
- go: ConcatPermute # ConcatPermute is only available on or after AVX512
|
||||||
- go: Permute2 # Permute2 is only available on or after AVX512
|
|
||||||
commutative: false
|
commutative: false
|
||||||
documentation: !string |-
|
documentation: !string |-
|
||||||
// NAME performs a full permutation of vector x, y using indices:
|
// NAME performs a full permutation of vector x, y using indices:
|
||||||
// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
|
// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
|
||||||
// where xy is x appending y.
|
// where xy is the concatenation of x (lower half) and y (upper half).
|
||||||
// Only the needed bits to represent xy's index are used in indices' elements.
|
// Only the needed bits to represent xy's index are used in indices' elements.
|
||||||
- go: Compress
|
- go: Compress
|
||||||
commutative: false
|
commutative: false
|
||||||
|
|
@ -74,31 +78,35 @@
|
||||||
documentation: !string |-
|
documentation: !string |-
|
||||||
// NAME copies element zero of its (128-bit) input to all elements of
|
// NAME copies element zero of its (128-bit) input to all elements of
|
||||||
// the 512-bit output vector.
|
// the 512-bit output vector.
|
||||||
|
- go: PermuteOrZeroGrouped
|
||||||
|
commutative: false
|
||||||
|
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
||||||
|
// NAME performs a grouped permutation of vector x using indices:
|
||||||
- go: PermuteGrouped
|
- go: PermuteGrouped
|
||||||
commutative: false
|
commutative: false
|
||||||
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
||||||
// NAME performs a grouped permutation of vector x using indices:
|
// NAME performs a grouped permutation of vector x using indices:
|
||||||
- go: PermuteConstant
|
- go: permuteScalars
|
||||||
commutative: false
|
commutative: false
|
||||||
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
||||||
// NAME performs a permutation of vector x using constant indices:
|
// NAME performs a permutation of vector x using constant indices:
|
||||||
- go: PermuteConstantGrouped
|
- go: permuteScalarsGrouped
|
||||||
commutative: false
|
commutative: false
|
||||||
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
||||||
// NAME performs a grouped permutation of vector x using constant indices:
|
// NAME performs a grouped permutation of vector x using constant indices:
|
||||||
- go: PermuteConstantLo
|
- go: permuteScalarsLo
|
||||||
commutative: false
|
commutative: false
|
||||||
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
||||||
// NAME performs a permutation of vector x using constant indices:
|
// NAME performs a permutation of vector x using constant indices:
|
||||||
- go: PermuteConstantLoGrouped
|
- go: permuteScalarsLoGrouped
|
||||||
commutative: false
|
commutative: false
|
||||||
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
||||||
// NAME performs a grouped permutation of vector x using constant indices:
|
// NAME performs a grouped permutation of vector x using constant indices:
|
||||||
- go: PermuteConstantHi
|
- go: permuteScalarsHi
|
||||||
commutative: false
|
commutative: false
|
||||||
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
||||||
// NAME performs a permutation of vector x using constant indices:
|
// NAME performs a permutation of vector x using constant indices:
|
||||||
- go: PermuteConstantHiGrouped
|
- go: permuteScalarsHiGrouped
|
||||||
commutative: false
|
commutative: false
|
||||||
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
documentation: !string |- # Detailed documentation will rely on the specific ops.
|
||||||
// NAME performs a grouped permutation of vector x using constant indices:
|
// NAME performs a grouped permutation of vector x using constant indices:
|
||||||
|
|
@ -218,8 +226,10 @@
|
||||||
- go: Select128FromPair
|
- go: Select128FromPair
|
||||||
commutative: false
|
commutative: false
|
||||||
documentation: !string |-
|
documentation: !string |-
|
||||||
// NAME selects the low and high 128-bit halves from the 128-bit halves
|
// NAME treats the 256-bit vectors x and y as a single vector of four
|
||||||
// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
|
// 128-bit elements, and returns a 256-bit result formed by
|
||||||
|
// concatenating the two elements specified by lo and hi.
|
||||||
|
// For example, {4,5}.NAME(3,0,{6,7}) returns {7,4}.
|
||||||
|
|
||||||
- go: ConcatShiftBytesRight
|
- go: ConcatShiftBytesRight
|
||||||
commutative: false
|
commutative: false
|
||||||
|
|
|
||||||
|
|
@ -213,19 +213,75 @@
|
||||||
- *f64xN
|
- *f64xN
|
||||||
|
|
||||||
- go: Permute
|
- go: Permute
|
||||||
asm: "VPERM[BWDQ]|VPERMP[SD]"
|
asm: "VPERMQ|VPERMPD"
|
||||||
|
addDoc: !string |-
|
||||||
|
// The low 2 bits (values 0-3) of each element of indices is used
|
||||||
operandOrder: "21Type1"
|
operandOrder: "21Type1"
|
||||||
in:
|
in:
|
||||||
- &anyindices
|
- &anyindices
|
||||||
go: $t
|
go: $t
|
||||||
name: indices
|
name: indices
|
||||||
overwriteBase: uint
|
overwriteBase: uint
|
||||||
|
- &any4
|
||||||
|
go: $t
|
||||||
|
lanes: 4
|
||||||
|
out:
|
||||||
- &any
|
- &any
|
||||||
go: $t
|
go: $t
|
||||||
|
|
||||||
|
- go: Permute
|
||||||
|
asm: "VPERM[WDQ]|VPERMP[SD]"
|
||||||
|
addDoc: !string |-
|
||||||
|
// The low 3 bits (values 0-7) of each element of indices is used
|
||||||
|
operandOrder: "21Type1"
|
||||||
|
in:
|
||||||
|
- *anyindices
|
||||||
|
- &any8
|
||||||
|
go: $t
|
||||||
|
lanes: 8
|
||||||
out:
|
out:
|
||||||
- *any
|
- *any
|
||||||
|
|
||||||
- go: Permute2
|
- go: Permute
|
||||||
|
asm: "VPERM[BWD]|VPERMPS"
|
||||||
|
addDoc: !string |-
|
||||||
|
// The low 4 bits (values 0-15) of each element of indices is used
|
||||||
|
operandOrder: "21Type1"
|
||||||
|
in:
|
||||||
|
- *anyindices
|
||||||
|
- &any16
|
||||||
|
go: $t
|
||||||
|
lanes: 16
|
||||||
|
out:
|
||||||
|
- *any
|
||||||
|
|
||||||
|
- go: Permute
|
||||||
|
asm: "VPERM[BW]"
|
||||||
|
addDoc: !string |-
|
||||||
|
// The low 5 bits (values 0-31) of each element of indices is used
|
||||||
|
operandOrder: "21Type1"
|
||||||
|
in:
|
||||||
|
- *anyindices
|
||||||
|
- &any32
|
||||||
|
go: $t
|
||||||
|
lanes: 32
|
||||||
|
out:
|
||||||
|
- *any
|
||||||
|
|
||||||
|
- go: Permute
|
||||||
|
asm: "VPERMB"
|
||||||
|
addDoc: !string |-
|
||||||
|
// The low 6 bits (values 0-63) of each element of indices is used
|
||||||
|
operandOrder: "21Type1"
|
||||||
|
in:
|
||||||
|
- *anyindices
|
||||||
|
- &any64
|
||||||
|
go: $t
|
||||||
|
lanes: 64
|
||||||
|
out:
|
||||||
|
- *any
|
||||||
|
|
||||||
|
- go: ConcatPermute
|
||||||
asm: "VPERMI2[BWDQ]|VPERMI2P[SD]"
|
asm: "VPERMI2[BWDQ]|VPERMI2P[SD]"
|
||||||
# Because we are overwriting the receiver's type, we
|
# Because we are overwriting the receiver's type, we
|
||||||
# have to move the receiver to be a parameter so that
|
# have to move the receiver to be a parameter so that
|
||||||
|
|
@ -403,113 +459,137 @@
|
||||||
base: $b
|
base: $b
|
||||||
|
|
||||||
# VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX)
|
# VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX)
|
||||||
- go: Permute
|
- go: PermuteOrZero
|
||||||
asm: VPSHUFB
|
asm: VPSHUFB
|
||||||
addDoc: !string |-
|
addDoc: !string |-
|
||||||
// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
|
// The lower four bits of each byte-sized index in indices select an element from x,
|
||||||
|
// unless the index's sign bit is set in which case zero is used instead.
|
||||||
in:
|
in:
|
||||||
- &128any
|
- &128any
|
||||||
bits: 128
|
bits: 128
|
||||||
go: $t
|
go: $t
|
||||||
- bits: 128
|
- bits: 128
|
||||||
go: $t
|
|
||||||
name: indices
|
name: indices
|
||||||
|
base: int # always signed
|
||||||
out:
|
out:
|
||||||
- *128any
|
- *128any
|
||||||
- go: PermuteGrouped
|
|
||||||
|
- go: PermuteOrZeroGrouped
|
||||||
asm: VPSHUFB
|
asm: VPSHUFB
|
||||||
addDoc: !string |-
|
addDoc: !string |-
|
||||||
// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
|
// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
|
||||||
// Only the needed bits to represent the index of a group of x are used in indices' elements.
|
// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
|
||||||
// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
|
// unless the index's sign bit is set in which case zero is used instead.
|
||||||
// Each group is of size 128-bit.
|
// Each group is of size 128-bit.
|
||||||
in:
|
in:
|
||||||
- &256Or512any
|
- &256Or512any
|
||||||
bits: "256|512"
|
bits: "256|512"
|
||||||
go: $t
|
go: $t
|
||||||
- bits: "256|512"
|
- bits: "256|512"
|
||||||
|
base: int
|
||||||
|
name: indices
|
||||||
|
out:
|
||||||
|
- *256Or512any
|
||||||
|
|
||||||
|
- go: permuteScalars
|
||||||
|
asm: VPSHUFD
|
||||||
|
addDoc: !string |-
|
||||||
|
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
in:
|
||||||
|
- *128any
|
||||||
|
- class: immediate
|
||||||
|
immOffset: 0
|
||||||
|
name: indices
|
||||||
|
hideMaskMethods: true
|
||||||
|
out:
|
||||||
|
- *128any
|
||||||
|
|
||||||
|
- go: permuteScalarsGrouped
|
||||||
|
asm: VPSHUFD
|
||||||
|
addDoc: !string |-
|
||||||
|
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
in:
|
||||||
|
- *256Or512any
|
||||||
|
- class: immediate
|
||||||
|
immOffset: 0
|
||||||
|
name: indices
|
||||||
|
hideMaskMethods: true
|
||||||
|
out:
|
||||||
|
- *256Or512any
|
||||||
|
|
||||||
|
- go: permuteScalarsLo
|
||||||
|
asm: VPSHUFLW
|
||||||
|
addDoc: !string |-
|
||||||
|
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
in:
|
||||||
|
- &128lanes8
|
||||||
|
bits: 128
|
||||||
go: $t
|
go: $t
|
||||||
name: indices
|
elemBits: 16
|
||||||
out:
|
|
||||||
- *256Or512any
|
|
||||||
|
|
||||||
- go: PermuteConstant
|
|
||||||
asm: VPSHUFD
|
|
||||||
addDoc: !string |-
|
|
||||||
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
|
|
||||||
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
|
|
||||||
in:
|
|
||||||
- *128any
|
|
||||||
- class: immediate
|
- class: immediate
|
||||||
immOffset: 0
|
immOffset: 0
|
||||||
name: indices
|
name: indices
|
||||||
|
hideMaskMethods: true
|
||||||
out:
|
out:
|
||||||
- *128any
|
- *128lanes8
|
||||||
- go: PermuteConstantGrouped
|
|
||||||
asm: VPSHUFD
|
- go: permuteScalarsLoGrouped
|
||||||
|
asm: VPSHUFLW
|
||||||
addDoc: !string |-
|
addDoc: !string |-
|
||||||
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
|
//
|
||||||
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
|
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
|
||||||
|
// x_group1[indices[0:2]], ...}
|
||||||
|
//
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
// Each group is of size 128-bit.
|
// Each group is of size 128-bit.
|
||||||
in:
|
in:
|
||||||
- *256Or512any
|
- &256Or512lanes8
|
||||||
|
bits: "256|512"
|
||||||
|
go: $t
|
||||||
|
elemBits: 16
|
||||||
- class: immediate
|
- class: immediate
|
||||||
immOffset: 0
|
immOffset: 0
|
||||||
name: indices
|
name: indices
|
||||||
|
hideMaskMethods: true
|
||||||
out:
|
out:
|
||||||
- *256Or512any
|
- *256Or512lanes8
|
||||||
|
|
||||||
- go: PermuteConstantLo
|
- go: permuteScalarsHi
|
||||||
asm: VPSHUFHW
|
asm: VPSHUFHW
|
||||||
addDoc: !string |-
|
addDoc: !string |-
|
||||||
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
|
// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
|
||||||
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
in:
|
in:
|
||||||
- *128any
|
- *128lanes8
|
||||||
- class: immediate
|
- class: immediate
|
||||||
immOffset: 0
|
immOffset: 0
|
||||||
name: indices
|
name: indices
|
||||||
|
hideMaskMethods: true
|
||||||
out:
|
out:
|
||||||
- *128any
|
- *128lanes8
|
||||||
- go: PermuteConstantLoGrouped
|
|
||||||
|
- go: permuteScalarsHiGrouped
|
||||||
asm: VPSHUFHW
|
asm: VPSHUFHW
|
||||||
addDoc: !string |-
|
addDoc: !string |-
|
||||||
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
|
// result =
|
||||||
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
|
//
|
||||||
|
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
|
||||||
|
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
|
||||||
|
//
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
// Each group is of size 128-bit.
|
// Each group is of size 128-bit.
|
||||||
in:
|
in:
|
||||||
- *256Or512any
|
- *256Or512lanes8
|
||||||
- class: immediate
|
- class: immediate
|
||||||
immOffset: 0
|
immOffset: 0
|
||||||
name: indices
|
name: indices
|
||||||
|
hideMaskMethods: true
|
||||||
out:
|
out:
|
||||||
- *256Or512any
|
- *256Or512lanes8
|
||||||
|
|
||||||
- go: PermuteConstantHi
|
|
||||||
asm: VPSHUFHW
|
|
||||||
addDoc: !string |-
|
|
||||||
// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
|
|
||||||
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
|
|
||||||
in:
|
|
||||||
- *128any
|
|
||||||
- class: immediate
|
|
||||||
immOffset: 0
|
|
||||||
name: indices
|
|
||||||
out:
|
|
||||||
- *128any
|
|
||||||
- go: PermuteConstantHiGrouped
|
|
||||||
asm: VPSHUFHW
|
|
||||||
addDoc: !string |-
|
|
||||||
// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
|
|
||||||
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
|
|
||||||
// Each group is of size 128-bit.
|
|
||||||
in:
|
|
||||||
- *256Or512any
|
|
||||||
- class: immediate
|
|
||||||
immOffset: 0
|
|
||||||
name: indices
|
|
||||||
out:
|
|
||||||
- *256Or512any
|
|
||||||
|
|
||||||
- go: InterleaveHi
|
- go: InterleaveHi
|
||||||
asm: VPUNPCKH(QDQ|DQ|WD|WB)
|
asm: VPUNPCKH(QDQ|DQ|WD|WB)
|
||||||
|
|
|
||||||
|
|
@ -163,7 +163,20 @@ func TestPermute(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestPermute2(t *testing.T) {
|
func TestPermuteOrZero(t *testing.T) {
|
||||||
|
x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
|
||||||
|
indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11}
|
||||||
|
want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12}
|
||||||
|
got := make([]uint8, len(x))
|
||||||
|
simd.LoadUint8x16Slice(x).PermuteOrZero(simd.LoadInt8x16Slice(indices)).StoreSlice(got)
|
||||||
|
for i := range 8 {
|
||||||
|
if want[i] != got[i] {
|
||||||
|
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestConcatPermute(t *testing.T) {
|
||||||
if !simd.X86.AVX512() {
|
if !simd.X86.AVX512() {
|
||||||
t.Skip("Test requires X86.AVX512, not available on this hardware")
|
t.Skip("Test requires X86.AVX512, not available on this hardware")
|
||||||
return
|
return
|
||||||
|
|
@ -173,7 +186,7 @@ func TestPermute2(t *testing.T) {
|
||||||
indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
|
indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
|
||||||
want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
|
want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
|
||||||
got := make([]int64, 8)
|
got := make([]int64, 8)
|
||||||
simd.LoadInt64x8Slice(x).Permute2(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got)
|
simd.LoadInt64x8Slice(x).ConcatPermute(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got)
|
||||||
for i := range 8 {
|
for i := range 8 {
|
||||||
if want[i] != got[i] {
|
if want[i] != got[i] {
|
||||||
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
|
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
|
||||||
|
|
@ -1161,3 +1174,75 @@ func TestDotProductQuadruple(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestPermuteScalars(t *testing.T) {
|
||||||
|
x := []int32{11, 12, 13, 14}
|
||||||
|
want := []int32{12, 13, 14, 11}
|
||||||
|
got := make([]int32, 4)
|
||||||
|
simd.LoadInt32x4Slice(x).PermuteScalars(1, 2, 3, 0).StoreSlice(got)
|
||||||
|
for i := range 4 {
|
||||||
|
if want[i] != got[i] {
|
||||||
|
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPermuteScalarsGrouped(t *testing.T) {
|
||||||
|
x := []int32{11, 12, 13, 14, 21, 22, 23, 24}
|
||||||
|
want := []int32{12, 13, 14, 11, 22, 23, 24, 21}
|
||||||
|
got := make([]int32, 8)
|
||||||
|
simd.LoadInt32x8Slice(x).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(got)
|
||||||
|
for i := range 8 {
|
||||||
|
if want[i] != got[i] {
|
||||||
|
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPermuteScalarsHi(t *testing.T) {
|
||||||
|
x := []int16{-1, -2, -3, -4, 11, 12, 13, 14}
|
||||||
|
want := []int16{-1, -2, -3, -4, 12, 13, 14, 11}
|
||||||
|
got := make([]int16, len(x))
|
||||||
|
simd.LoadInt16x8Slice(x).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(got)
|
||||||
|
for i := range got {
|
||||||
|
if want[i] != got[i] {
|
||||||
|
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPermuteScalarsLo(t *testing.T) {
|
||||||
|
x := []int16{11, 12, 13, 14, 4, 5, 6, 7}
|
||||||
|
want := []int16{12, 13, 14, 11, 4, 5, 6, 7}
|
||||||
|
got := make([]int16, len(x))
|
||||||
|
simd.LoadInt16x8Slice(x).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(got)
|
||||||
|
for i := range got {
|
||||||
|
if want[i] != got[i] {
|
||||||
|
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPermuteScalarsHiGrouped(t *testing.T) {
|
||||||
|
x := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114}
|
||||||
|
want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111}
|
||||||
|
got := make([]int16, len(x))
|
||||||
|
simd.LoadInt16x16Slice(x).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(got)
|
||||||
|
for i := range got {
|
||||||
|
if want[i] != got[i] {
|
||||||
|
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPermuteScalarsLoGrouped(t *testing.T) {
|
||||||
|
x := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17}
|
||||||
|
want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17}
|
||||||
|
got := make([]int16, len(x))
|
||||||
|
simd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got)
|
||||||
|
for i := range got {
|
||||||
|
if want[i] != got[i] {
|
||||||
|
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
|
|
@ -338,6 +338,220 @@ func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x
|
||||||
// Asm: VSHUFPD, CPU Feature: AVX512
|
// Asm: VSHUFPD, CPU Feature: AVX512
|
||||||
func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8
|
func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8
|
||||||
|
|
||||||
|
/* permuteScalars */
|
||||||
|
|
||||||
|
// permuteScalars performs a permutation of vector x using constant indices:
|
||||||
|
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFD, CPU Feature: AVX
|
||||||
|
func (x Int32x4) permuteScalars(indices uint8) Int32x4
|
||||||
|
|
||||||
|
// permuteScalars performs a permutation of vector x using constant indices:
|
||||||
|
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFD, CPU Feature: AVX
|
||||||
|
func (x Uint32x4) permuteScalars(indices uint8) Uint32x4
|
||||||
|
|
||||||
|
/* permuteScalarsGrouped */
|
||||||
|
|
||||||
|
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
|
||||||
|
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFD, CPU Feature: AVX2
|
||||||
|
func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8
|
||||||
|
|
||||||
|
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
|
||||||
|
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFD, CPU Feature: AVX512
|
||||||
|
func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16
|
||||||
|
|
||||||
|
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
|
||||||
|
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFD, CPU Feature: AVX2
|
||||||
|
func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8
|
||||||
|
|
||||||
|
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
|
||||||
|
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFD, CPU Feature: AVX512
|
||||||
|
func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16
|
||||||
|
|
||||||
|
/* permuteScalarsHi */
|
||||||
|
|
||||||
|
// permuteScalarsHi performs a permutation of vector x using constant indices:
|
||||||
|
// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFHW, CPU Feature: AVX512
|
||||||
|
func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8
|
||||||
|
|
||||||
|
// permuteScalarsHi performs a permutation of vector x using constant indices:
|
||||||
|
// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFHW, CPU Feature: AVX512
|
||||||
|
func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8
|
||||||
|
|
||||||
|
/* permuteScalarsHiGrouped */
|
||||||
|
|
||||||
|
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
|
||||||
|
// result =
|
||||||
|
//
|
||||||
|
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
|
||||||
|
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
|
||||||
|
//
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFHW, CPU Feature: AVX2
|
||||||
|
func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16
|
||||||
|
|
||||||
|
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
|
||||||
|
// result =
|
||||||
|
//
|
||||||
|
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
|
||||||
|
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
|
||||||
|
//
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFHW, CPU Feature: AVX512
|
||||||
|
func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32
|
||||||
|
|
||||||
|
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
|
||||||
|
// result =
|
||||||
|
//
|
||||||
|
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
|
||||||
|
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
|
||||||
|
//
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFHW, CPU Feature: AVX2
|
||||||
|
func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16
|
||||||
|
|
||||||
|
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
|
||||||
|
// result =
|
||||||
|
//
|
||||||
|
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
|
||||||
|
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
|
||||||
|
//
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFHW, CPU Feature: AVX512
|
||||||
|
func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32
|
||||||
|
|
||||||
|
/* permuteScalarsLo */
|
||||||
|
|
||||||
|
// permuteScalarsLo performs a permutation of vector x using constant indices:
|
||||||
|
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFLW, CPU Feature: AVX512
|
||||||
|
func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8
|
||||||
|
|
||||||
|
// permuteScalarsLo performs a permutation of vector x using constant indices:
|
||||||
|
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFLW, CPU Feature: AVX512
|
||||||
|
func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8
|
||||||
|
|
||||||
|
/* permuteScalarsLoGrouped */
|
||||||
|
|
||||||
|
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
|
||||||
|
//
|
||||||
|
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
|
||||||
|
// x_group1[indices[0:2]], ...}
|
||||||
|
//
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFLW, CPU Feature: AVX2
|
||||||
|
func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16
|
||||||
|
|
||||||
|
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
|
||||||
|
//
|
||||||
|
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
|
||||||
|
// x_group1[indices[0:2]], ...}
|
||||||
|
//
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFLW, CPU Feature: AVX512
|
||||||
|
func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32
|
||||||
|
|
||||||
|
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
|
||||||
|
//
|
||||||
|
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
|
||||||
|
// x_group1[indices[0:2]], ...}
|
||||||
|
//
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFLW, CPU Feature: AVX2
|
||||||
|
func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16
|
||||||
|
|
||||||
|
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
|
||||||
|
//
|
||||||
|
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
|
||||||
|
// x_group1[indices[0:2]], ...}
|
||||||
|
//
|
||||||
|
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
//
|
||||||
|
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFLW, CPU Feature: AVX512
|
||||||
|
func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32
|
||||||
|
|
||||||
/* tern */
|
/* tern */
|
||||||
|
|
||||||
// tern performs a logical operation on three vectors based on the 8-bit truth table.
|
// tern performs a logical operation on three vectors based on the 8-bit truth table.
|
||||||
|
|
|
||||||
|
|
@ -989,3 +989,280 @@ func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 {
|
||||||
}
|
}
|
||||||
panic("missing case, switch should be exhaustive")
|
panic("missing case, switch should be exhaustive")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* PermuteScalars */
|
||||||
|
|
||||||
|
// PermuteScalars performs a permutation of vector x's elements using the supplied indices:
|
||||||
|
//
|
||||||
|
// result = {x[a], x[b], x[c], x[d]}
|
||||||
|
//
|
||||||
|
// Parameters a,b,c,d should have values between 0 and 3.
|
||||||
|
// If a through d are constants, then an instruction will be inlined, otherwise
|
||||||
|
// a jump table may be generated.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFD, CPU Feature: AVX
|
||||||
|
func (x Int32x4) PermuteScalars(a, b, c, d uint8) Int32x4 {
|
||||||
|
return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
|
||||||
|
}
|
||||||
|
|
||||||
|
// PermuteScalars performs a permutation of vector x's elements using the supplied indices:
|
||||||
|
//
|
||||||
|
// result = {x[a], x[b], x[c], x[d]}
|
||||||
|
//
|
||||||
|
// Parameters a,b,c,d should have values between 0 and 3.
|
||||||
|
// If a through d are constants, then an instruction will be inlined, otherwise
|
||||||
|
// a jump table may be generated.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFD, CPU Feature: AVX
|
||||||
|
func (x Uint32x4) PermuteScalars(a, b, c, d uint8) Uint32x4 {
|
||||||
|
return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
|
||||||
|
}
|
||||||
|
|
||||||
|
/* PermuteScalarsGrouped */
|
||||||
|
|
||||||
|
// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
|
||||||
|
//
|
||||||
|
// result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
|
||||||
|
//
|
||||||
|
// Parameters a,b,c,d should have values between 0 and 3.
|
||||||
|
// If a through d are constants, then an instruction will be inlined, otherwise
|
||||||
|
// a jump table may be generated.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFD, CPU Feature: AVX2
|
||||||
|
func (x Int32x8) PermuteScalarsGrouped(a, b, c, d uint8) Int32x8 {
|
||||||
|
return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
|
||||||
|
}
|
||||||
|
|
||||||
|
// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
|
||||||
|
//
|
||||||
|
// result =
|
||||||
|
// { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
|
||||||
|
// x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
|
||||||
|
//
|
||||||
|
// Parameters a,b,c,d should have values between 0 and 3.
|
||||||
|
// If a through d are constants, then an instruction will be inlined, otherwise
|
||||||
|
// a jump table may be generated.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFD, CPU Feature: AVX512
|
||||||
|
func (x Int32x16) PermuteScalarsGrouped(a, b, c, d uint8) Int32x16 {
|
||||||
|
return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
|
||||||
|
}
|
||||||
|
|
||||||
|
// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
|
||||||
|
//
|
||||||
|
// result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
|
||||||
|
//
|
||||||
|
// Parameters a,b,c,d should have values between 0 and 3.
|
||||||
|
// If a through d are constants, then an instruction will be inlined, otherwise
|
||||||
|
// a jump table is generated.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFD, CPU Feature: AVX2
|
||||||
|
func (x Uint32x8) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x8 {
|
||||||
|
return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
|
||||||
|
}
|
||||||
|
|
||||||
|
// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
|
||||||
|
//
|
||||||
|
// result =
|
||||||
|
// { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
|
||||||
|
// x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
|
||||||
|
//
|
||||||
|
// Parameters a,b,c,d should have values between 0 and 3.
|
||||||
|
// If a through d are constants, then an instruction will be inlined, otherwise
|
||||||
|
// a jump table is generated.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFD, CPU Feature: AVX512
|
||||||
|
func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 {
|
||||||
|
return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
|
||||||
|
}
|
||||||
|
|
||||||
|
/* PermuteScalarsHi */
|
||||||
|
|
||||||
|
// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
|
||||||
|
//
|
||||||
|
// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
|
||||||
|
//
|
||||||
|
// Parameters a,b,c,d should have values between 0 and 3.
|
||||||
|
// If a through d are constants, then an instruction will be inlined, otherwise
|
||||||
|
// a jump table is generated.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFHW, CPU Feature: AVX512
|
||||||
|
func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 {
|
||||||
|
return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
|
||||||
|
}
|
||||||
|
|
||||||
|
// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
|
||||||
|
//
|
||||||
|
// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
|
||||||
|
//
|
||||||
|
// Parameters a,b,c,d should have values between 0 and 3.
|
||||||
|
// If a through d are constants, then an instruction will be inlined, otherwise
|
||||||
|
// a jump table is generated.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFHW, CPU Feature: AVX512
|
||||||
|
func (x Uint16x8) PermuteScalarsHi(a, b, c, d uint8) Uint16x8 {
|
||||||
|
return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
|
||||||
|
}
|
||||||
|
|
||||||
|
/* PermuteScalarsHiGrouped */
|
||||||
|
|
||||||
|
// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
|
||||||
|
//
|
||||||
|
// result =
|
||||||
|
// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
|
||||||
|
// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
|
||||||
|
//
|
||||||
|
// Parameters a,b,c,d should have values between 0 and 3.
|
||||||
|
// If a through d are constants, then an instruction will be inlined, otherwise
|
||||||
|
// a jump table is generated.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFHW, CPU Feature: AVX2
|
||||||
|
func (x Int16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x16 {
|
||||||
|
return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
|
||||||
|
}
|
||||||
|
|
||||||
|
// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
|
||||||
|
//
|
||||||
|
// result =
|
||||||
|
// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
|
||||||
|
// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
|
||||||
|
// x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
|
||||||
|
// x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
|
||||||
|
//
|
||||||
|
// Parameters a,b,c,d should have values between 0 and 3.
|
||||||
|
// If a through d are constants, then an instruction will be inlined, otherwise
|
||||||
|
// a jump table is generated.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFHW, CPU Feature: AVX512
|
||||||
|
func (x Int16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x32 {
|
||||||
|
return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
|
||||||
|
}
|
||||||
|
|
||||||
|
// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
|
||||||
|
//
|
||||||
|
// result =
|
||||||
|
// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
|
||||||
|
// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
|
||||||
|
//
|
||||||
|
// Each group is of size 128-bit.
|
||||||
|
//
|
||||||
|
// Parameters a,b,c,d should have values between 0 and 3.
|
||||||
|
// If a through d are constants, then an instruction will be inlined, otherwise
|
||||||
|
// a jump table is generated.
|
||||||
|
//
|
||||||
|
// Asm: VPSHUFHW, CPU Feature: AVX2
|
||||||
|
func (x Uint16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x16 {
|
||||||
|
return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
|
||||||
|
}
|
||||||
|
|
||||||
|
// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	{x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
//	 x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
//	 x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
//	 x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
//
// Each group is of size 128-bit.
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x32 {
	return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
}
|
||||||
|
|
||||||
|
/* PermuteScalarsLo */
|
||||||
|
|
||||||
|
// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
//
//	result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x8) PermuteScalarsLo(a, b, c, d uint8) Int16x8 {
	// Pack the four 2-bit indices into one 8-bit immediate: a in bits 0-1,
	// b in 2-3, c in 4-5, d in 6-7 (d's high bits shift out of the uint8).
	return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
}
|
||||||
|
|
||||||
|
// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
//
//	result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x8) PermuteScalarsLo(a, b, c, d uint8) Uint16x8 {
	// Pack the four 2-bit indices into one 8-bit immediate: a in bits 0-1,
	// b in 2-3, c in 4-5, d in 6-7 (d's high bits shift out of the uint8).
	return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
}
|
||||||
|
|
||||||
|
/* PermuteScalarsLoGrouped */
|
||||||
|
|
||||||
|
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	{x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	 x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
//
// Each group is of size 128-bit.
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Int16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x16 {
	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
}
|
||||||
|
|
||||||
|
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	{x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	 x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
//	 x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
//	 x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
//
// Each group is of size 128-bit.
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x32 {
	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
}
|
||||||
|
|
||||||
|
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	{x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	 x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
//
// Each group is of size 128-bit.
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 {
	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
}
|
||||||
|
|
||||||
|
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	{x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	 x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
//	 x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
//	 x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
//
// Each group is of size 128-bit.
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
	// Pack the four 2-bit indices into one 8-bit immediate: a in bits 0-1,
	// b in 2-3, c in 4-5, d in 6-7 (d's high bits shift out of the uint8).
	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue