[dev.simd] simd: fix signatures for PermuteConstant* methods

This moves the packed-immediate methods to package-private,
and adds exported versions with four parameters.

Rename PermuteConstant to PermuteScalars
Rename VPSHUFB Permute to PermuteOrZero
Rename Permute2 to ConcatPermute

Comments were repaired/enhanced.

Modified the generator to support an additional tag,
"hideMaskMethods : true", which suppresses generation of the
method, intrinsic, generic op, and generic-translation rule
for the mask-modified versions of such methods (exported
methods already behave this way).

Change-Id: I91e208c1fff1f28ebce4edb4e73d26003715018c
Reviewed-on: https://go-review.googlesource.com/c/go/+/721342
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
David Chase 2025-11-17 15:31:36 -05:00
parent e3d4645693
commit 4d26d66a49
18 changed files with 2614 additions and 1820 deletions

View file

@ -396,7 +396,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPOR256,
ssa.OpAMD64VPORD512,
ssa.OpAMD64VPORQ512,
ssa.OpAMD64VPSHUFB128,
ssa.OpAMD64VPERMB128,
ssa.OpAMD64VPERMB256,
ssa.OpAMD64VPERMB512,
ssa.OpAMD64VPERMW128,
@ -410,6 +410,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPERMQ256,
ssa.OpAMD64VPERMPD512,
ssa.OpAMD64VPERMQ512,
ssa.OpAMD64VPSHUFB128,
ssa.OpAMD64VPSHUFB256,
ssa.OpAMD64VPSHUFB512,
ssa.OpAMD64VPROLVD128,
@ -672,9 +673,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPORQMasked128,
ssa.OpAMD64VPORQMasked256,
ssa.OpAMD64VPORQMasked512,
ssa.OpAMD64VPSHUFBMasked256,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPERMBMasked128,
ssa.OpAMD64VPERMBMasked256,
ssa.OpAMD64VPERMBMasked512,
ssa.OpAMD64VPERMWMasked128,
@ -688,6 +687,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPERMQMasked256,
ssa.OpAMD64VPERMPDMasked512,
ssa.OpAMD64VPERMQMasked512,
ssa.OpAMD64VPSHUFBMasked256,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPROLVDMasked128,
ssa.OpAMD64VPROLVDMasked256,
ssa.OpAMD64VPROLVDMasked512,
@ -1011,12 +1013,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VEXTRACTF64X4256,
ssa.OpAMD64VEXTRACTI128128,
ssa.OpAMD64VEXTRACTI64X4256,
ssa.OpAMD64VPSHUFD128,
ssa.OpAMD64VPSHUFD256,
ssa.OpAMD64VPSHUFD512,
ssa.OpAMD64VPSHUFHW128,
ssa.OpAMD64VPSHUFHW256,
ssa.OpAMD64VPSHUFHW512,
ssa.OpAMD64VPROLD128,
ssa.OpAMD64VPROLD256,
ssa.OpAMD64VPROLD512,
@ -1029,6 +1025,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPRORQ128,
ssa.OpAMD64VPRORQ256,
ssa.OpAMD64VPRORQ512,
ssa.OpAMD64VPSHUFD128,
ssa.OpAMD64VPSHUFD256,
ssa.OpAMD64VPSHUFD512,
ssa.OpAMD64VPSHUFHW128,
ssa.OpAMD64VPSHUFHW256,
ssa.OpAMD64VPSHUFHW512,
ssa.OpAMD64VPSHUFLW128,
ssa.OpAMD64VPSHUFLW256,
ssa.OpAMD64VPSHUFLW512,
ssa.OpAMD64VPSLLW128const,
ssa.OpAMD64VPSLLW256const,
ssa.OpAMD64VPSLLW512const,
@ -1070,12 +1075,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VREDUCEPDMasked128,
ssa.OpAMD64VREDUCEPDMasked256,
ssa.OpAMD64VREDUCEPDMasked512,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPROLDMasked128,
ssa.OpAMD64VPROLDMasked256,
ssa.OpAMD64VPROLDMasked512,
@ -1088,6 +1087,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPRORQMasked128,
ssa.OpAMD64VPRORQMasked256,
ssa.OpAMD64VPRORQMasked512,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFLWMasked256,
ssa.OpAMD64VPSHUFLWMasked512,
ssa.OpAMD64VPSHUFLWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPSLLWMasked128const,
ssa.OpAMD64VPSLLWMasked256const,
ssa.OpAMD64VPSLLWMasked512const,
@ -1209,6 +1217,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
case ssa.OpAMD64VPDPWSSD128,
ssa.OpAMD64VPDPWSSD256,
ssa.OpAMD64VPDPWSSD512,
ssa.OpAMD64VPERMI2B128,
ssa.OpAMD64VPERMI2B256,
ssa.OpAMD64VPERMI2B512,
ssa.OpAMD64VPERMI2W128,
ssa.OpAMD64VPERMI2W256,
ssa.OpAMD64VPERMI2W512,
ssa.OpAMD64VPERMI2PS128,
ssa.OpAMD64VPERMI2D128,
ssa.OpAMD64VPERMI2PS256,
ssa.OpAMD64VPERMI2D256,
ssa.OpAMD64VPERMI2PS512,
ssa.OpAMD64VPERMI2D512,
ssa.OpAMD64VPERMI2PD128,
ssa.OpAMD64VPERMI2Q128,
ssa.OpAMD64VPERMI2PD256,
ssa.OpAMD64VPERMI2Q256,
ssa.OpAMD64VPERMI2PD512,
ssa.OpAMD64VPERMI2Q512,
ssa.OpAMD64VPDPBUSD128,
ssa.OpAMD64VPDPBUSD256,
ssa.OpAMD64VPDPBUSD512,
@ -1233,24 +1259,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VFMSUBADD213PD128,
ssa.OpAMD64VFMSUBADD213PD256,
ssa.OpAMD64VFMSUBADD213PD512,
ssa.OpAMD64VPERMI2B128,
ssa.OpAMD64VPERMI2B256,
ssa.OpAMD64VPERMI2B512,
ssa.OpAMD64VPERMI2W128,
ssa.OpAMD64VPERMI2W256,
ssa.OpAMD64VPERMI2W512,
ssa.OpAMD64VPERMI2PS128,
ssa.OpAMD64VPERMI2D128,
ssa.OpAMD64VPERMI2PS256,
ssa.OpAMD64VPERMI2D256,
ssa.OpAMD64VPERMI2PS512,
ssa.OpAMD64VPERMI2D512,
ssa.OpAMD64VPERMI2PD128,
ssa.OpAMD64VPERMI2Q128,
ssa.OpAMD64VPERMI2PD256,
ssa.OpAMD64VPERMI2Q256,
ssa.OpAMD64VPERMI2PD512,
ssa.OpAMD64VPERMI2Q512,
ssa.OpAMD64VPSHLDVW128,
ssa.OpAMD64VPSHLDVW256,
ssa.OpAMD64VPSHLDVW512,
@ -1316,6 +1324,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPAVGWMasked128Merging,
ssa.OpAMD64VPAVGWMasked256Merging,
ssa.OpAMD64VPAVGWMasked512Merging,
ssa.OpAMD64VPERMI2BMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
ssa.OpAMD64VPERMI2WMasked128,
ssa.OpAMD64VPERMI2WMasked256,
ssa.OpAMD64VPERMI2WMasked512,
ssa.OpAMD64VPERMI2PSMasked128,
ssa.OpAMD64VPERMI2DMasked128,
ssa.OpAMD64VPERMI2PSMasked256,
ssa.OpAMD64VPERMI2DMasked256,
ssa.OpAMD64VPERMI2PSMasked512,
ssa.OpAMD64VPERMI2DMasked512,
ssa.OpAMD64VPERMI2PDMasked128,
ssa.OpAMD64VPERMI2QMasked128,
ssa.OpAMD64VPERMI2PDMasked256,
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPALIGNRMasked256Merging,
ssa.OpAMD64VPALIGNRMasked512Merging,
ssa.OpAMD64VPALIGNRMasked128Merging,
@ -1451,24 +1477,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPORQMasked128Merging,
ssa.OpAMD64VPORQMasked256Merging,
ssa.OpAMD64VPORQMasked512Merging,
ssa.OpAMD64VPERMI2BMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
ssa.OpAMD64VPERMI2WMasked128,
ssa.OpAMD64VPERMI2WMasked256,
ssa.OpAMD64VPERMI2WMasked512,
ssa.OpAMD64VPERMI2PSMasked128,
ssa.OpAMD64VPERMI2DMasked128,
ssa.OpAMD64VPERMI2PSMasked256,
ssa.OpAMD64VPERMI2DMasked256,
ssa.OpAMD64VPERMI2PSMasked512,
ssa.OpAMD64VPERMI2DMasked512,
ssa.OpAMD64VPERMI2PDMasked128,
ssa.OpAMD64VPERMI2QMasked128,
ssa.OpAMD64VPERMI2PDMasked256,
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPSHUFBMasked256Merging,
ssa.OpAMD64VPSHUFBMasked512Merging,
ssa.OpAMD64VPSHUFBMasked128Merging,
@ -1819,6 +1827,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
p = simdV21load(s, v)
case ssa.OpAMD64VPDPWSSD512load,
ssa.OpAMD64VPERMI2PS128load,
ssa.OpAMD64VPERMI2D128load,
ssa.OpAMD64VPERMI2PS256load,
ssa.OpAMD64VPERMI2D256load,
ssa.OpAMD64VPERMI2PS512load,
ssa.OpAMD64VPERMI2D512load,
ssa.OpAMD64VPERMI2PD128load,
ssa.OpAMD64VPERMI2Q128load,
ssa.OpAMD64VPERMI2PD256load,
ssa.OpAMD64VPERMI2Q256load,
ssa.OpAMD64VPERMI2PD512load,
ssa.OpAMD64VPERMI2Q512load,
ssa.OpAMD64VPDPBUSD512load,
ssa.OpAMD64VPDPBUSDS512load,
ssa.OpAMD64VFMADD213PS128load,
@ -1839,18 +1859,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VFMSUBADD213PD128load,
ssa.OpAMD64VFMSUBADD213PD256load,
ssa.OpAMD64VFMSUBADD213PD512load,
ssa.OpAMD64VPERMI2PS128load,
ssa.OpAMD64VPERMI2D128load,
ssa.OpAMD64VPERMI2PS256load,
ssa.OpAMD64VPERMI2D256load,
ssa.OpAMD64VPERMI2PS512load,
ssa.OpAMD64VPERMI2D512load,
ssa.OpAMD64VPERMI2PD128load,
ssa.OpAMD64VPERMI2Q128load,
ssa.OpAMD64VPERMI2PD256load,
ssa.OpAMD64VPERMI2Q256load,
ssa.OpAMD64VPERMI2PD512load,
ssa.OpAMD64VPERMI2Q512load,
ssa.OpAMD64VPSHLDVD128load,
ssa.OpAMD64VPSHLDVD256load,
ssa.OpAMD64VPSHLDVD512load,
@ -1868,6 +1876,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
case ssa.OpAMD64VPDPWSSDMasked128load,
ssa.OpAMD64VPDPWSSDMasked256load,
ssa.OpAMD64VPDPWSSDMasked512load,
ssa.OpAMD64VPERMI2PSMasked128load,
ssa.OpAMD64VPERMI2DMasked128load,
ssa.OpAMD64VPERMI2PSMasked256load,
ssa.OpAMD64VPERMI2DMasked256load,
ssa.OpAMD64VPERMI2PSMasked512load,
ssa.OpAMD64VPERMI2DMasked512load,
ssa.OpAMD64VPERMI2PDMasked128load,
ssa.OpAMD64VPERMI2QMasked128load,
ssa.OpAMD64VPERMI2PDMasked256load,
ssa.OpAMD64VPERMI2QMasked256load,
ssa.OpAMD64VPERMI2PDMasked512load,
ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPDPBUSDMasked128load,
ssa.OpAMD64VPDPBUSDMasked256load,
ssa.OpAMD64VPDPBUSDMasked512load,
@ -1892,18 +1912,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VFMSUBADD213PDMasked128load,
ssa.OpAMD64VFMSUBADD213PDMasked256load,
ssa.OpAMD64VFMSUBADD213PDMasked512load,
ssa.OpAMD64VPERMI2PSMasked128load,
ssa.OpAMD64VPERMI2DMasked128load,
ssa.OpAMD64VPERMI2PSMasked256load,
ssa.OpAMD64VPERMI2DMasked256load,
ssa.OpAMD64VPERMI2PSMasked512load,
ssa.OpAMD64VPERMI2DMasked512load,
ssa.OpAMD64VPERMI2PDMasked128load,
ssa.OpAMD64VPERMI2QMasked128load,
ssa.OpAMD64VPERMI2PDMasked256load,
ssa.OpAMD64VPERMI2QMasked256load,
ssa.OpAMD64VPERMI2PDMasked512load,
ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPSHLDVDMasked128load,
ssa.OpAMD64VPSHLDVDMasked256load,
ssa.OpAMD64VPSHLDVDMasked512load,
@ -2124,7 +2132,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VREDUCEPD128load,
ssa.OpAMD64VREDUCEPD256load,
ssa.OpAMD64VREDUCEPD512load,
ssa.OpAMD64VPSHUFD512load,
ssa.OpAMD64VPROLD128load,
ssa.OpAMD64VPROLD256load,
ssa.OpAMD64VPROLD512load,
@ -2137,6 +2144,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPRORQ128load,
ssa.OpAMD64VPRORQ256load,
ssa.OpAMD64VPRORQ512load,
ssa.OpAMD64VPSHUFD512load,
ssa.OpAMD64VPSLLD512constload,
ssa.OpAMD64VPSLLQ512constload,
ssa.OpAMD64VPSRLD512constload,
@ -2159,9 +2167,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VREDUCEPDMasked128load,
ssa.OpAMD64VREDUCEPDMasked256load,
ssa.OpAMD64VREDUCEPDMasked512load,
ssa.OpAMD64VPSHUFDMasked256load,
ssa.OpAMD64VPSHUFDMasked512load,
ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPROLDMasked128load,
ssa.OpAMD64VPROLDMasked256load,
ssa.OpAMD64VPROLDMasked512load,
@ -2174,6 +2179,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPRORQMasked128load,
ssa.OpAMD64VPRORQMasked256load,
ssa.OpAMD64VPRORQMasked512load,
ssa.OpAMD64VPSHUFDMasked256load,
ssa.OpAMD64VPSHUFDMasked512load,
ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPSLLDMasked128constload,
ssa.OpAMD64VPSLLDMasked256constload,
ssa.OpAMD64VPSLLDMasked512constload,
@ -2447,12 +2455,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPOPCNTQMasked128Merging,
ssa.OpAMD64VPOPCNTQMasked256Merging,
ssa.OpAMD64VPOPCNTQMasked512Merging,
ssa.OpAMD64VPSHUFDMasked256Merging,
ssa.OpAMD64VPSHUFDMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked256Merging,
ssa.OpAMD64VPSHUFHWMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked128Merging,
ssa.OpAMD64VPSHUFDMasked128Merging,
ssa.OpAMD64VRCP14PSMasked128Merging,
ssa.OpAMD64VRCP14PSMasked256Merging,
ssa.OpAMD64VRCP14PSMasked512Merging,
@ -2483,6 +2485,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VSQRTPDMasked128Merging,
ssa.OpAMD64VSQRTPDMasked256Merging,
ssa.OpAMD64VSQRTPDMasked512Merging,
ssa.OpAMD64VPSHUFDMasked256Merging,
ssa.OpAMD64VPSHUFDMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked256Merging,
ssa.OpAMD64VPSHUFHWMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked128Merging,
ssa.OpAMD64VPSHUFLWMasked256Merging,
ssa.OpAMD64VPSHUFLWMasked512Merging,
ssa.OpAMD64VPSHUFLWMasked128Merging,
ssa.OpAMD64VPSHUFDMasked128Merging,
ssa.OpAMD64VPSLLWMasked128constMerging,
ssa.OpAMD64VPSLLWMasked256constMerging,
ssa.OpAMD64VPSLLWMasked512constMerging,
@ -2674,6 +2685,36 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPCOMPRESSQMasked128,
ssa.OpAMD64VPCOMPRESSQMasked256,
ssa.OpAMD64VPCOMPRESSQMasked512,
ssa.OpAMD64VPERMI2BMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
ssa.OpAMD64VPERMI2WMasked128,
ssa.OpAMD64VPERMI2WMasked256,
ssa.OpAMD64VPERMI2WMasked512,
ssa.OpAMD64VPERMI2PSMasked128,
ssa.OpAMD64VPERMI2PSMasked128load,
ssa.OpAMD64VPERMI2DMasked128,
ssa.OpAMD64VPERMI2DMasked128load,
ssa.OpAMD64VPERMI2PSMasked256,
ssa.OpAMD64VPERMI2PSMasked256load,
ssa.OpAMD64VPERMI2DMasked256,
ssa.OpAMD64VPERMI2DMasked256load,
ssa.OpAMD64VPERMI2PSMasked512,
ssa.OpAMD64VPERMI2PSMasked512load,
ssa.OpAMD64VPERMI2DMasked512,
ssa.OpAMD64VPERMI2DMasked512load,
ssa.OpAMD64VPERMI2PDMasked128,
ssa.OpAMD64VPERMI2PDMasked128load,
ssa.OpAMD64VPERMI2QMasked128,
ssa.OpAMD64VPERMI2QMasked128load,
ssa.OpAMD64VPERMI2PDMasked256,
ssa.OpAMD64VPERMI2PDMasked256load,
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2QMasked256load,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2PDMasked512load,
ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPALIGNRMasked256,
ssa.OpAMD64VPALIGNRMasked512,
ssa.OpAMD64VPALIGNRMasked128,
@ -3061,48 +3102,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPORQMasked256load,
ssa.OpAMD64VPORQMasked512,
ssa.OpAMD64VPORQMasked512load,
ssa.OpAMD64VPERMI2BMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
ssa.OpAMD64VPERMI2WMasked128,
ssa.OpAMD64VPERMI2WMasked256,
ssa.OpAMD64VPERMI2WMasked512,
ssa.OpAMD64VPERMI2PSMasked128,
ssa.OpAMD64VPERMI2PSMasked128load,
ssa.OpAMD64VPERMI2DMasked128,
ssa.OpAMD64VPERMI2DMasked128load,
ssa.OpAMD64VPERMI2PSMasked256,
ssa.OpAMD64VPERMI2PSMasked256load,
ssa.OpAMD64VPERMI2DMasked256,
ssa.OpAMD64VPERMI2DMasked256load,
ssa.OpAMD64VPERMI2PSMasked512,
ssa.OpAMD64VPERMI2PSMasked512load,
ssa.OpAMD64VPERMI2DMasked512,
ssa.OpAMD64VPERMI2DMasked512load,
ssa.OpAMD64VPERMI2PDMasked128,
ssa.OpAMD64VPERMI2PDMasked128load,
ssa.OpAMD64VPERMI2QMasked128,
ssa.OpAMD64VPERMI2QMasked128load,
ssa.OpAMD64VPERMI2PDMasked256,
ssa.OpAMD64VPERMI2PDMasked256load,
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2QMasked256load,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2PDMasked512load,
ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked256load,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFDMasked512load,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPSHUFBMasked256,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPERMBMasked128,
ssa.OpAMD64VPERMBMasked256,
ssa.OpAMD64VPERMBMasked512,
ssa.OpAMD64VPERMWMasked128,
@ -3124,6 +3124,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPERMPDMasked512load,
ssa.OpAMD64VPERMQMasked512,
ssa.OpAMD64VPERMQMasked512load,
ssa.OpAMD64VPSHUFBMasked256,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VRCP14PSMasked128,
ssa.OpAMD64VRCP14PSMasked128load,
ssa.OpAMD64VRCP14PSMasked256,
@ -3418,6 +3421,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VMOVDQU64Masked128,
ssa.OpAMD64VMOVDQU64Masked256,
ssa.OpAMD64VMOVDQU64Masked512,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked256load,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFDMasked512load,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFLWMasked256,
ssa.OpAMD64VPSHUFLWMasked512,
ssa.OpAMD64VPSHUFLWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPSLLWMasked128const,
ssa.OpAMD64VPSLLWMasked256const,
ssa.OpAMD64VPSLLWMasked512const,

View file

@ -216,6 +216,36 @@
(CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
(CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
(CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
(ConcatPermuteFloat32x4 ...) => (VPERMI2PS128 ...)
(ConcatPermuteFloat32x8 ...) => (VPERMI2PS256 ...)
(ConcatPermuteFloat32x16 ...) => (VPERMI2PS512 ...)
(ConcatPermuteFloat64x2 ...) => (VPERMI2PD128 ...)
(ConcatPermuteFloat64x4 ...) => (VPERMI2PD256 ...)
(ConcatPermuteFloat64x8 ...) => (VPERMI2PD512 ...)
(ConcatPermuteInt8x16 ...) => (VPERMI2B128 ...)
(ConcatPermuteInt8x32 ...) => (VPERMI2B256 ...)
(ConcatPermuteInt8x64 ...) => (VPERMI2B512 ...)
(ConcatPermuteInt16x8 ...) => (VPERMI2W128 ...)
(ConcatPermuteInt16x16 ...) => (VPERMI2W256 ...)
(ConcatPermuteInt16x32 ...) => (VPERMI2W512 ...)
(ConcatPermuteInt32x4 ...) => (VPERMI2D128 ...)
(ConcatPermuteInt32x8 ...) => (VPERMI2D256 ...)
(ConcatPermuteInt32x16 ...) => (VPERMI2D512 ...)
(ConcatPermuteInt64x2 ...) => (VPERMI2Q128 ...)
(ConcatPermuteInt64x4 ...) => (VPERMI2Q256 ...)
(ConcatPermuteInt64x8 ...) => (VPERMI2Q512 ...)
(ConcatPermuteUint8x16 ...) => (VPERMI2B128 ...)
(ConcatPermuteUint8x32 ...) => (VPERMI2B256 ...)
(ConcatPermuteUint8x64 ...) => (VPERMI2B512 ...)
(ConcatPermuteUint16x8 ...) => (VPERMI2W128 ...)
(ConcatPermuteUint16x16 ...) => (VPERMI2W256 ...)
(ConcatPermuteUint16x32 ...) => (VPERMI2W512 ...)
(ConcatPermuteUint32x4 ...) => (VPERMI2D128 ...)
(ConcatPermuteUint32x8 ...) => (VPERMI2D256 ...)
(ConcatPermuteUint32x16 ...) => (VPERMI2D512 ...)
(ConcatPermuteUint64x2 ...) => (VPERMI2Q128 ...)
(ConcatPermuteUint64x4 ...) => (VPERMI2Q256 ...)
(ConcatPermuteUint64x8 ...) => (VPERMI2Q512 ...)
(ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...)
(ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...)
(ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...)
@ -794,7 +824,7 @@
(PermuteFloat32x16 ...) => (VPERMPS512 ...)
(PermuteFloat64x4 ...) => (VPERMPD256 ...)
(PermuteFloat64x8 ...) => (VPERMPD512 ...)
(PermuteInt8x16 ...) => (VPSHUFB128 ...)
(PermuteInt8x16 ...) => (VPERMB128 ...)
(PermuteInt8x32 ...) => (VPERMB256 ...)
(PermuteInt8x64 ...) => (VPERMB512 ...)
(PermuteInt16x8 ...) => (VPERMW128 ...)
@ -804,7 +834,7 @@
(PermuteInt32x16 ...) => (VPERMD512 ...)
(PermuteInt64x4 ...) => (VPERMQ256 ...)
(PermuteInt64x8 ...) => (VPERMQ512 ...)
(PermuteUint8x16 ...) => (VPSHUFB128 ...)
(PermuteUint8x16 ...) => (VPERMB128 ...)
(PermuteUint8x32 ...) => (VPERMB256 ...)
(PermuteUint8x64 ...) => (VPERMB512 ...)
(PermuteUint16x8 ...) => (VPERMW128 ...)
@ -814,62 +844,12 @@
(PermuteUint32x16 ...) => (VPERMD512 ...)
(PermuteUint64x4 ...) => (VPERMQ256 ...)
(PermuteUint64x8 ...) => (VPERMQ512 ...)
(Permute2Float32x4 ...) => (VPERMI2PS128 ...)
(Permute2Float32x8 ...) => (VPERMI2PS256 ...)
(Permute2Float32x16 ...) => (VPERMI2PS512 ...)
(Permute2Float64x2 ...) => (VPERMI2PD128 ...)
(Permute2Float64x4 ...) => (VPERMI2PD256 ...)
(Permute2Float64x8 ...) => (VPERMI2PD512 ...)
(Permute2Int8x16 ...) => (VPERMI2B128 ...)
(Permute2Int8x32 ...) => (VPERMI2B256 ...)
(Permute2Int8x64 ...) => (VPERMI2B512 ...)
(Permute2Int16x8 ...) => (VPERMI2W128 ...)
(Permute2Int16x16 ...) => (VPERMI2W256 ...)
(Permute2Int16x32 ...) => (VPERMI2W512 ...)
(Permute2Int32x4 ...) => (VPERMI2D128 ...)
(Permute2Int32x8 ...) => (VPERMI2D256 ...)
(Permute2Int32x16 ...) => (VPERMI2D512 ...)
(Permute2Int64x2 ...) => (VPERMI2Q128 ...)
(Permute2Int64x4 ...) => (VPERMI2Q256 ...)
(Permute2Int64x8 ...) => (VPERMI2Q512 ...)
(Permute2Uint8x16 ...) => (VPERMI2B128 ...)
(Permute2Uint8x32 ...) => (VPERMI2B256 ...)
(Permute2Uint8x64 ...) => (VPERMI2B512 ...)
(Permute2Uint16x8 ...) => (VPERMI2W128 ...)
(Permute2Uint16x16 ...) => (VPERMI2W256 ...)
(Permute2Uint16x32 ...) => (VPERMI2W512 ...)
(Permute2Uint32x4 ...) => (VPERMI2D128 ...)
(Permute2Uint32x8 ...) => (VPERMI2D256 ...)
(Permute2Uint32x16 ...) => (VPERMI2D512 ...)
(Permute2Uint64x2 ...) => (VPERMI2Q128 ...)
(Permute2Uint64x4 ...) => (VPERMI2Q256 ...)
(Permute2Uint64x8 ...) => (VPERMI2Q512 ...)
(PermuteConstantInt32x4 ...) => (VPSHUFD128 ...)
(PermuteConstantUint32x4 ...) => (VPSHUFD128 ...)
(PermuteConstantGroupedInt32x8 ...) => (VPSHUFD256 ...)
(PermuteConstantGroupedInt32x16 ...) => (VPSHUFD512 ...)
(PermuteConstantGroupedUint32x8 ...) => (VPSHUFD256 ...)
(PermuteConstantGroupedUint32x16 ...) => (VPSHUFD512 ...)
(PermuteConstantHiInt16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiInt32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiUint16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiUint32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
(PermuteConstantHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
(PermuteConstantLoInt16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoInt32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoUint16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoUint32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoGroupedInt16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantLoGroupedInt16x32 ...) => (VPSHUFHW512 ...)
(PermuteConstantLoGroupedUint16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantLoGroupedUint16x32 ...) => (VPSHUFHW512 ...)
(PermuteGroupedInt8x32 ...) => (VPSHUFB256 ...)
(PermuteGroupedInt8x64 ...) => (VPSHUFB512 ...)
(PermuteGroupedUint8x32 ...) => (VPSHUFB256 ...)
(PermuteGroupedUint8x64 ...) => (VPSHUFB512 ...)
(PermuteOrZeroInt8x16 ...) => (VPSHUFB128 ...)
(PermuteOrZeroUint8x16 ...) => (VPSHUFB128 ...)
(PermuteOrZeroGroupedInt8x32 ...) => (VPSHUFB256 ...)
(PermuteOrZeroGroupedInt8x64 ...) => (VPSHUFB512 ...)
(PermuteOrZeroGroupedUint8x32 ...) => (VPSHUFB256 ...)
(PermuteOrZeroGroupedUint8x64 ...) => (VPSHUFB512 ...)
(ReciprocalFloat32x4 ...) => (VRCPPS128 ...)
(ReciprocalFloat32x8 ...) => (VRCPPS256 ...)
(ReciprocalFloat32x16 ...) => (VRCP14PS512 ...)
@ -1324,6 +1304,24 @@
(concatSelectedConstantGroupedUint32x16 ...) => (VSHUFPS512 ...)
(concatSelectedConstantGroupedUint64x4 ...) => (VSHUFPD256 ...)
(concatSelectedConstantGroupedUint64x8 ...) => (VSHUFPD512 ...)
(permuteScalarsInt32x4 ...) => (VPSHUFD128 ...)
(permuteScalarsUint32x4 ...) => (VPSHUFD128 ...)
(permuteScalarsGroupedInt32x8 ...) => (VPSHUFD256 ...)
(permuteScalarsGroupedInt32x16 ...) => (VPSHUFD512 ...)
(permuteScalarsGroupedUint32x8 ...) => (VPSHUFD256 ...)
(permuteScalarsGroupedUint32x16 ...) => (VPSHUFD512 ...)
(permuteScalarsHiInt16x8 ...) => (VPSHUFHW128 ...)
(permuteScalarsHiUint16x8 ...) => (VPSHUFHW128 ...)
(permuteScalarsHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
(permuteScalarsHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
(permuteScalarsHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
(permuteScalarsHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
(permuteScalarsLoInt16x8 ...) => (VPSHUFLW128 ...)
(permuteScalarsLoUint16x8 ...) => (VPSHUFLW128 ...)
(permuteScalarsLoGroupedInt16x16 ...) => (VPSHUFLW256 ...)
(permuteScalarsLoGroupedInt16x32 ...) => (VPSHUFLW512 ...)
(permuteScalarsLoGroupedUint16x16 ...) => (VPSHUFLW256 ...)
(permuteScalarsLoGroupedUint16x32 ...) => (VPSHUFLW512 ...)
(ternInt32x4 ...) => (VPTERNLOGD128 ...)
(ternInt32x8 ...) => (VPTERNLOGD256 ...)
(ternInt32x16 ...) => (VPTERNLOGD512 ...)
@ -1417,6 +1415,24 @@
(VMOVDQU64Masked128 (VREDUCEPD128 [a] x) mask) => (VREDUCEPDMasked128 [a] x mask)
(VMOVDQU64Masked256 (VREDUCEPD256 [a] x) mask) => (VREDUCEPDMasked256 [a] x mask)
(VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask)
(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask)
(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask)
(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask)
(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask)
(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask)
(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask)
(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask)
(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask)
(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask)
(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask)
(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask)
(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask)
(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask)
(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask)
(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask)
(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
(VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) => (VPALIGNRMasked256 [a] x y mask)
(VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512 [a] x y mask)
(VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) => (VPALIGNRMasked128 [a] x y mask)
@ -1668,33 +1684,7 @@
(VMOVDQU64Masked512 (VPOPCNTQ512 x) mask) => (VPOPCNTQMasked512 x mask)
(VMOVDQU32Masked512 (VPORD512 x y) mask) => (VPORDMasked512 x y mask)
(VMOVDQU64Masked512 (VPORQ512 x y) mask) => (VPORQMasked512 x y mask)
(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask)
(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask)
(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask)
(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask)
(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask)
(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask)
(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask)
(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask)
(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask)
(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask)
(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask)
(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask)
(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask)
(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask)
(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask)
(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask)
(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask)
(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask)
(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask)
(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask)
(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask)
(VMOVDQU8Masked128 (VPERMB128 x y) mask) => (VPERMBMasked128 x y mask)
(VMOVDQU8Masked256 (VPERMB256 x y) mask) => (VPERMBMasked256 x y mask)
(VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask)
(VMOVDQU16Masked128 (VPERMW128 x y) mask) => (VPERMWMasked128 x y mask)
@ -1708,6 +1698,9 @@
(VMOVDQU64Masked256 (VPERMQ256 x y) mask) => (VPERMQMasked256 x y mask)
(VMOVDQU64Masked512 (VPERMPD512 x y) mask) => (VPERMPDMasked512 x y mask)
(VMOVDQU64Masked512 (VPERMQ512 x y) mask) => (VPERMQMasked512 x y mask)
(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask)
(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask)
(VMOVDQU32Masked512 (VRCP14PS512 x) mask) => (VRCP14PSMasked512 x mask)
(VMOVDQU64Masked128 (VRCP14PD128 x) mask) => (VRCP14PDMasked128 x mask)
(VMOVDQU64Masked256 (VRCP14PD256 x) mask) => (VRCP14PDMasked256 x mask)
@ -1874,6 +1867,15 @@
(VMOVDQU16Masked512 (VPSUBUSW512 x y) mask) => (VPSUBUSWMasked512 x y mask)
(VMOVDQU32Masked512 (VPXORD512 x y) mask) => (VPXORDMasked512 x y mask)
(VMOVDQU64Masked512 (VPXORQ512 x y) mask) => (VPXORQMasked512 x y mask)
(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask)
(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask)
(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask)
(VMOVDQU16Masked256 (VPSHUFLW256 [a] x) mask) => (VPSHUFLWMasked256 [a] x mask)
(VMOVDQU16Masked512 (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512 [a] x mask)
(VMOVDQU16Masked128 (VPSHUFLW128 [a] x) mask) => (VPSHUFLWMasked128 [a] x mask)
(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask)
(VMOVDQU16Masked128 (VPSLLW128const [a] x) mask) => (VPSLLWMasked128const [a] x mask)
(VMOVDQU16Masked256 (VPSLLW256const [a] x) mask) => (VPSLLWMasked256const [a] x mask)
(VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) => (VPSLLWMasked512const [a] x mask)
@ -2021,6 +2023,7 @@
(VPBLENDMWMasked512 dst (VPSHLDW512 [a] x y) mask) => (VPSHLDWMasked512Merging dst [a] x y mask)
(VPBLENDMWMasked512 dst (VPSHRDW512 [a] x y) mask) => (VPSHRDWMasked512Merging dst [a] x y mask)
(VPBLENDMWMasked512 dst (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512Merging dst [a] x mask)
(VPBLENDMWMasked512 dst (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512Merging dst [a] x mask)
(VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask) => (VPSLLVWMasked512Merging dst x y mask)
(VPBLENDMWMasked512 dst (VPSLLW512const [a] x) mask) => (VPSLLWMasked512constMerging dst [a] x mask)
(VPBLENDMWMasked512 dst (VPSRAVW512 x y) mask) => (VPSRAVWMasked512Merging dst x y mask)
@ -2170,6 +2173,7 @@
(VPBLENDVB128 dst (VPSHUFB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFD128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked128Merging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFHW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFLW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSLLD128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSLLQ128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked128constMerging dst [a] x (VPMOVVec64x2ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSLLVD128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked128Merging dst x y (VPMOVVec32x4ToM <types.TypeMask> mask))
@ -2305,6 +2309,7 @@
(VPBLENDVB256 dst (VPSHUFB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFD256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked256Merging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFHW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFLW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSLLD256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSLLQ256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked256constMerging dst [a] x (VPMOVVec64x4ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSLLVD256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked256Merging dst x y (VPMOVVec32x8ToM <types.TypeMask> mask))
@ -2410,6 +2415,30 @@
(VREDUCEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VREDUCEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VREDUCEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem)
(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem)
(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem)
(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem)
(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem)
(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem)
(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem)
(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem)
(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem)
(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem)
(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem)
(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem)
(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem)
(VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem)
(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
(VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem)
@ -2636,34 +2665,6 @@
(VPERMQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ256load {sym} [off] x ptr mem)
(VPERMPD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMPD512load {sym} [off] x ptr mem)
(VPERMQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ512load {sym} [off] x ptr mem)
(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem)
(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem)
(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem)
(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem)
(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem)
(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem)
(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem)
(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem)
(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem)
(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem)
(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem)
(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem)
(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem)
(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPERMPSMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked256load {sym} [off] x ptr mask mem)
(VPERMDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMDMasked256load {sym} [off] x ptr mask mem)
(VPERMPSMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked512load {sym} [off] x ptr mask mem)
@ -2862,6 +2863,10 @@
(VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem)
(VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
(VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)

View file

@ -383,8 +383,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPDPWSSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPDPWSSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPDPWSSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
@ -1310,6 +1312,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHUFHWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFHWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFHWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHUFLW128", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFLW256", argLength: 1, reg: v11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFLW512", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHUFLWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFLWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFLWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSLLD128const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSLLD256const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSLLD512const", argLength: 1, reg: w11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
@ -2392,6 +2400,9 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHUFHWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSHUFHWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSHUFHWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPSHUFLWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSHUFLWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSHUFLWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPSLLDMasked128constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSLLDMasked256constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSLLDMasked512constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},

View file

@ -207,6 +207,36 @@ func simdGenericOps() []opData {
{name: "CompressUint64x2", argLength: 2, commutative: false},
{name: "CompressUint64x4", argLength: 2, commutative: false},
{name: "CompressUint64x8", argLength: 2, commutative: false},
{name: "ConcatPermuteFloat32x4", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat32x8", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat32x16", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat64x2", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat64x4", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat64x8", argLength: 3, commutative: false},
{name: "ConcatPermuteInt8x16", argLength: 3, commutative: false},
{name: "ConcatPermuteInt8x32", argLength: 3, commutative: false},
{name: "ConcatPermuteInt8x64", argLength: 3, commutative: false},
{name: "ConcatPermuteInt16x8", argLength: 3, commutative: false},
{name: "ConcatPermuteInt16x16", argLength: 3, commutative: false},
{name: "ConcatPermuteInt16x32", argLength: 3, commutative: false},
{name: "ConcatPermuteInt32x4", argLength: 3, commutative: false},
{name: "ConcatPermuteInt32x8", argLength: 3, commutative: false},
{name: "ConcatPermuteInt32x16", argLength: 3, commutative: false},
{name: "ConcatPermuteInt64x2", argLength: 3, commutative: false},
{name: "ConcatPermuteInt64x4", argLength: 3, commutative: false},
{name: "ConcatPermuteInt64x8", argLength: 3, commutative: false},
{name: "ConcatPermuteUint8x16", argLength: 3, commutative: false},
{name: "ConcatPermuteUint8x32", argLength: 3, commutative: false},
{name: "ConcatPermuteUint8x64", argLength: 3, commutative: false},
{name: "ConcatPermuteUint16x8", argLength: 3, commutative: false},
{name: "ConcatPermuteUint16x16", argLength: 3, commutative: false},
{name: "ConcatPermuteUint16x32", argLength: 3, commutative: false},
{name: "ConcatPermuteUint32x4", argLength: 3, commutative: false},
{name: "ConcatPermuteUint32x8", argLength: 3, commutative: false},
{name: "ConcatPermuteUint32x16", argLength: 3, commutative: false},
{name: "ConcatPermuteUint64x2", argLength: 3, commutative: false},
{name: "ConcatPermuteUint64x4", argLength: 3, commutative: false},
{name: "ConcatPermuteUint64x8", argLength: 3, commutative: false},
{name: "ConvertToInt8Int16x8", argLength: 1, commutative: false},
{name: "ConvertToInt8Int16x16", argLength: 1, commutative: false},
{name: "ConvertToInt8Int16x32", argLength: 1, commutative: false},
@ -750,44 +780,10 @@ func simdGenericOps() []opData {
{name: "OrUint64x2", argLength: 2, commutative: true},
{name: "OrUint64x4", argLength: 2, commutative: true},
{name: "OrUint64x8", argLength: 2, commutative: true},
{name: "Permute2Float32x4", argLength: 3, commutative: false},
{name: "Permute2Float32x8", argLength: 3, commutative: false},
{name: "Permute2Float32x16", argLength: 3, commutative: false},
{name: "Permute2Float64x2", argLength: 3, commutative: false},
{name: "Permute2Float64x4", argLength: 3, commutative: false},
{name: "Permute2Float64x8", argLength: 3, commutative: false},
{name: "Permute2Int8x16", argLength: 3, commutative: false},
{name: "Permute2Int8x32", argLength: 3, commutative: false},
{name: "Permute2Int8x64", argLength: 3, commutative: false},
{name: "Permute2Int16x8", argLength: 3, commutative: false},
{name: "Permute2Int16x16", argLength: 3, commutative: false},
{name: "Permute2Int16x32", argLength: 3, commutative: false},
{name: "Permute2Int32x4", argLength: 3, commutative: false},
{name: "Permute2Int32x8", argLength: 3, commutative: false},
{name: "Permute2Int32x16", argLength: 3, commutative: false},
{name: "Permute2Int64x2", argLength: 3, commutative: false},
{name: "Permute2Int64x4", argLength: 3, commutative: false},
{name: "Permute2Int64x8", argLength: 3, commutative: false},
{name: "Permute2Uint8x16", argLength: 3, commutative: false},
{name: "Permute2Uint8x32", argLength: 3, commutative: false},
{name: "Permute2Uint8x64", argLength: 3, commutative: false},
{name: "Permute2Uint16x8", argLength: 3, commutative: false},
{name: "Permute2Uint16x16", argLength: 3, commutative: false},
{name: "Permute2Uint16x32", argLength: 3, commutative: false},
{name: "Permute2Uint32x4", argLength: 3, commutative: false},
{name: "Permute2Uint32x8", argLength: 3, commutative: false},
{name: "Permute2Uint32x16", argLength: 3, commutative: false},
{name: "Permute2Uint64x2", argLength: 3, commutative: false},
{name: "Permute2Uint64x4", argLength: 3, commutative: false},
{name: "Permute2Uint64x8", argLength: 3, commutative: false},
{name: "PermuteFloat32x8", argLength: 2, commutative: false},
{name: "PermuteFloat32x16", argLength: 2, commutative: false},
{name: "PermuteFloat64x4", argLength: 2, commutative: false},
{name: "PermuteFloat64x8", argLength: 2, commutative: false},
{name: "PermuteGroupedInt8x32", argLength: 2, commutative: false},
{name: "PermuteGroupedInt8x64", argLength: 2, commutative: false},
{name: "PermuteGroupedUint8x32", argLength: 2, commutative: false},
{name: "PermuteGroupedUint8x64", argLength: 2, commutative: false},
{name: "PermuteInt8x16", argLength: 2, commutative: false},
{name: "PermuteInt8x32", argLength: 2, commutative: false},
{name: "PermuteInt8x64", argLength: 2, commutative: false},
@ -798,6 +794,12 @@ func simdGenericOps() []opData {
{name: "PermuteInt32x16", argLength: 2, commutative: false},
{name: "PermuteInt64x4", argLength: 2, commutative: false},
{name: "PermuteInt64x8", argLength: 2, commutative: false},
{name: "PermuteOrZeroGroupedInt8x32", argLength: 2, commutative: false},
{name: "PermuteOrZeroGroupedInt8x64", argLength: 2, commutative: false},
{name: "PermuteOrZeroGroupedUint8x32", argLength: 2, commutative: false},
{name: "PermuteOrZeroGroupedUint8x64", argLength: 2, commutative: false},
{name: "PermuteOrZeroInt8x16", argLength: 2, commutative: false},
{name: "PermuteOrZeroUint8x16", argLength: 2, commutative: false},
{name: "PermuteUint8x16", argLength: 2, commutative: false},
{name: "PermuteUint8x32", argLength: 2, commutative: false},
{name: "PermuteUint8x64", argLength: 2, commutative: false},
@ -1151,28 +1153,6 @@ func simdGenericOps() []opData {
{name: "GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
@ -1292,6 +1272,24 @@ func simdGenericOps() []opData {
{name: "concatSelectedConstantInt64x2", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantUint32x4", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantUint64x2", argLength: 2, commutative: false, aux: "UInt8"},
{name: "permuteScalarsGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "ternInt32x4", argLength: 3, commutative: false, aux: "UInt8"},
{name: "ternInt32x8", argLength: 3, commutative: false, aux: "UInt8"},
{name: "ternInt32x16", argLength: 3, commutative: false, aux: "UInt8"},

View file

@ -1624,8 +1624,10 @@ const (
OpAMD64VPDPWSSDMasked128
OpAMD64VPDPWSSDMasked256
OpAMD64VPDPWSSDMasked512
OpAMD64VPERMB128
OpAMD64VPERMB256
OpAMD64VPERMB512
OpAMD64VPERMBMasked128
OpAMD64VPERMBMasked256
OpAMD64VPERMBMasked512
OpAMD64VPERMD256
@ -2551,6 +2553,12 @@ const (
OpAMD64VPSHUFHWMasked128
OpAMD64VPSHUFHWMasked256
OpAMD64VPSHUFHWMasked512
OpAMD64VPSHUFLW128
OpAMD64VPSHUFLW256
OpAMD64VPSHUFLW512
OpAMD64VPSHUFLWMasked128
OpAMD64VPSHUFLWMasked256
OpAMD64VPSHUFLWMasked512
OpAMD64VPSLLD128const
OpAMD64VPSLLD256const
OpAMD64VPSLLD512const
@ -3633,6 +3641,9 @@ const (
OpAMD64VPSHUFHWMasked128Merging
OpAMD64VPSHUFHWMasked256Merging
OpAMD64VPSHUFHWMasked512Merging
OpAMD64VPSHUFLWMasked128Merging
OpAMD64VPSHUFLWMasked256Merging
OpAMD64VPSHUFLWMasked512Merging
OpAMD64VPSLLDMasked128constMerging
OpAMD64VPSLLDMasked256constMerging
OpAMD64VPSLLDMasked512constMerging
@ -6155,6 +6166,36 @@ const (
OpCompressUint64x2
OpCompressUint64x4
OpCompressUint64x8
OpConcatPermuteFloat32x4
OpConcatPermuteFloat32x8
OpConcatPermuteFloat32x16
OpConcatPermuteFloat64x2
OpConcatPermuteFloat64x4
OpConcatPermuteFloat64x8
OpConcatPermuteInt8x16
OpConcatPermuteInt8x32
OpConcatPermuteInt8x64
OpConcatPermuteInt16x8
OpConcatPermuteInt16x16
OpConcatPermuteInt16x32
OpConcatPermuteInt32x4
OpConcatPermuteInt32x8
OpConcatPermuteInt32x16
OpConcatPermuteInt64x2
OpConcatPermuteInt64x4
OpConcatPermuteInt64x8
OpConcatPermuteUint8x16
OpConcatPermuteUint8x32
OpConcatPermuteUint8x64
OpConcatPermuteUint16x8
OpConcatPermuteUint16x16
OpConcatPermuteUint16x32
OpConcatPermuteUint32x4
OpConcatPermuteUint32x8
OpConcatPermuteUint32x16
OpConcatPermuteUint64x2
OpConcatPermuteUint64x4
OpConcatPermuteUint64x8
OpConvertToInt8Int16x8
OpConvertToInt8Int16x16
OpConvertToInt8Int16x32
@ -6698,44 +6739,10 @@ const (
OpOrUint64x2
OpOrUint64x4
OpOrUint64x8
OpPermute2Float32x4
OpPermute2Float32x8
OpPermute2Float32x16
OpPermute2Float64x2
OpPermute2Float64x4
OpPermute2Float64x8
OpPermute2Int8x16
OpPermute2Int8x32
OpPermute2Int8x64
OpPermute2Int16x8
OpPermute2Int16x16
OpPermute2Int16x32
OpPermute2Int32x4
OpPermute2Int32x8
OpPermute2Int32x16
OpPermute2Int64x2
OpPermute2Int64x4
OpPermute2Int64x8
OpPermute2Uint8x16
OpPermute2Uint8x32
OpPermute2Uint8x64
OpPermute2Uint16x8
OpPermute2Uint16x16
OpPermute2Uint16x32
OpPermute2Uint32x4
OpPermute2Uint32x8
OpPermute2Uint32x16
OpPermute2Uint64x2
OpPermute2Uint64x4
OpPermute2Uint64x8
OpPermuteFloat32x8
OpPermuteFloat32x16
OpPermuteFloat64x4
OpPermuteFloat64x8
OpPermuteGroupedInt8x32
OpPermuteGroupedInt8x64
OpPermuteGroupedUint8x32
OpPermuteGroupedUint8x64
OpPermuteInt8x16
OpPermuteInt8x32
OpPermuteInt8x64
@ -6746,6 +6753,12 @@ const (
OpPermuteInt32x16
OpPermuteInt64x4
OpPermuteInt64x8
OpPermuteOrZeroGroupedInt8x32
OpPermuteOrZeroGroupedInt8x64
OpPermuteOrZeroGroupedUint8x32
OpPermuteOrZeroGroupedUint8x64
OpPermuteOrZeroInt8x16
OpPermuteOrZeroUint8x16
OpPermuteUint8x16
OpPermuteUint8x32
OpPermuteUint8x64
@ -7099,28 +7112,6 @@ const (
OpGetElemUint16x8
OpGetElemUint32x4
OpGetElemUint64x2
OpPermuteConstantGroupedInt32x8
OpPermuteConstantGroupedInt32x16
OpPermuteConstantGroupedUint32x8
OpPermuteConstantGroupedUint32x16
OpPermuteConstantHiGroupedInt16x16
OpPermuteConstantHiGroupedInt16x32
OpPermuteConstantHiGroupedUint16x16
OpPermuteConstantHiGroupedUint16x32
OpPermuteConstantHiInt16x8
OpPermuteConstantHiInt32x4
OpPermuteConstantHiUint16x8
OpPermuteConstantHiUint32x4
OpPermuteConstantInt32x4
OpPermuteConstantLoGroupedInt16x16
OpPermuteConstantLoGroupedInt16x32
OpPermuteConstantLoGroupedUint16x16
OpPermuteConstantLoGroupedUint16x32
OpPermuteConstantLoInt16x8
OpPermuteConstantLoInt32x4
OpPermuteConstantLoUint16x8
OpPermuteConstantLoUint32x4
OpPermuteConstantUint32x4
OpRotateAllLeftInt32x4
OpRotateAllLeftInt32x8
OpRotateAllLeftInt32x16
@ -7240,6 +7231,24 @@ const (
OpconcatSelectedConstantInt64x2
OpconcatSelectedConstantUint32x4
OpconcatSelectedConstantUint64x2
OppermuteScalarsGroupedInt32x8
OppermuteScalarsGroupedInt32x16
OppermuteScalarsGroupedUint32x8
OppermuteScalarsGroupedUint32x16
OppermuteScalarsHiGroupedInt16x16
OppermuteScalarsHiGroupedInt16x32
OppermuteScalarsHiGroupedUint16x16
OppermuteScalarsHiGroupedUint16x32
OppermuteScalarsHiInt16x8
OppermuteScalarsHiUint16x8
OppermuteScalarsInt32x4
OppermuteScalarsLoGroupedInt16x16
OppermuteScalarsLoGroupedInt16x32
OppermuteScalarsLoGroupedUint16x16
OppermuteScalarsLoGroupedUint16x32
OppermuteScalarsLoInt16x8
OppermuteScalarsLoUint16x8
OppermuteScalarsUint32x4
OpternInt32x4
OpternInt32x8
OpternInt32x16
@ -26142,6 +26151,20 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "VPERMB128",
argLen: 2,
asm: x86.AVPERMB,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPERMB256",
argLen: 2,
@ -26170,6 +26193,21 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "VPERMBMasked128",
argLen: 3,
asm: x86.AVPERMB,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPERMBMasked256",
argLen: 3,
@ -39744,6 +39782,93 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "VPSHUFLW128",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLW256",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPSHUFLW512",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked128",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked256",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked512",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSLLD128const",
auxType: auxUInt8,
@ -57607,6 +57732,57 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "VPSHUFLWMasked128Merging",
auxType: auxUInt8,
argLen: 3,
resultInArg0: true,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked256Merging",
auxType: auxUInt8,
argLen: 3,
resultInArg0: true,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked512Merging",
auxType: auxUInt8,
argLen: 3,
resultInArg0: true,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSLLDMasked128constMerging",
auxType: auxUInt8,
@ -86874,6 +87050,156 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "ConcatPermuteFloat32x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat32x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat32x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat64x2",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat64x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat64x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt8x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt8x32",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt8x64",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt16x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt16x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt16x32",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt32x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt32x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt32x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt64x2",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt64x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt64x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint8x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint8x32",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint8x64",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint16x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint16x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint16x32",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint32x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint32x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint32x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint64x2",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint64x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint64x8",
argLen: 3,
generic: true,
},
{
name: "ConvertToInt8Int16x8",
argLen: 1,
@ -89757,156 +90083,6 @@ var opcodeTable = [...]opInfo{
commutative: true,
generic: true,
},
{
name: "Permute2Float32x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Float32x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Float32x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Float64x2",
argLen: 3,
generic: true,
},
{
name: "Permute2Float64x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Float64x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Int8x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Int8x32",
argLen: 3,
generic: true,
},
{
name: "Permute2Int8x64",
argLen: 3,
generic: true,
},
{
name: "Permute2Int16x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Int16x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Int16x32",
argLen: 3,
generic: true,
},
{
name: "Permute2Int32x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Int32x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Int32x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Int64x2",
argLen: 3,
generic: true,
},
{
name: "Permute2Int64x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Int64x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint8x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint8x32",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint8x64",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint16x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint16x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint16x32",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint32x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint32x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint32x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint64x2",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint64x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint64x8",
argLen: 3,
generic: true,
},
{
name: "PermuteFloat32x8",
argLen: 2,
@ -89927,26 +90103,6 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedInt8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedInt8x64",
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedUint8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedUint8x64",
argLen: 2,
generic: true,
},
{
name: "PermuteInt8x16",
argLen: 2,
@ -89997,6 +90153,36 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroGroupedInt8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroGroupedInt8x64",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroGroupedUint8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroGroupedUint8x64",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroInt8x16",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroUint8x16",
argLen: 2,
generic: true,
},
{
name: "PermuteUint8x16",
argLen: 2,
@ -91830,138 +92016,6 @@ var opcodeTable = [...]opInfo{
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedInt32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedInt32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedUint32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedUint32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "RotateAllLeftInt32x4",
auxType: auxUInt8,
@ -92676,6 +92730,114 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "permuteScalarsGroupedInt32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsGroupedInt32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsGroupedUint32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsGroupedUint32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "ternInt32x4",
auxType: auxUInt8,

File diff suppressed because it is too large Load diff

View file

@ -228,6 +228,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
@ -802,8 +832,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.Permute", opLen2(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.Permute", opLen2(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64)
@ -826,62 +856,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float64x8.Permute", opLen2_21(ssa.OpPermuteFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Permute", opLen2_21(ssa.OpPermuteInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.Permute", opLen2_21(ssa.OpPermuteUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.Permute2", opLen3_231(ssa.OpPermute2Int8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.Permute2", opLen3_231(ssa.OpPermute2Uint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Permute2", opLen3_231(ssa.OpPermute2Int8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x32.Permute2", opLen3_231(ssa.OpPermute2Uint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Permute2", opLen3_231(ssa.OpPermute2Int8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x64.Permute2", opLen3_231(ssa.OpPermute2Uint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.Permute2", opLen3_231(ssa.OpPermute2Int16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x8.Permute2", opLen3_231(ssa.OpPermute2Uint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.Permute2", opLen3_231(ssa.OpPermute2Int16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x16.Permute2", opLen3_231(ssa.OpPermute2Uint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.Permute2", opLen3_231(ssa.OpPermute2Int16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x32.Permute2", opLen3_231(ssa.OpPermute2Uint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Permute2", opLen3_231(ssa.OpPermute2Float32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.Permute2", opLen3_231(ssa.OpPermute2Int32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.Permute2", opLen3_231(ssa.OpPermute2Uint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Permute2", opLen3_231(ssa.OpPermute2Float32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x8.Permute2", opLen3_231(ssa.OpPermute2Int32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x8.Permute2", opLen3_231(ssa.OpPermute2Uint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Permute2", opLen3_231(ssa.OpPermute2Float32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.Permute2", opLen3_231(ssa.OpPermute2Int32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x16.Permute2", opLen3_231(ssa.OpPermute2Uint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float64x2.Permute2", opLen3_231(ssa.OpPermute2Float64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x2.Permute2", opLen3_231(ssa.OpPermute2Int64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x2.Permute2", opLen3_231(ssa.OpPermute2Uint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.Permute2", opLen3_231(ssa.OpPermute2Float64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x4.Permute2", opLen3_231(ssa.OpPermute2Int64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x4.Permute2", opLen3_231(ssa.OpPermute2Uint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64)
@ -1300,6 +1280,24 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int32x4.tern", opLen3Imm8(ssa.OpternInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x8.tern", opLen3Imm8(ssa.OpternInt32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int32x16.tern", opLen3Imm8(ssa.OpternInt32x16, types.TypeVec512, 0), sys.AMD64)

View file

@ -46,6 +46,9 @@ func writeSIMDGenericOps(ops []Operation) *bytes.Buffer {
if op.NoGenericOps != nil && *op.NoGenericOps == "true" {
continue
}
if op.SkipMaskedMethod() {
continue
}
_, _, _, immType, gOp := op.shape()
gOpData := genericOpsData{gOp.GenericName(), len(gOp.In), op.Commutative}
if immType == VarImm || immType == ConstVarImm {

View file

@ -107,6 +107,9 @@ func writeSIMDIntrinsics(ops []Operation, typeMap simdTypeMap) *bytes.Buffer {
if op.NoTypes != nil && *op.NoTypes == "true" {
continue
}
if op.SkipMaskedMethod() {
continue
}
if s, op, err := classifyOp(op); err == nil {
if err := t.ExecuteTemplate(buffer, s, op); err != nil {
panic(fmt.Errorf("failed to execute template %s for op %s: %w", s, op.Go, err))

View file

@ -604,6 +604,9 @@ func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) (f, fI *bytes.Buffer)
if op.NoTypes != nil && *op.NoTypes == "true" {
continue
}
if op.SkipMaskedMethod() {
continue
}
idxVecAsScalar, err := checkVecAsScalar(op)
if err != nil {
panic(err)

View file

@ -345,7 +345,8 @@ func writeSIMDRules(ops []Operation) *bytes.Buffer {
data.ArgsOut = "..."
}
data.tplName = tplName
if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" {
if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" ||
opr.SkipMaskedMethod() {
optData = append(optData, data)
continue
}

View file

@ -73,6 +73,29 @@ type rawOperation struct {
NoGenericOps *string
// If non-nil, this string will be attached to the machine ssa op name. E.g. "const"
SSAVariant *string
// If true, do not emit method declarations, generic ops, or intrinsics for masked variants
// DO emit the architecture-specific opcodes and optimizations.
HideMaskMethods *bool
}
// IsMasked reports whether this operation carries a mask input variant.
// A well-formed operation has either no input variant at all, or exactly
// one whose class is "mask"; any other shape panics.
func (o *Operation) IsMasked() bool {
	switch {
	case len(o.InVariant) == 0:
		return false
	case len(o.InVariant) == 1 && o.InVariant[0].Class == "mask":
		return true
	default:
		panic(fmt.Errorf("unknown inVariant"))
	}
}
// SkipMaskedMethod reports whether generation of the method declaration,
// generic op, and intrinsic for this operation should be suppressed:
// true only when hideMaskMethods is set and this is the masked variant.
func (o *Operation) SkipMaskedMethod() bool {
	return o.HideMaskMethods != nil && *o.HideMaskMethods && o.IsMasked()
}
func (o *Operation) DecodeUnified(v *unify.Value) error {
@ -80,14 +103,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {
return err
}
isMasked := false
if len(o.InVariant) == 0 {
// No variant
} else if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" {
isMasked = true
} else {
return fmt.Errorf("unknown inVariant")
}
isMasked := o.IsMasked()
// Compute full Go method name.
o.Go = o.rawOperation.Go
@ -104,6 +120,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {
o.Documentation = regexp.MustCompile(`\bNAME\b`).ReplaceAllString(o.Documentation, o.Go)
if isMasked {
o.Documentation += "\n//\n// This operation is applied selectively under a write mask."
// Suppress generic op and method declaration for exported methods, if a mask is present.
if unicode.IsUpper([]rune(o.Go)[0]) {
trueVal := "true"
o.NoGenericOps = &trueVal

View file

@ -27,18 +27,22 @@
constImm: 1
documentation: !string |-
// NAME returns the upper half of x.
- go: PermuteOrZero
commutative: false
documentation: !string |-
// NAME performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
- go: Permute
commutative: false
documentation: !string |-
// NAME performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
- go: Permute2 # Permute2 is only available on or after AVX512
- go: ConcatPermute # ConcatPermute is only available on or after AVX512
commutative: false
documentation: !string |-
// NAME performs a full permutation of vector x, y using indices:
// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
// where xy is x appending y.
// where xy is the concatenation of x (lower half) and y (upper half).
// Only the needed bits to represent xy's index are used in indices' elements.
- go: Compress
commutative: false
@ -74,31 +78,35 @@
documentation: !string |-
// NAME copies element zero of its (128-bit) input to all elements of
// the 512-bit output vector.
- go: PermuteOrZeroGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using indices:
- go: PermuteGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using indices:
- go: PermuteConstant
- go: permuteScalars
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a permutation of vector x using constant indices:
- go: PermuteConstantGrouped
- go: permuteScalarsGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using constant indices:
- go: PermuteConstantLo
- go: permuteScalarsLo
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a permutation of vector x using constant indices:
- go: PermuteConstantLoGrouped
- go: permuteScalarsLoGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using constant indices:
- go: PermuteConstantHi
- go: permuteScalarsHi
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a permutation of vector x using constant indices:
- go: PermuteConstantHiGrouped
- go: permuteScalarsHiGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using constant indices:
@ -218,8 +226,10 @@
- go: Select128FromPair
commutative: false
documentation: !string |-
// NAME selects the low and high 128-bit halves from the 128-bit halves
// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
// NAME treats the 256-bit vectors x and y as a single vector of four
// 128-bit elements, and returns a 256-bit result formed by
// concatenating the two elements specified by lo and hi.
// For example, {4,5}.NAME(3,0,{6,7}) returns {7,4}.
- go: ConcatShiftBytesRight
commutative: false

View file

@ -213,19 +213,75 @@
- *f64xN
- go: Permute
asm: "VPERM[BWDQ]|VPERMP[SD]"
asm: "VPERMQ|VPERMPD"
addDoc: !string |-
// The low 2 bits (values 0-3) of each element of indices is used
operandOrder: "21Type1"
in:
- &anyindices
go: $t
name: indices
overwriteBase: uint
- &any4
go: $t
lanes: 4
out:
- &any
go: $t
- go: Permute
asm: "VPERM[WDQ]|VPERMP[SD]"
addDoc: !string |-
// The low 3 bits (values 0-7) of each element of indices is used
operandOrder: "21Type1"
in:
- *anyindices
- &any8
go: $t
lanes: 8
out:
- *any
- go: Permute2
- go: Permute
asm: "VPERM[BWD]|VPERMPS"
addDoc: !string |-
// The low 4 bits (values 0-15) of each element of indices is used
operandOrder: "21Type1"
in:
- *anyindices
- &any16
go: $t
lanes: 16
out:
- *any
- go: Permute
asm: "VPERM[BW]"
addDoc: !string |-
// The low 5 bits (values 0-31) of each element of indices is used
operandOrder: "21Type1"
in:
- *anyindices
- &any32
go: $t
lanes: 32
out:
- *any
- go: Permute
asm: "VPERMB"
addDoc: !string |-
// The low 6 bits (values 0-63) of each element of indices is used
operandOrder: "21Type1"
in:
- *anyindices
- &any64
go: $t
lanes: 64
out:
- *any
- go: ConcatPermute
asm: "VPERMI2[BWDQ]|VPERMI2P[SD]"
# Because we are overwriting the receiver's type, we
# have to move the receiver to be a parameter so that
@ -403,113 +459,137 @@
base: $b
# VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX)
- go: Permute
- go: PermuteOrZero
asm: VPSHUFB
addDoc: !string |-
// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
// The lower four bits of each byte-sized index in indices select an element from x,
// unless the index's sign bit is set in which case zero is used instead.
in:
- &128any
bits: 128
go: $t
- bits: 128
go: $t
name: indices
base: int # always signed
out:
- *128any
- go: PermuteGrouped
- go: PermuteOrZeroGrouped
asm: VPSHUFB
addDoc: !string |-
// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
// Only the needed bits to represent the index of a group of x are used in indices' elements.
// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
// unless the index's sign bit is set in which case zero is used instead.
// Each group is of size 128-bit.
in:
- &256Or512any
bits: "256|512"
go: $t
- bits: "256|512"
base: int
name: indices
out:
- *256Or512any
- go: permuteScalars
asm: VPSHUFD
addDoc: !string |-
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
in:
- *128any
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *128any
- go: permuteScalarsGrouped
asm: VPSHUFD
addDoc: !string |-
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
in:
- *256Or512any
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *256Or512any
- go: permuteScalarsLo
asm: VPSHUFLW
addDoc: !string |-
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
in:
- &128lanes8
bits: 128
go: $t
name: indices
out:
- *256Or512any
- go: PermuteConstant
asm: VPSHUFD
addDoc: !string |-
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
in:
- *128any
elemBits: 16
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *128any
- go: PermuteConstantGrouped
asm: VPSHUFD
- *128lanes8
- go: permuteScalarsLoGrouped
asm: VPSHUFLW
addDoc: !string |-
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
//
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
// x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
in:
- *256Or512any
- &256Or512lanes8
bits: "256|512"
go: $t
elemBits: 16
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *256Or512any
- *256Or512lanes8
- go: PermuteConstantLo
- go: permuteScalarsHi
asm: VPSHUFHW
addDoc: !string |-
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
in:
- *128any
- *128lanes8
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *128any
- go: PermuteConstantLoGrouped
- *128lanes8
- go: permuteScalarsHiGrouped
asm: VPSHUFHW
addDoc: !string |-
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
// result =
//
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
in:
- *256Or512any
- *256Or512lanes8
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *256Or512any
- go: PermuteConstantHi
asm: VPSHUFHW
addDoc: !string |-
// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
in:
- *128any
- class: immediate
immOffset: 0
name: indices
out:
- *128any
- go: PermuteConstantHiGrouped
asm: VPSHUFHW
addDoc: !string |-
// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
in:
- *256Or512any
- class: immediate
immOffset: 0
name: indices
out:
- *256Or512any
- *256Or512lanes8
- go: InterleaveHi
asm: VPUNPCKH(QDQ|DQ|WD|WB)

View file

@ -163,7 +163,20 @@ func TestPermute(t *testing.T) {
}
}
func TestPermute2(t *testing.T) {
// TestPermuteOrZero checks byte permutation of a Uint8x16 vector where a
// negative (sign-bit-set) index zeroes the corresponding result lane.
func TestPermuteOrZero(t *testing.T) {
	x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
	indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11}
	want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12}
	got := make([]uint8, len(x))
	simd.LoadUint8x16Slice(x).PermuteOrZero(simd.LoadInt8x16Slice(indices)).StoreSlice(got)
	// Check every lane, not just the first 8: lanes 8-15 are exactly the
	// ones exercising the sign-bit zeroing behavior under test.
	for i := range got {
		if want[i] != got[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
		}
	}
}
func TestConcatPermute(t *testing.T) {
if !simd.X86.AVX512() {
t.Skip("Test requires X86.AVX512, not available on this hardware")
return
@ -173,7 +186,7 @@ func TestPermute2(t *testing.T) {
indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
got := make([]int64, 8)
simd.LoadInt64x8Slice(x).Permute2(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got)
simd.LoadInt64x8Slice(x).ConcatPermute(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got)
for i := range 8 {
if want[i] != got[i] {
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
@ -1161,3 +1174,75 @@ func TestDotProductQuadruple(t *testing.T) {
}
}
}
// TestPermuteScalars checks a four-lane left rotation of an Int32x4
// vector via the packed-immediate PermuteScalars method.
func TestPermuteScalars(t *testing.T) {
	input := []int32{11, 12, 13, 14}
	expect := []int32{12, 13, 14, 11}
	result := make([]int32, 4)
	simd.LoadInt32x4Slice(input).PermuteScalars(1, 2, 3, 0).StoreSlice(result)
	for i := 0; i < 4; i++ {
		if expect[i] != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, expect[i], result[i])
		}
	}
}
// TestPermuteScalarsGrouped checks that the same four-lane rotation is
// applied independently to each 128-bit group of an Int32x8 vector.
func TestPermuteScalarsGrouped(t *testing.T) {
	input := []int32{11, 12, 13, 14, 21, 22, 23, 24}
	expect := []int32{12, 13, 14, 11, 22, 23, 24, 21}
	result := make([]int32, 8)
	simd.LoadInt32x8Slice(input).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(result)
	for i := 0; i < 8; i++ {
		if expect[i] != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, expect[i], result[i])
		}
	}
}
// TestPermuteScalarsHi checks that only the upper four lanes of an
// Int16x8 vector are permuted; the lower four pass through unchanged.
func TestPermuteScalarsHi(t *testing.T) {
	input := []int16{-1, -2, -3, -4, 11, 12, 13, 14}
	expect := []int16{-1, -2, -3, -4, 12, 13, 14, 11}
	result := make([]int16, len(input))
	simd.LoadInt16x8Slice(input).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(result)
	for i := 0; i < len(result); i++ {
		if expect[i] != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, expect[i], result[i])
		}
	}
}
// TestPermuteScalarsLo checks that only the lower four lanes of an
// Int16x8 vector are permuted; the upper four pass through unchanged.
func TestPermuteScalarsLo(t *testing.T) {
	input := []int16{11, 12, 13, 14, 4, 5, 6, 7}
	expect := []int16{12, 13, 14, 11, 4, 5, 6, 7}
	result := make([]int16, len(input))
	simd.LoadInt16x8Slice(input).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(result)
	for i := 0; i < len(result); i++ {
		if expect[i] != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, expect[i], result[i])
		}
	}
}
// TestPermuteScalarsHiGrouped checks that the upper-half permutation is
// applied to each 128-bit group of an Int16x16 vector, leaving the lower
// half of every group unchanged.
func TestPermuteScalarsHiGrouped(t *testing.T) {
	input := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114}
	expect := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111}
	result := make([]int16, len(input))
	simd.LoadInt16x16Slice(input).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(result)
	for i := 0; i < len(result); i++ {
		if expect[i] != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, expect[i], result[i])
		}
	}
}
// TestPermuteScalarsLoGrouped checks that the lower-half permutation is
// applied to each 128-bit group of an Int16x16 vector, leaving the upper
// half of every group unchanged.
func TestPermuteScalarsLoGrouped(t *testing.T) {
	input := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17}
	expect := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17}
	result := make([]int16, len(input))
	simd.LoadInt16x16Slice(input).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(result)
	for i := 0; i < len(result); i++ {
		if expect[i] != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, expect[i], result[i])
		}
	}
}

File diff suppressed because it is too large Load diff

View file

@ -338,6 +338,220 @@ func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x
// Asm: VSHUFPD, CPU Feature: AVX512
func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8
/* permuteScalars */
// permuteScalars performs a permutation of vector x using constant indices:
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Int32x4) permuteScalars(indices uint8) Int32x4
// permuteScalars performs a permutation of vector x using constant indices:
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Uint32x4) permuteScalars(indices uint8) Uint32x4
/* permuteScalarsGrouped */
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16
/* permuteScalarsHi */
// permuteScalarsHi performs a permutation of vector x using constant indices:
// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8
// permuteScalarsHi performs a permutation of vector x using constant indices:
// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8
/* permuteScalarsHiGrouped */
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32
/* permuteScalarsLo */
// permuteScalarsLo performs a permutation of vector x using constant indices:
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8
// permuteScalarsLo performs a permutation of vector x using constant indices:
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8
/* permuteScalarsLoGrouped */
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
// x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
// x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
// x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
// x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32
/* tern */
// tern performs a logical operation on three vectors based on the 8-bit truth table.

View file

@ -989,3 +989,280 @@ func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 {
}
panic("missing case, switch should be exhaustive")
}
/* PermuteScalars */
// PermuteScalars builds a vector whose four lanes are lanes a, b, c and d
// of x:
//
//	result = {x[a], x[b], x[c], x[d]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table may be generated.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Int32x4) PermuteScalars(a, b, c, d uint8) Int32x4 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalars(imm)
}
// PermuteScalars builds a vector whose four lanes are lanes a, b, c and d
// of x:
//
//	result = {x[a], x[b], x[c], x[d]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table may be generated.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Uint32x4) PermuteScalars(a, b, c, d uint8) Uint32x4 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalars(imm)
}
/* PermuteScalarsGrouped */
// PermuteScalarsGrouped applies the same four-lane permutation to each
// 128-bit group of x:
//
//	result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table may be generated.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Int32x8) PermuteScalarsGrouped(a, b, c, d uint8) Int32x8 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsGrouped(imm)
}
// PermuteScalarsGrouped applies the same four-lane permutation to each
// 128-bit group of x:
//
//	result =
//	{ x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
//	  x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table may be generated.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Int32x16) PermuteScalarsGrouped(a, b, c, d uint8) Int32x16 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsGrouped(imm)
}
// PermuteScalarsGrouped applies the same four-lane permutation to each
// 128-bit group of x:
//
//	result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Uint32x8) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x8 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsGrouped(imm)
}
// PermuteScalarsGrouped applies the same four-lane permutation to each
// 128-bit group of x:
//
//	result =
//	{ x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
//	  x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsGrouped(imm)
}
/* PermuteScalarsHi */
// PermuteScalarsHi permutes the upper four lanes of x with the supplied
// indices, passing the lower four lanes through unchanged:
//
//	result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsHi(imm)
}
// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
//
//	result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// The low four elements pass through unchanged; only the high four are permuted.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x8) PermuteScalarsHi(a, b, c, d uint8) Uint16x8 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsHi(imm)
}
/* PermuteScalarsHiGrouped */
// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
//	   x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Int16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x16 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsHiGrouped(imm)
}
// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
//	   x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
//	   x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
//	   x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x32 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsHiGrouped(imm)
}
// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
//	   x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Uint16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x16 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsHiGrouped(imm)
}
// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  { x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
//	    x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
//	    x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
//	    x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x32 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsHiGrouped(imm)
}
/* PermuteScalarsLo */
// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
//
//	result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
//
// The high four elements pass through unchanged; only the low four are permuted.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x8) PermuteScalarsLo(a, b, c, d uint8) Int16x8 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsLo(imm)
}
// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
//
//	result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
//
// The high four elements pass through unchanged; only the low four are permuted.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x8) PermuteScalarsLo(a, b, c, d uint8) Uint16x8 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsLo(imm)
}
/* PermuteScalarsLoGrouped */
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	   x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Int16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x16 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsLoGrouped(imm)
}
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	   x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
//	   x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
//	   x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x32 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsLoGrouped(imm)
}
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	   x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsLoGrouped(imm)
}
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	   x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
//	   x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
//	   x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsLoGrouped(imm)
}