[dev.simd] simd: fix signatures for PermuteConstant* methods

This moves the packed-immediate methods to package-private,
and adds exported versions with four parameters.

Rename PermuteConstant to PermuteScalars
Rename VPSHUFB Permute to PermuteOrZero
Rename Permute2 to ConcatPermute

Comments were repaired/enhanced.

Modified the generator to support an additional tag,
"hideMaskMethods : true", which suppresses generation of the
method, intrinsic, generic op, and generic-translation rule
for the mask-modified versions of such methods (exported
methods already behave this way).

Change-Id: I91e208c1fff1f28ebce4edb4e73d26003715018c
Reviewed-on: https://go-review.googlesource.com/c/go/+/721342
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
David Chase 2025-11-17 15:31:36 -05:00
parent e3d4645693
commit 4d26d66a49
18 changed files with 2614 additions and 1820 deletions

View file

@ -396,7 +396,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPOR256,
ssa.OpAMD64VPORD512,
ssa.OpAMD64VPORQ512,
ssa.OpAMD64VPSHUFB128,
ssa.OpAMD64VPERMB128,
ssa.OpAMD64VPERMB256,
ssa.OpAMD64VPERMB512,
ssa.OpAMD64VPERMW128,
@ -410,6 +410,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPERMQ256,
ssa.OpAMD64VPERMPD512,
ssa.OpAMD64VPERMQ512,
ssa.OpAMD64VPSHUFB128,
ssa.OpAMD64VPSHUFB256,
ssa.OpAMD64VPSHUFB512,
ssa.OpAMD64VPROLVD128,
@ -672,9 +673,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPORQMasked128,
ssa.OpAMD64VPORQMasked256,
ssa.OpAMD64VPORQMasked512,
ssa.OpAMD64VPSHUFBMasked256,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPERMBMasked128,
ssa.OpAMD64VPERMBMasked256,
ssa.OpAMD64VPERMBMasked512,
ssa.OpAMD64VPERMWMasked128,
@ -688,6 +687,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPERMQMasked256,
ssa.OpAMD64VPERMPDMasked512,
ssa.OpAMD64VPERMQMasked512,
ssa.OpAMD64VPSHUFBMasked256,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPROLVDMasked128,
ssa.OpAMD64VPROLVDMasked256,
ssa.OpAMD64VPROLVDMasked512,
@ -1011,12 +1013,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VEXTRACTF64X4256,
ssa.OpAMD64VEXTRACTI128128,
ssa.OpAMD64VEXTRACTI64X4256,
ssa.OpAMD64VPSHUFD128,
ssa.OpAMD64VPSHUFD256,
ssa.OpAMD64VPSHUFD512,
ssa.OpAMD64VPSHUFHW128,
ssa.OpAMD64VPSHUFHW256,
ssa.OpAMD64VPSHUFHW512,
ssa.OpAMD64VPROLD128,
ssa.OpAMD64VPROLD256,
ssa.OpAMD64VPROLD512,
@ -1029,6 +1025,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPRORQ128,
ssa.OpAMD64VPRORQ256,
ssa.OpAMD64VPRORQ512,
ssa.OpAMD64VPSHUFD128,
ssa.OpAMD64VPSHUFD256,
ssa.OpAMD64VPSHUFD512,
ssa.OpAMD64VPSHUFHW128,
ssa.OpAMD64VPSHUFHW256,
ssa.OpAMD64VPSHUFHW512,
ssa.OpAMD64VPSHUFLW128,
ssa.OpAMD64VPSHUFLW256,
ssa.OpAMD64VPSHUFLW512,
ssa.OpAMD64VPSLLW128const,
ssa.OpAMD64VPSLLW256const,
ssa.OpAMD64VPSLLW512const,
@ -1070,12 +1075,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VREDUCEPDMasked128,
ssa.OpAMD64VREDUCEPDMasked256,
ssa.OpAMD64VREDUCEPDMasked512,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPROLDMasked128,
ssa.OpAMD64VPROLDMasked256,
ssa.OpAMD64VPROLDMasked512,
@ -1088,6 +1087,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPRORQMasked128,
ssa.OpAMD64VPRORQMasked256,
ssa.OpAMD64VPRORQMasked512,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFLWMasked256,
ssa.OpAMD64VPSHUFLWMasked512,
ssa.OpAMD64VPSHUFLWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPSLLWMasked128const,
ssa.OpAMD64VPSLLWMasked256const,
ssa.OpAMD64VPSLLWMasked512const,
@ -1209,6 +1217,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
case ssa.OpAMD64VPDPWSSD128,
ssa.OpAMD64VPDPWSSD256,
ssa.OpAMD64VPDPWSSD512,
ssa.OpAMD64VPERMI2B128,
ssa.OpAMD64VPERMI2B256,
ssa.OpAMD64VPERMI2B512,
ssa.OpAMD64VPERMI2W128,
ssa.OpAMD64VPERMI2W256,
ssa.OpAMD64VPERMI2W512,
ssa.OpAMD64VPERMI2PS128,
ssa.OpAMD64VPERMI2D128,
ssa.OpAMD64VPERMI2PS256,
ssa.OpAMD64VPERMI2D256,
ssa.OpAMD64VPERMI2PS512,
ssa.OpAMD64VPERMI2D512,
ssa.OpAMD64VPERMI2PD128,
ssa.OpAMD64VPERMI2Q128,
ssa.OpAMD64VPERMI2PD256,
ssa.OpAMD64VPERMI2Q256,
ssa.OpAMD64VPERMI2PD512,
ssa.OpAMD64VPERMI2Q512,
ssa.OpAMD64VPDPBUSD128,
ssa.OpAMD64VPDPBUSD256,
ssa.OpAMD64VPDPBUSD512,
@ -1233,24 +1259,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VFMSUBADD213PD128,
ssa.OpAMD64VFMSUBADD213PD256,
ssa.OpAMD64VFMSUBADD213PD512,
ssa.OpAMD64VPERMI2B128,
ssa.OpAMD64VPERMI2B256,
ssa.OpAMD64VPERMI2B512,
ssa.OpAMD64VPERMI2W128,
ssa.OpAMD64VPERMI2W256,
ssa.OpAMD64VPERMI2W512,
ssa.OpAMD64VPERMI2PS128,
ssa.OpAMD64VPERMI2D128,
ssa.OpAMD64VPERMI2PS256,
ssa.OpAMD64VPERMI2D256,
ssa.OpAMD64VPERMI2PS512,
ssa.OpAMD64VPERMI2D512,
ssa.OpAMD64VPERMI2PD128,
ssa.OpAMD64VPERMI2Q128,
ssa.OpAMD64VPERMI2PD256,
ssa.OpAMD64VPERMI2Q256,
ssa.OpAMD64VPERMI2PD512,
ssa.OpAMD64VPERMI2Q512,
ssa.OpAMD64VPSHLDVW128,
ssa.OpAMD64VPSHLDVW256,
ssa.OpAMD64VPSHLDVW512,
@ -1316,6 +1324,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPAVGWMasked128Merging,
ssa.OpAMD64VPAVGWMasked256Merging,
ssa.OpAMD64VPAVGWMasked512Merging,
ssa.OpAMD64VPERMI2BMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
ssa.OpAMD64VPERMI2WMasked128,
ssa.OpAMD64VPERMI2WMasked256,
ssa.OpAMD64VPERMI2WMasked512,
ssa.OpAMD64VPERMI2PSMasked128,
ssa.OpAMD64VPERMI2DMasked128,
ssa.OpAMD64VPERMI2PSMasked256,
ssa.OpAMD64VPERMI2DMasked256,
ssa.OpAMD64VPERMI2PSMasked512,
ssa.OpAMD64VPERMI2DMasked512,
ssa.OpAMD64VPERMI2PDMasked128,
ssa.OpAMD64VPERMI2QMasked128,
ssa.OpAMD64VPERMI2PDMasked256,
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPALIGNRMasked256Merging,
ssa.OpAMD64VPALIGNRMasked512Merging,
ssa.OpAMD64VPALIGNRMasked128Merging,
@ -1451,24 +1477,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPORQMasked128Merging,
ssa.OpAMD64VPORQMasked256Merging,
ssa.OpAMD64VPORQMasked512Merging,
ssa.OpAMD64VPERMI2BMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
ssa.OpAMD64VPERMI2WMasked128,
ssa.OpAMD64VPERMI2WMasked256,
ssa.OpAMD64VPERMI2WMasked512,
ssa.OpAMD64VPERMI2PSMasked128,
ssa.OpAMD64VPERMI2DMasked128,
ssa.OpAMD64VPERMI2PSMasked256,
ssa.OpAMD64VPERMI2DMasked256,
ssa.OpAMD64VPERMI2PSMasked512,
ssa.OpAMD64VPERMI2DMasked512,
ssa.OpAMD64VPERMI2PDMasked128,
ssa.OpAMD64VPERMI2QMasked128,
ssa.OpAMD64VPERMI2PDMasked256,
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPSHUFBMasked256Merging,
ssa.OpAMD64VPSHUFBMasked512Merging,
ssa.OpAMD64VPSHUFBMasked128Merging,
@ -1819,6 +1827,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
p = simdV21load(s, v)
case ssa.OpAMD64VPDPWSSD512load,
ssa.OpAMD64VPERMI2PS128load,
ssa.OpAMD64VPERMI2D128load,
ssa.OpAMD64VPERMI2PS256load,
ssa.OpAMD64VPERMI2D256load,
ssa.OpAMD64VPERMI2PS512load,
ssa.OpAMD64VPERMI2D512load,
ssa.OpAMD64VPERMI2PD128load,
ssa.OpAMD64VPERMI2Q128load,
ssa.OpAMD64VPERMI2PD256load,
ssa.OpAMD64VPERMI2Q256load,
ssa.OpAMD64VPERMI2PD512load,
ssa.OpAMD64VPERMI2Q512load,
ssa.OpAMD64VPDPBUSD512load,
ssa.OpAMD64VPDPBUSDS512load,
ssa.OpAMD64VFMADD213PS128load,
@ -1839,18 +1859,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VFMSUBADD213PD128load,
ssa.OpAMD64VFMSUBADD213PD256load,
ssa.OpAMD64VFMSUBADD213PD512load,
ssa.OpAMD64VPERMI2PS128load,
ssa.OpAMD64VPERMI2D128load,
ssa.OpAMD64VPERMI2PS256load,
ssa.OpAMD64VPERMI2D256load,
ssa.OpAMD64VPERMI2PS512load,
ssa.OpAMD64VPERMI2D512load,
ssa.OpAMD64VPERMI2PD128load,
ssa.OpAMD64VPERMI2Q128load,
ssa.OpAMD64VPERMI2PD256load,
ssa.OpAMD64VPERMI2Q256load,
ssa.OpAMD64VPERMI2PD512load,
ssa.OpAMD64VPERMI2Q512load,
ssa.OpAMD64VPSHLDVD128load,
ssa.OpAMD64VPSHLDVD256load,
ssa.OpAMD64VPSHLDVD512load,
@ -1868,6 +1876,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
case ssa.OpAMD64VPDPWSSDMasked128load,
ssa.OpAMD64VPDPWSSDMasked256load,
ssa.OpAMD64VPDPWSSDMasked512load,
ssa.OpAMD64VPERMI2PSMasked128load,
ssa.OpAMD64VPERMI2DMasked128load,
ssa.OpAMD64VPERMI2PSMasked256load,
ssa.OpAMD64VPERMI2DMasked256load,
ssa.OpAMD64VPERMI2PSMasked512load,
ssa.OpAMD64VPERMI2DMasked512load,
ssa.OpAMD64VPERMI2PDMasked128load,
ssa.OpAMD64VPERMI2QMasked128load,
ssa.OpAMD64VPERMI2PDMasked256load,
ssa.OpAMD64VPERMI2QMasked256load,
ssa.OpAMD64VPERMI2PDMasked512load,
ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPDPBUSDMasked128load,
ssa.OpAMD64VPDPBUSDMasked256load,
ssa.OpAMD64VPDPBUSDMasked512load,
@ -1892,18 +1912,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VFMSUBADD213PDMasked128load,
ssa.OpAMD64VFMSUBADD213PDMasked256load,
ssa.OpAMD64VFMSUBADD213PDMasked512load,
ssa.OpAMD64VPERMI2PSMasked128load,
ssa.OpAMD64VPERMI2DMasked128load,
ssa.OpAMD64VPERMI2PSMasked256load,
ssa.OpAMD64VPERMI2DMasked256load,
ssa.OpAMD64VPERMI2PSMasked512load,
ssa.OpAMD64VPERMI2DMasked512load,
ssa.OpAMD64VPERMI2PDMasked128load,
ssa.OpAMD64VPERMI2QMasked128load,
ssa.OpAMD64VPERMI2PDMasked256load,
ssa.OpAMD64VPERMI2QMasked256load,
ssa.OpAMD64VPERMI2PDMasked512load,
ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPSHLDVDMasked128load,
ssa.OpAMD64VPSHLDVDMasked256load,
ssa.OpAMD64VPSHLDVDMasked512load,
@ -2124,7 +2132,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VREDUCEPD128load,
ssa.OpAMD64VREDUCEPD256load,
ssa.OpAMD64VREDUCEPD512load,
ssa.OpAMD64VPSHUFD512load,
ssa.OpAMD64VPROLD128load,
ssa.OpAMD64VPROLD256load,
ssa.OpAMD64VPROLD512load,
@ -2137,6 +2144,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPRORQ128load,
ssa.OpAMD64VPRORQ256load,
ssa.OpAMD64VPRORQ512load,
ssa.OpAMD64VPSHUFD512load,
ssa.OpAMD64VPSLLD512constload,
ssa.OpAMD64VPSLLQ512constload,
ssa.OpAMD64VPSRLD512constload,
@ -2159,9 +2167,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VREDUCEPDMasked128load,
ssa.OpAMD64VREDUCEPDMasked256load,
ssa.OpAMD64VREDUCEPDMasked512load,
ssa.OpAMD64VPSHUFDMasked256load,
ssa.OpAMD64VPSHUFDMasked512load,
ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPROLDMasked128load,
ssa.OpAMD64VPROLDMasked256load,
ssa.OpAMD64VPROLDMasked512load,
@ -2174,6 +2179,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPRORQMasked128load,
ssa.OpAMD64VPRORQMasked256load,
ssa.OpAMD64VPRORQMasked512load,
ssa.OpAMD64VPSHUFDMasked256load,
ssa.OpAMD64VPSHUFDMasked512load,
ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPSLLDMasked128constload,
ssa.OpAMD64VPSLLDMasked256constload,
ssa.OpAMD64VPSLLDMasked512constload,
@ -2447,12 +2455,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPOPCNTQMasked128Merging,
ssa.OpAMD64VPOPCNTQMasked256Merging,
ssa.OpAMD64VPOPCNTQMasked512Merging,
ssa.OpAMD64VPSHUFDMasked256Merging,
ssa.OpAMD64VPSHUFDMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked256Merging,
ssa.OpAMD64VPSHUFHWMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked128Merging,
ssa.OpAMD64VPSHUFDMasked128Merging,
ssa.OpAMD64VRCP14PSMasked128Merging,
ssa.OpAMD64VRCP14PSMasked256Merging,
ssa.OpAMD64VRCP14PSMasked512Merging,
@ -2483,6 +2485,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VSQRTPDMasked128Merging,
ssa.OpAMD64VSQRTPDMasked256Merging,
ssa.OpAMD64VSQRTPDMasked512Merging,
ssa.OpAMD64VPSHUFDMasked256Merging,
ssa.OpAMD64VPSHUFDMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked256Merging,
ssa.OpAMD64VPSHUFHWMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked128Merging,
ssa.OpAMD64VPSHUFLWMasked256Merging,
ssa.OpAMD64VPSHUFLWMasked512Merging,
ssa.OpAMD64VPSHUFLWMasked128Merging,
ssa.OpAMD64VPSHUFDMasked128Merging,
ssa.OpAMD64VPSLLWMasked128constMerging,
ssa.OpAMD64VPSLLWMasked256constMerging,
ssa.OpAMD64VPSLLWMasked512constMerging,
@ -2674,6 +2685,36 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPCOMPRESSQMasked128,
ssa.OpAMD64VPCOMPRESSQMasked256,
ssa.OpAMD64VPCOMPRESSQMasked512,
ssa.OpAMD64VPERMI2BMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
ssa.OpAMD64VPERMI2WMasked128,
ssa.OpAMD64VPERMI2WMasked256,
ssa.OpAMD64VPERMI2WMasked512,
ssa.OpAMD64VPERMI2PSMasked128,
ssa.OpAMD64VPERMI2PSMasked128load,
ssa.OpAMD64VPERMI2DMasked128,
ssa.OpAMD64VPERMI2DMasked128load,
ssa.OpAMD64VPERMI2PSMasked256,
ssa.OpAMD64VPERMI2PSMasked256load,
ssa.OpAMD64VPERMI2DMasked256,
ssa.OpAMD64VPERMI2DMasked256load,
ssa.OpAMD64VPERMI2PSMasked512,
ssa.OpAMD64VPERMI2PSMasked512load,
ssa.OpAMD64VPERMI2DMasked512,
ssa.OpAMD64VPERMI2DMasked512load,
ssa.OpAMD64VPERMI2PDMasked128,
ssa.OpAMD64VPERMI2PDMasked128load,
ssa.OpAMD64VPERMI2QMasked128,
ssa.OpAMD64VPERMI2QMasked128load,
ssa.OpAMD64VPERMI2PDMasked256,
ssa.OpAMD64VPERMI2PDMasked256load,
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2QMasked256load,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2PDMasked512load,
ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPALIGNRMasked256,
ssa.OpAMD64VPALIGNRMasked512,
ssa.OpAMD64VPALIGNRMasked128,
@ -3061,48 +3102,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPORQMasked256load,
ssa.OpAMD64VPORQMasked512,
ssa.OpAMD64VPORQMasked512load,
ssa.OpAMD64VPERMI2BMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
ssa.OpAMD64VPERMI2WMasked128,
ssa.OpAMD64VPERMI2WMasked256,
ssa.OpAMD64VPERMI2WMasked512,
ssa.OpAMD64VPERMI2PSMasked128,
ssa.OpAMD64VPERMI2PSMasked128load,
ssa.OpAMD64VPERMI2DMasked128,
ssa.OpAMD64VPERMI2DMasked128load,
ssa.OpAMD64VPERMI2PSMasked256,
ssa.OpAMD64VPERMI2PSMasked256load,
ssa.OpAMD64VPERMI2DMasked256,
ssa.OpAMD64VPERMI2DMasked256load,
ssa.OpAMD64VPERMI2PSMasked512,
ssa.OpAMD64VPERMI2PSMasked512load,
ssa.OpAMD64VPERMI2DMasked512,
ssa.OpAMD64VPERMI2DMasked512load,
ssa.OpAMD64VPERMI2PDMasked128,
ssa.OpAMD64VPERMI2PDMasked128load,
ssa.OpAMD64VPERMI2QMasked128,
ssa.OpAMD64VPERMI2QMasked128load,
ssa.OpAMD64VPERMI2PDMasked256,
ssa.OpAMD64VPERMI2PDMasked256load,
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2QMasked256load,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2PDMasked512load,
ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked256load,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFDMasked512load,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPSHUFBMasked256,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPERMBMasked128,
ssa.OpAMD64VPERMBMasked256,
ssa.OpAMD64VPERMBMasked512,
ssa.OpAMD64VPERMWMasked128,
@ -3124,6 +3124,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPERMPDMasked512load,
ssa.OpAMD64VPERMQMasked512,
ssa.OpAMD64VPERMQMasked512load,
ssa.OpAMD64VPSHUFBMasked256,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VRCP14PSMasked128,
ssa.OpAMD64VRCP14PSMasked128load,
ssa.OpAMD64VRCP14PSMasked256,
@ -3418,6 +3421,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VMOVDQU64Masked128,
ssa.OpAMD64VMOVDQU64Masked256,
ssa.OpAMD64VMOVDQU64Masked512,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked256load,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFDMasked512load,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFLWMasked256,
ssa.OpAMD64VPSHUFLWMasked512,
ssa.OpAMD64VPSHUFLWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPSLLWMasked128const,
ssa.OpAMD64VPSLLWMasked256const,
ssa.OpAMD64VPSLLWMasked512const,

View file

@ -216,6 +216,36 @@
(CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
(CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
(CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
(ConcatPermuteFloat32x4 ...) => (VPERMI2PS128 ...)
(ConcatPermuteFloat32x8 ...) => (VPERMI2PS256 ...)
(ConcatPermuteFloat32x16 ...) => (VPERMI2PS512 ...)
(ConcatPermuteFloat64x2 ...) => (VPERMI2PD128 ...)
(ConcatPermuteFloat64x4 ...) => (VPERMI2PD256 ...)
(ConcatPermuteFloat64x8 ...) => (VPERMI2PD512 ...)
(ConcatPermuteInt8x16 ...) => (VPERMI2B128 ...)
(ConcatPermuteInt8x32 ...) => (VPERMI2B256 ...)
(ConcatPermuteInt8x64 ...) => (VPERMI2B512 ...)
(ConcatPermuteInt16x8 ...) => (VPERMI2W128 ...)
(ConcatPermuteInt16x16 ...) => (VPERMI2W256 ...)
(ConcatPermuteInt16x32 ...) => (VPERMI2W512 ...)
(ConcatPermuteInt32x4 ...) => (VPERMI2D128 ...)
(ConcatPermuteInt32x8 ...) => (VPERMI2D256 ...)
(ConcatPermuteInt32x16 ...) => (VPERMI2D512 ...)
(ConcatPermuteInt64x2 ...) => (VPERMI2Q128 ...)
(ConcatPermuteInt64x4 ...) => (VPERMI2Q256 ...)
(ConcatPermuteInt64x8 ...) => (VPERMI2Q512 ...)
(ConcatPermuteUint8x16 ...) => (VPERMI2B128 ...)
(ConcatPermuteUint8x32 ...) => (VPERMI2B256 ...)
(ConcatPermuteUint8x64 ...) => (VPERMI2B512 ...)
(ConcatPermuteUint16x8 ...) => (VPERMI2W128 ...)
(ConcatPermuteUint16x16 ...) => (VPERMI2W256 ...)
(ConcatPermuteUint16x32 ...) => (VPERMI2W512 ...)
(ConcatPermuteUint32x4 ...) => (VPERMI2D128 ...)
(ConcatPermuteUint32x8 ...) => (VPERMI2D256 ...)
(ConcatPermuteUint32x16 ...) => (VPERMI2D512 ...)
(ConcatPermuteUint64x2 ...) => (VPERMI2Q128 ...)
(ConcatPermuteUint64x4 ...) => (VPERMI2Q256 ...)
(ConcatPermuteUint64x8 ...) => (VPERMI2Q512 ...)
(ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...)
(ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...)
(ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...)
@ -794,7 +824,7 @@
(PermuteFloat32x16 ...) => (VPERMPS512 ...)
(PermuteFloat64x4 ...) => (VPERMPD256 ...)
(PermuteFloat64x8 ...) => (VPERMPD512 ...)
(PermuteInt8x16 ...) => (VPSHUFB128 ...)
(PermuteInt8x16 ...) => (VPERMB128 ...)
(PermuteInt8x32 ...) => (VPERMB256 ...)
(PermuteInt8x64 ...) => (VPERMB512 ...)
(PermuteInt16x8 ...) => (VPERMW128 ...)
@ -804,7 +834,7 @@
(PermuteInt32x16 ...) => (VPERMD512 ...)
(PermuteInt64x4 ...) => (VPERMQ256 ...)
(PermuteInt64x8 ...) => (VPERMQ512 ...)
(PermuteUint8x16 ...) => (VPSHUFB128 ...)
(PermuteUint8x16 ...) => (VPERMB128 ...)
(PermuteUint8x32 ...) => (VPERMB256 ...)
(PermuteUint8x64 ...) => (VPERMB512 ...)
(PermuteUint16x8 ...) => (VPERMW128 ...)
@ -814,62 +844,12 @@
(PermuteUint32x16 ...) => (VPERMD512 ...)
(PermuteUint64x4 ...) => (VPERMQ256 ...)
(PermuteUint64x8 ...) => (VPERMQ512 ...)
(Permute2Float32x4 ...) => (VPERMI2PS128 ...)
(Permute2Float32x8 ...) => (VPERMI2PS256 ...)
(Permute2Float32x16 ...) => (VPERMI2PS512 ...)
(Permute2Float64x2 ...) => (VPERMI2PD128 ...)
(Permute2Float64x4 ...) => (VPERMI2PD256 ...)
(Permute2Float64x8 ...) => (VPERMI2PD512 ...)
(Permute2Int8x16 ...) => (VPERMI2B128 ...)
(Permute2Int8x32 ...) => (VPERMI2B256 ...)
(Permute2Int8x64 ...) => (VPERMI2B512 ...)
(Permute2Int16x8 ...) => (VPERMI2W128 ...)
(Permute2Int16x16 ...) => (VPERMI2W256 ...)
(Permute2Int16x32 ...) => (VPERMI2W512 ...)
(Permute2Int32x4 ...) => (VPERMI2D128 ...)
(Permute2Int32x8 ...) => (VPERMI2D256 ...)
(Permute2Int32x16 ...) => (VPERMI2D512 ...)
(Permute2Int64x2 ...) => (VPERMI2Q128 ...)
(Permute2Int64x4 ...) => (VPERMI2Q256 ...)
(Permute2Int64x8 ...) => (VPERMI2Q512 ...)
(Permute2Uint8x16 ...) => (VPERMI2B128 ...)
(Permute2Uint8x32 ...) => (VPERMI2B256 ...)
(Permute2Uint8x64 ...) => (VPERMI2B512 ...)
(Permute2Uint16x8 ...) => (VPERMI2W128 ...)
(Permute2Uint16x16 ...) => (VPERMI2W256 ...)
(Permute2Uint16x32 ...) => (VPERMI2W512 ...)
(Permute2Uint32x4 ...) => (VPERMI2D128 ...)
(Permute2Uint32x8 ...) => (VPERMI2D256 ...)
(Permute2Uint32x16 ...) => (VPERMI2D512 ...)
(Permute2Uint64x2 ...) => (VPERMI2Q128 ...)
(Permute2Uint64x4 ...) => (VPERMI2Q256 ...)
(Permute2Uint64x8 ...) => (VPERMI2Q512 ...)
(PermuteConstantInt32x4 ...) => (VPSHUFD128 ...)
(PermuteConstantUint32x4 ...) => (VPSHUFD128 ...)
(PermuteConstantGroupedInt32x8 ...) => (VPSHUFD256 ...)
(PermuteConstantGroupedInt32x16 ...) => (VPSHUFD512 ...)
(PermuteConstantGroupedUint32x8 ...) => (VPSHUFD256 ...)
(PermuteConstantGroupedUint32x16 ...) => (VPSHUFD512 ...)
(PermuteConstantHiInt16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiInt32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiUint16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiUint32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
(PermuteConstantHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
(PermuteConstantLoInt16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoInt32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoUint16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoUint32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoGroupedInt16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantLoGroupedInt16x32 ...) => (VPSHUFHW512 ...)
(PermuteConstantLoGroupedUint16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantLoGroupedUint16x32 ...) => (VPSHUFHW512 ...)
(PermuteGroupedInt8x32 ...) => (VPSHUFB256 ...)
(PermuteGroupedInt8x64 ...) => (VPSHUFB512 ...)
(PermuteGroupedUint8x32 ...) => (VPSHUFB256 ...)
(PermuteGroupedUint8x64 ...) => (VPSHUFB512 ...)
(PermuteOrZeroInt8x16 ...) => (VPSHUFB128 ...)
(PermuteOrZeroUint8x16 ...) => (VPSHUFB128 ...)
(PermuteOrZeroGroupedInt8x32 ...) => (VPSHUFB256 ...)
(PermuteOrZeroGroupedInt8x64 ...) => (VPSHUFB512 ...)
(PermuteOrZeroGroupedUint8x32 ...) => (VPSHUFB256 ...)
(PermuteOrZeroGroupedUint8x64 ...) => (VPSHUFB512 ...)
(ReciprocalFloat32x4 ...) => (VRCPPS128 ...)
(ReciprocalFloat32x8 ...) => (VRCPPS256 ...)
(ReciprocalFloat32x16 ...) => (VRCP14PS512 ...)
@ -1324,6 +1304,24 @@
(concatSelectedConstantGroupedUint32x16 ...) => (VSHUFPS512 ...)
(concatSelectedConstantGroupedUint64x4 ...) => (VSHUFPD256 ...)
(concatSelectedConstantGroupedUint64x8 ...) => (VSHUFPD512 ...)
(permuteScalarsInt32x4 ...) => (VPSHUFD128 ...)
(permuteScalarsUint32x4 ...) => (VPSHUFD128 ...)
(permuteScalarsGroupedInt32x8 ...) => (VPSHUFD256 ...)
(permuteScalarsGroupedInt32x16 ...) => (VPSHUFD512 ...)
(permuteScalarsGroupedUint32x8 ...) => (VPSHUFD256 ...)
(permuteScalarsGroupedUint32x16 ...) => (VPSHUFD512 ...)
(permuteScalarsHiInt16x8 ...) => (VPSHUFHW128 ...)
(permuteScalarsHiUint16x8 ...) => (VPSHUFHW128 ...)
(permuteScalarsHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
(permuteScalarsHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
(permuteScalarsHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
(permuteScalarsHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
(permuteScalarsLoInt16x8 ...) => (VPSHUFLW128 ...)
(permuteScalarsLoUint16x8 ...) => (VPSHUFLW128 ...)
(permuteScalarsLoGroupedInt16x16 ...) => (VPSHUFLW256 ...)
(permuteScalarsLoGroupedInt16x32 ...) => (VPSHUFLW512 ...)
(permuteScalarsLoGroupedUint16x16 ...) => (VPSHUFLW256 ...)
(permuteScalarsLoGroupedUint16x32 ...) => (VPSHUFLW512 ...)
(ternInt32x4 ...) => (VPTERNLOGD128 ...)
(ternInt32x8 ...) => (VPTERNLOGD256 ...)
(ternInt32x16 ...) => (VPTERNLOGD512 ...)
@ -1417,6 +1415,24 @@
(VMOVDQU64Masked128 (VREDUCEPD128 [a] x) mask) => (VREDUCEPDMasked128 [a] x mask)
(VMOVDQU64Masked256 (VREDUCEPD256 [a] x) mask) => (VREDUCEPDMasked256 [a] x mask)
(VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask)
(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask)
(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask)
(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask)
(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask)
(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask)
(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask)
(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask)
(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask)
(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask)
(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask)
(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask)
(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask)
(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask)
(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask)
(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask)
(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
(VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) => (VPALIGNRMasked256 [a] x y mask)
(VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512 [a] x y mask)
(VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) => (VPALIGNRMasked128 [a] x y mask)
@ -1668,33 +1684,7 @@
(VMOVDQU64Masked512 (VPOPCNTQ512 x) mask) => (VPOPCNTQMasked512 x mask)
(VMOVDQU32Masked512 (VPORD512 x y) mask) => (VPORDMasked512 x y mask)
(VMOVDQU64Masked512 (VPORQ512 x y) mask) => (VPORQMasked512 x y mask)
(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask)
(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask)
(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask)
(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask)
(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask)
(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask)
(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask)
(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask)
(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask)
(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask)
(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask)
(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask)
(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask)
(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask)
(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask)
(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask)
(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask)
(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask)
(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask)
(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask)
(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask)
(VMOVDQU8Masked128 (VPERMB128 x y) mask) => (VPERMBMasked128 x y mask)
(VMOVDQU8Masked256 (VPERMB256 x y) mask) => (VPERMBMasked256 x y mask)
(VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask)
(VMOVDQU16Masked128 (VPERMW128 x y) mask) => (VPERMWMasked128 x y mask)
@ -1708,6 +1698,9 @@
(VMOVDQU64Masked256 (VPERMQ256 x y) mask) => (VPERMQMasked256 x y mask)
(VMOVDQU64Masked512 (VPERMPD512 x y) mask) => (VPERMPDMasked512 x y mask)
(VMOVDQU64Masked512 (VPERMQ512 x y) mask) => (VPERMQMasked512 x y mask)
(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask)
(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask)
(VMOVDQU32Masked512 (VRCP14PS512 x) mask) => (VRCP14PSMasked512 x mask)
(VMOVDQU64Masked128 (VRCP14PD128 x) mask) => (VRCP14PDMasked128 x mask)
(VMOVDQU64Masked256 (VRCP14PD256 x) mask) => (VRCP14PDMasked256 x mask)
@ -1874,6 +1867,15 @@
(VMOVDQU16Masked512 (VPSUBUSW512 x y) mask) => (VPSUBUSWMasked512 x y mask)
(VMOVDQU32Masked512 (VPXORD512 x y) mask) => (VPXORDMasked512 x y mask)
(VMOVDQU64Masked512 (VPXORQ512 x y) mask) => (VPXORQMasked512 x y mask)
(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask)
(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask)
(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask)
(VMOVDQU16Masked256 (VPSHUFLW256 [a] x) mask) => (VPSHUFLWMasked256 [a] x mask)
(VMOVDQU16Masked512 (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512 [a] x mask)
(VMOVDQU16Masked128 (VPSHUFLW128 [a] x) mask) => (VPSHUFLWMasked128 [a] x mask)
(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask)
(VMOVDQU16Masked128 (VPSLLW128const [a] x) mask) => (VPSLLWMasked128const [a] x mask)
(VMOVDQU16Masked256 (VPSLLW256const [a] x) mask) => (VPSLLWMasked256const [a] x mask)
(VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) => (VPSLLWMasked512const [a] x mask)
@ -2021,6 +2023,7 @@
(VPBLENDMWMasked512 dst (VPSHLDW512 [a] x y) mask) => (VPSHLDWMasked512Merging dst [a] x y mask)
(VPBLENDMWMasked512 dst (VPSHRDW512 [a] x y) mask) => (VPSHRDWMasked512Merging dst [a] x y mask)
(VPBLENDMWMasked512 dst (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512Merging dst [a] x mask)
(VPBLENDMWMasked512 dst (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512Merging dst [a] x mask)
(VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask) => (VPSLLVWMasked512Merging dst x y mask)
(VPBLENDMWMasked512 dst (VPSLLW512const [a] x) mask) => (VPSLLWMasked512constMerging dst [a] x mask)
(VPBLENDMWMasked512 dst (VPSRAVW512 x y) mask) => (VPSRAVWMasked512Merging dst x y mask)
@ -2170,6 +2173,7 @@
(VPBLENDVB128 dst (VPSHUFB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFD128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked128Merging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFHW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFLW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSLLD128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSLLQ128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked128constMerging dst [a] x (VPMOVVec64x2ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSLLVD128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked128Merging dst x y (VPMOVVec32x4ToM <types.TypeMask> mask))
@ -2305,6 +2309,7 @@
(VPBLENDVB256 dst (VPSHUFB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFD256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked256Merging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFHW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFLW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSLLD256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSLLQ256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked256constMerging dst [a] x (VPMOVVec64x4ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSLLVD256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked256Merging dst x y (VPMOVVec32x8ToM <types.TypeMask> mask))
@ -2410,6 +2415,30 @@
(VREDUCEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VREDUCEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VREDUCEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem)
(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem)
(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem)
(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem)
(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem)
(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem)
(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem)
(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem)
(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem)
(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem)
(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem)
(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem)
(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem)
(VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem)
(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
(VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem)
@ -2636,34 +2665,6 @@
(VPERMQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ256load {sym} [off] x ptr mem)
(VPERMPD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMPD512load {sym} [off] x ptr mem)
(VPERMQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ512load {sym} [off] x ptr mem)
(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem)
(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem)
(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem)
(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem)
(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem)
(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem)
(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem)
(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem)
(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem)
(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem)
(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem)
(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem)
(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem)
(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPERMPSMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked256load {sym} [off] x ptr mask mem)
(VPERMDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMDMasked256load {sym} [off] x ptr mask mem)
(VPERMPSMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked512load {sym} [off] x ptr mask mem)
@ -2862,6 +2863,10 @@
(VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem)
(VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
(VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)

View file

@ -383,8 +383,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPDPWSSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPDPWSSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPDPWSSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
@ -1310,6 +1312,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHUFHWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFHWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFHWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHUFLW128", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFLW256", argLength: 1, reg: v11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFLW512", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHUFLWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFLWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFLWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSLLD128const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSLLD256const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSLLD512const", argLength: 1, reg: w11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
@ -2392,6 +2400,9 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHUFHWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSHUFHWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSHUFHWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPSHUFLWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSHUFLWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSHUFLWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPSLLDMasked128constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSLLDMasked256constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSLLDMasked512constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},

View file

@ -207,6 +207,36 @@ func simdGenericOps() []opData {
{name: "CompressUint64x2", argLength: 2, commutative: false},
{name: "CompressUint64x4", argLength: 2, commutative: false},
{name: "CompressUint64x8", argLength: 2, commutative: false},
{name: "ConcatPermuteFloat32x4", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat32x8", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat32x16", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat64x2", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat64x4", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat64x8", argLength: 3, commutative: false},
{name: "ConcatPermuteInt8x16", argLength: 3, commutative: false},
{name: "ConcatPermuteInt8x32", argLength: 3, commutative: false},
{name: "ConcatPermuteInt8x64", argLength: 3, commutative: false},
{name: "ConcatPermuteInt16x8", argLength: 3, commutative: false},
{name: "ConcatPermuteInt16x16", argLength: 3, commutative: false},
{name: "ConcatPermuteInt16x32", argLength: 3, commutative: false},
{name: "ConcatPermuteInt32x4", argLength: 3, commutative: false},
{name: "ConcatPermuteInt32x8", argLength: 3, commutative: false},
{name: "ConcatPermuteInt32x16", argLength: 3, commutative: false},
{name: "ConcatPermuteInt64x2", argLength: 3, commutative: false},
{name: "ConcatPermuteInt64x4", argLength: 3, commutative: false},
{name: "ConcatPermuteInt64x8", argLength: 3, commutative: false},
{name: "ConcatPermuteUint8x16", argLength: 3, commutative: false},
{name: "ConcatPermuteUint8x32", argLength: 3, commutative: false},
{name: "ConcatPermuteUint8x64", argLength: 3, commutative: false},
{name: "ConcatPermuteUint16x8", argLength: 3, commutative: false},
{name: "ConcatPermuteUint16x16", argLength: 3, commutative: false},
{name: "ConcatPermuteUint16x32", argLength: 3, commutative: false},
{name: "ConcatPermuteUint32x4", argLength: 3, commutative: false},
{name: "ConcatPermuteUint32x8", argLength: 3, commutative: false},
{name: "ConcatPermuteUint32x16", argLength: 3, commutative: false},
{name: "ConcatPermuteUint64x2", argLength: 3, commutative: false},
{name: "ConcatPermuteUint64x4", argLength: 3, commutative: false},
{name: "ConcatPermuteUint64x8", argLength: 3, commutative: false},
{name: "ConvertToInt8Int16x8", argLength: 1, commutative: false},
{name: "ConvertToInt8Int16x16", argLength: 1, commutative: false},
{name: "ConvertToInt8Int16x32", argLength: 1, commutative: false},
@ -750,44 +780,10 @@ func simdGenericOps() []opData {
{name: "OrUint64x2", argLength: 2, commutative: true},
{name: "OrUint64x4", argLength: 2, commutative: true},
{name: "OrUint64x8", argLength: 2, commutative: true},
{name: "Permute2Float32x4", argLength: 3, commutative: false},
{name: "Permute2Float32x8", argLength: 3, commutative: false},
{name: "Permute2Float32x16", argLength: 3, commutative: false},
{name: "Permute2Float64x2", argLength: 3, commutative: false},
{name: "Permute2Float64x4", argLength: 3, commutative: false},
{name: "Permute2Float64x8", argLength: 3, commutative: false},
{name: "Permute2Int8x16", argLength: 3, commutative: false},
{name: "Permute2Int8x32", argLength: 3, commutative: false},
{name: "Permute2Int8x64", argLength: 3, commutative: false},
{name: "Permute2Int16x8", argLength: 3, commutative: false},
{name: "Permute2Int16x16", argLength: 3, commutative: false},
{name: "Permute2Int16x32", argLength: 3, commutative: false},
{name: "Permute2Int32x4", argLength: 3, commutative: false},
{name: "Permute2Int32x8", argLength: 3, commutative: false},
{name: "Permute2Int32x16", argLength: 3, commutative: false},
{name: "Permute2Int64x2", argLength: 3, commutative: false},
{name: "Permute2Int64x4", argLength: 3, commutative: false},
{name: "Permute2Int64x8", argLength: 3, commutative: false},
{name: "Permute2Uint8x16", argLength: 3, commutative: false},
{name: "Permute2Uint8x32", argLength: 3, commutative: false},
{name: "Permute2Uint8x64", argLength: 3, commutative: false},
{name: "Permute2Uint16x8", argLength: 3, commutative: false},
{name: "Permute2Uint16x16", argLength: 3, commutative: false},
{name: "Permute2Uint16x32", argLength: 3, commutative: false},
{name: "Permute2Uint32x4", argLength: 3, commutative: false},
{name: "Permute2Uint32x8", argLength: 3, commutative: false},
{name: "Permute2Uint32x16", argLength: 3, commutative: false},
{name: "Permute2Uint64x2", argLength: 3, commutative: false},
{name: "Permute2Uint64x4", argLength: 3, commutative: false},
{name: "Permute2Uint64x8", argLength: 3, commutative: false},
{name: "PermuteFloat32x8", argLength: 2, commutative: false},
{name: "PermuteFloat32x16", argLength: 2, commutative: false},
{name: "PermuteFloat64x4", argLength: 2, commutative: false},
{name: "PermuteFloat64x8", argLength: 2, commutative: false},
{name: "PermuteGroupedInt8x32", argLength: 2, commutative: false},
{name: "PermuteGroupedInt8x64", argLength: 2, commutative: false},
{name: "PermuteGroupedUint8x32", argLength: 2, commutative: false},
{name: "PermuteGroupedUint8x64", argLength: 2, commutative: false},
{name: "PermuteInt8x16", argLength: 2, commutative: false},
{name: "PermuteInt8x32", argLength: 2, commutative: false},
{name: "PermuteInt8x64", argLength: 2, commutative: false},
@ -798,6 +794,12 @@ func simdGenericOps() []opData {
{name: "PermuteInt32x16", argLength: 2, commutative: false},
{name: "PermuteInt64x4", argLength: 2, commutative: false},
{name: "PermuteInt64x8", argLength: 2, commutative: false},
{name: "PermuteOrZeroGroupedInt8x32", argLength: 2, commutative: false},
{name: "PermuteOrZeroGroupedInt8x64", argLength: 2, commutative: false},
{name: "PermuteOrZeroGroupedUint8x32", argLength: 2, commutative: false},
{name: "PermuteOrZeroGroupedUint8x64", argLength: 2, commutative: false},
{name: "PermuteOrZeroInt8x16", argLength: 2, commutative: false},
{name: "PermuteOrZeroUint8x16", argLength: 2, commutative: false},
{name: "PermuteUint8x16", argLength: 2, commutative: false},
{name: "PermuteUint8x32", argLength: 2, commutative: false},
{name: "PermuteUint8x64", argLength: 2, commutative: false},
@ -1151,28 +1153,6 @@ func simdGenericOps() []opData {
{name: "GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
@ -1292,6 +1272,24 @@ func simdGenericOps() []opData {
{name: "concatSelectedConstantInt64x2", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantUint32x4", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantUint64x2", argLength: 2, commutative: false, aux: "UInt8"},
{name: "permuteScalarsGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "ternInt32x4", argLength: 3, commutative: false, aux: "UInt8"},
{name: "ternInt32x8", argLength: 3, commutative: false, aux: "UInt8"},
{name: "ternInt32x16", argLength: 3, commutative: false, aux: "UInt8"},

View file

@ -1624,8 +1624,10 @@ const (
OpAMD64VPDPWSSDMasked128
OpAMD64VPDPWSSDMasked256
OpAMD64VPDPWSSDMasked512
OpAMD64VPERMB128
OpAMD64VPERMB256
OpAMD64VPERMB512
OpAMD64VPERMBMasked128
OpAMD64VPERMBMasked256
OpAMD64VPERMBMasked512
OpAMD64VPERMD256
@ -2551,6 +2553,12 @@ const (
OpAMD64VPSHUFHWMasked128
OpAMD64VPSHUFHWMasked256
OpAMD64VPSHUFHWMasked512
OpAMD64VPSHUFLW128
OpAMD64VPSHUFLW256
OpAMD64VPSHUFLW512
OpAMD64VPSHUFLWMasked128
OpAMD64VPSHUFLWMasked256
OpAMD64VPSHUFLWMasked512
OpAMD64VPSLLD128const
OpAMD64VPSLLD256const
OpAMD64VPSLLD512const
@ -3633,6 +3641,9 @@ const (
OpAMD64VPSHUFHWMasked128Merging
OpAMD64VPSHUFHWMasked256Merging
OpAMD64VPSHUFHWMasked512Merging
OpAMD64VPSHUFLWMasked128Merging
OpAMD64VPSHUFLWMasked256Merging
OpAMD64VPSHUFLWMasked512Merging
OpAMD64VPSLLDMasked128constMerging
OpAMD64VPSLLDMasked256constMerging
OpAMD64VPSLLDMasked512constMerging
@ -6155,6 +6166,36 @@ const (
OpCompressUint64x2
OpCompressUint64x4
OpCompressUint64x8
OpConcatPermuteFloat32x4
OpConcatPermuteFloat32x8
OpConcatPermuteFloat32x16
OpConcatPermuteFloat64x2
OpConcatPermuteFloat64x4
OpConcatPermuteFloat64x8
OpConcatPermuteInt8x16
OpConcatPermuteInt8x32
OpConcatPermuteInt8x64
OpConcatPermuteInt16x8
OpConcatPermuteInt16x16
OpConcatPermuteInt16x32
OpConcatPermuteInt32x4
OpConcatPermuteInt32x8
OpConcatPermuteInt32x16
OpConcatPermuteInt64x2
OpConcatPermuteInt64x4
OpConcatPermuteInt64x8
OpConcatPermuteUint8x16
OpConcatPermuteUint8x32
OpConcatPermuteUint8x64
OpConcatPermuteUint16x8
OpConcatPermuteUint16x16
OpConcatPermuteUint16x32
OpConcatPermuteUint32x4
OpConcatPermuteUint32x8
OpConcatPermuteUint32x16
OpConcatPermuteUint64x2
OpConcatPermuteUint64x4
OpConcatPermuteUint64x8
OpConvertToInt8Int16x8
OpConvertToInt8Int16x16
OpConvertToInt8Int16x32
@ -6698,44 +6739,10 @@ const (
OpOrUint64x2
OpOrUint64x4
OpOrUint64x8
OpPermute2Float32x4
OpPermute2Float32x8
OpPermute2Float32x16
OpPermute2Float64x2
OpPermute2Float64x4
OpPermute2Float64x8
OpPermute2Int8x16
OpPermute2Int8x32
OpPermute2Int8x64
OpPermute2Int16x8
OpPermute2Int16x16
OpPermute2Int16x32
OpPermute2Int32x4
OpPermute2Int32x8
OpPermute2Int32x16
OpPermute2Int64x2
OpPermute2Int64x4
OpPermute2Int64x8
OpPermute2Uint8x16
OpPermute2Uint8x32
OpPermute2Uint8x64
OpPermute2Uint16x8
OpPermute2Uint16x16
OpPermute2Uint16x32
OpPermute2Uint32x4
OpPermute2Uint32x8
OpPermute2Uint32x16
OpPermute2Uint64x2
OpPermute2Uint64x4
OpPermute2Uint64x8
OpPermuteFloat32x8
OpPermuteFloat32x16
OpPermuteFloat64x4
OpPermuteFloat64x8
OpPermuteGroupedInt8x32
OpPermuteGroupedInt8x64
OpPermuteGroupedUint8x32
OpPermuteGroupedUint8x64
OpPermuteInt8x16
OpPermuteInt8x32
OpPermuteInt8x64
@ -6746,6 +6753,12 @@ const (
OpPermuteInt32x16
OpPermuteInt64x4
OpPermuteInt64x8
OpPermuteOrZeroGroupedInt8x32
OpPermuteOrZeroGroupedInt8x64
OpPermuteOrZeroGroupedUint8x32
OpPermuteOrZeroGroupedUint8x64
OpPermuteOrZeroInt8x16
OpPermuteOrZeroUint8x16
OpPermuteUint8x16
OpPermuteUint8x32
OpPermuteUint8x64
@ -7099,28 +7112,6 @@ const (
OpGetElemUint16x8
OpGetElemUint32x4
OpGetElemUint64x2
OpPermuteConstantGroupedInt32x8
OpPermuteConstantGroupedInt32x16
OpPermuteConstantGroupedUint32x8
OpPermuteConstantGroupedUint32x16
OpPermuteConstantHiGroupedInt16x16
OpPermuteConstantHiGroupedInt16x32
OpPermuteConstantHiGroupedUint16x16
OpPermuteConstantHiGroupedUint16x32
OpPermuteConstantHiInt16x8
OpPermuteConstantHiInt32x4
OpPermuteConstantHiUint16x8
OpPermuteConstantHiUint32x4
OpPermuteConstantInt32x4
OpPermuteConstantLoGroupedInt16x16
OpPermuteConstantLoGroupedInt16x32
OpPermuteConstantLoGroupedUint16x16
OpPermuteConstantLoGroupedUint16x32
OpPermuteConstantLoInt16x8
OpPermuteConstantLoInt32x4
OpPermuteConstantLoUint16x8
OpPermuteConstantLoUint32x4
OpPermuteConstantUint32x4
OpRotateAllLeftInt32x4
OpRotateAllLeftInt32x8
OpRotateAllLeftInt32x16
@ -7240,6 +7231,24 @@ const (
OpconcatSelectedConstantInt64x2
OpconcatSelectedConstantUint32x4
OpconcatSelectedConstantUint64x2
OppermuteScalarsGroupedInt32x8
OppermuteScalarsGroupedInt32x16
OppermuteScalarsGroupedUint32x8
OppermuteScalarsGroupedUint32x16
OppermuteScalarsHiGroupedInt16x16
OppermuteScalarsHiGroupedInt16x32
OppermuteScalarsHiGroupedUint16x16
OppermuteScalarsHiGroupedUint16x32
OppermuteScalarsHiInt16x8
OppermuteScalarsHiUint16x8
OppermuteScalarsInt32x4
OppermuteScalarsLoGroupedInt16x16
OppermuteScalarsLoGroupedInt16x32
OppermuteScalarsLoGroupedUint16x16
OppermuteScalarsLoGroupedUint16x32
OppermuteScalarsLoInt16x8
OppermuteScalarsLoUint16x8
OppermuteScalarsUint32x4
OpternInt32x4
OpternInt32x8
OpternInt32x16
@ -26142,6 +26151,20 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "VPERMB128",
argLen: 2,
asm: x86.AVPERMB,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPERMB256",
argLen: 2,
@ -26170,6 +26193,21 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "VPERMBMasked128",
argLen: 3,
asm: x86.AVPERMB,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPERMBMasked256",
argLen: 3,
@ -39744,6 +39782,93 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "VPSHUFLW128",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLW256",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPSHUFLW512",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked128",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked256",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked512",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSLLD128const",
auxType: auxUInt8,
@ -57607,6 +57732,57 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "VPSHUFLWMasked128Merging",
auxType: auxUInt8,
argLen: 3,
resultInArg0: true,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked256Merging",
auxType: auxUInt8,
argLen: 3,
resultInArg0: true,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked512Merging",
auxType: auxUInt8,
argLen: 3,
resultInArg0: true,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSLLDMasked128constMerging",
auxType: auxUInt8,
@ -86874,6 +87050,156 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "ConcatPermuteFloat32x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat32x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat32x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat64x2",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat64x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat64x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt8x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt8x32",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt8x64",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt16x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt16x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt16x32",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt32x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt32x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt32x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt64x2",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt64x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt64x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint8x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint8x32",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint8x64",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint16x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint16x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint16x32",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint32x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint32x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint32x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint64x2",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint64x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint64x8",
argLen: 3,
generic: true,
},
{
name: "ConvertToInt8Int16x8",
argLen: 1,
@ -89757,156 +90083,6 @@ var opcodeTable = [...]opInfo{
commutative: true,
generic: true,
},
{
name: "Permute2Float32x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Float32x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Float32x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Float64x2",
argLen: 3,
generic: true,
},
{
name: "Permute2Float64x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Float64x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Int8x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Int8x32",
argLen: 3,
generic: true,
},
{
name: "Permute2Int8x64",
argLen: 3,
generic: true,
},
{
name: "Permute2Int16x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Int16x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Int16x32",
argLen: 3,
generic: true,
},
{
name: "Permute2Int32x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Int32x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Int32x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Int64x2",
argLen: 3,
generic: true,
},
{
name: "Permute2Int64x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Int64x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint8x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint8x32",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint8x64",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint16x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint16x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint16x32",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint32x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint32x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint32x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint64x2",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint64x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint64x8",
argLen: 3,
generic: true,
},
{
name: "PermuteFloat32x8",
argLen: 2,
@ -89927,26 +90103,6 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedInt8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedInt8x64",
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedUint8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedUint8x64",
argLen: 2,
generic: true,
},
{
name: "PermuteInt8x16",
argLen: 2,
@ -89997,6 +90153,36 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroGroupedInt8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroGroupedInt8x64",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroGroupedUint8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroGroupedUint8x64",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroInt8x16",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroUint8x16",
argLen: 2,
generic: true,
},
{
name: "PermuteUint8x16",
argLen: 2,
@ -91830,138 +92016,6 @@ var opcodeTable = [...]opInfo{
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedInt32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedInt32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedUint32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedUint32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "RotateAllLeftInt32x4",
auxType: auxUInt8,
@ -92676,6 +92730,114 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "permuteScalarsGroupedInt32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsGroupedInt32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsGroupedUint32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsGroupedUint32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "ternInt32x4",
auxType: auxUInt8,

File diff suppressed because it is too large Load diff

View file

@ -228,6 +228,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
@ -802,8 +832,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.Permute", opLen2(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.Permute", opLen2(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64)
@ -826,62 +856,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float64x8.Permute", opLen2_21(ssa.OpPermuteFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Permute", opLen2_21(ssa.OpPermuteInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.Permute", opLen2_21(ssa.OpPermuteUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.Permute2", opLen3_231(ssa.OpPermute2Int8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.Permute2", opLen3_231(ssa.OpPermute2Uint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Permute2", opLen3_231(ssa.OpPermute2Int8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x32.Permute2", opLen3_231(ssa.OpPermute2Uint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Permute2", opLen3_231(ssa.OpPermute2Int8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x64.Permute2", opLen3_231(ssa.OpPermute2Uint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.Permute2", opLen3_231(ssa.OpPermute2Int16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x8.Permute2", opLen3_231(ssa.OpPermute2Uint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.Permute2", opLen3_231(ssa.OpPermute2Int16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x16.Permute2", opLen3_231(ssa.OpPermute2Uint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.Permute2", opLen3_231(ssa.OpPermute2Int16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x32.Permute2", opLen3_231(ssa.OpPermute2Uint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Permute2", opLen3_231(ssa.OpPermute2Float32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.Permute2", opLen3_231(ssa.OpPermute2Int32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.Permute2", opLen3_231(ssa.OpPermute2Uint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Permute2", opLen3_231(ssa.OpPermute2Float32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x8.Permute2", opLen3_231(ssa.OpPermute2Int32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x8.Permute2", opLen3_231(ssa.OpPermute2Uint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Permute2", opLen3_231(ssa.OpPermute2Float32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.Permute2", opLen3_231(ssa.OpPermute2Int32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x16.Permute2", opLen3_231(ssa.OpPermute2Uint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float64x2.Permute2", opLen3_231(ssa.OpPermute2Float64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x2.Permute2", opLen3_231(ssa.OpPermute2Int64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x2.Permute2", opLen3_231(ssa.OpPermute2Uint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.Permute2", opLen3_231(ssa.OpPermute2Float64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x4.Permute2", opLen3_231(ssa.OpPermute2Int64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x4.Permute2", opLen3_231(ssa.OpPermute2Uint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64)
@ -1300,6 +1280,24 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int32x4.tern", opLen3Imm8(ssa.OpternInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x8.tern", opLen3Imm8(ssa.OpternInt32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int32x16.tern", opLen3Imm8(ssa.OpternInt32x16, types.TypeVec512, 0), sys.AMD64)

View file

@ -46,6 +46,9 @@ func writeSIMDGenericOps(ops []Operation) *bytes.Buffer {
if op.NoGenericOps != nil && *op.NoGenericOps == "true" {
continue
}
if op.SkipMaskedMethod() {
continue
}
_, _, _, immType, gOp := op.shape()
gOpData := genericOpsData{gOp.GenericName(), len(gOp.In), op.Commutative}
if immType == VarImm || immType == ConstVarImm {

View file

@ -107,6 +107,9 @@ func writeSIMDIntrinsics(ops []Operation, typeMap simdTypeMap) *bytes.Buffer {
if op.NoTypes != nil && *op.NoTypes == "true" {
continue
}
if op.SkipMaskedMethod() {
continue
}
if s, op, err := classifyOp(op); err == nil {
if err := t.ExecuteTemplate(buffer, s, op); err != nil {
panic(fmt.Errorf("failed to execute template %s for op %s: %w", s, op.Go, err))

View file

@ -604,6 +604,9 @@ func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) (f, fI *bytes.Buffer)
if op.NoTypes != nil && *op.NoTypes == "true" {
continue
}
if op.SkipMaskedMethod() {
continue
}
idxVecAsScalar, err := checkVecAsScalar(op)
if err != nil {
panic(err)

View file

@ -345,7 +345,8 @@ func writeSIMDRules(ops []Operation) *bytes.Buffer {
data.ArgsOut = "..."
}
data.tplName = tplName
if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" {
if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" ||
opr.SkipMaskedMethod() {
optData = append(optData, data)
continue
}

View file

@ -73,6 +73,29 @@ type rawOperation struct {
NoGenericOps *string
// If non-nil, this string will be attached to the machine ssa op name. E.g. "const"
SSAVariant *string
// If true, do not emit method declarations, generic ops, or intrinsics for masked variants
// DO emit the architecture-specific opcodes and optimizations.
HideMaskMethods *bool
}
// IsMasked reports whether this operation carries a mask input variant.
// A well-formed operation has either no input variant at all, or exactly
// one whose class is "mask"; any other shape panics.
func (o *Operation) IsMasked() bool {
	switch {
	case len(o.InVariant) == 0:
		return false
	case len(o.InVariant) == 1 && o.InVariant[0].Class == "mask":
		return true
	default:
		panic(fmt.Errorf("unknown inVariant"))
	}
}
// SkipMaskedMethod reports whether generation of the method declaration,
// generic op, and intrinsic for this operation should be suppressed:
// true only when hideMaskMethods is set and this is the masked variant.
func (o *Operation) SkipMaskedMethod() bool {
	return o.HideMaskMethods != nil && *o.HideMaskMethods && o.IsMasked()
}
func (o *Operation) DecodeUnified(v *unify.Value) error {
@ -80,14 +103,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {
return err
}
isMasked := false
if len(o.InVariant) == 0 {
// No variant
} else if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" {
isMasked = true
} else {
return fmt.Errorf("unknown inVariant")
}
isMasked := o.IsMasked()
// Compute full Go method name.
o.Go = o.rawOperation.Go
@ -104,6 +120,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {
o.Documentation = regexp.MustCompile(`\bNAME\b`).ReplaceAllString(o.Documentation, o.Go)
if isMasked {
o.Documentation += "\n//\n// This operation is applied selectively under a write mask."
// Suppress generic op and method declaration for exported methods, if a mask is present.
if unicode.IsUpper([]rune(o.Go)[0]) {
trueVal := "true"
o.NoGenericOps = &trueVal

View file

@ -27,18 +27,22 @@
constImm: 1
documentation: !string |-
// NAME returns the upper half of x.
- go: PermuteOrZero
commutative: false
documentation: !string |-
// NAME performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
- go: Permute
commutative: false
documentation: !string |-
// NAME performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
- go: Permute2 # Permute2 is only available on or after AVX512
- go: ConcatPermute # ConcatPermute is only available on or after AVX512
commutative: false
documentation: !string |-
// NAME performs a full permutation of vector x, y using indices:
// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
// where xy is x appending y.
// where xy is the concatenation of x (lower half) and y (upper half).
// Only the needed bits to represent xy's index are used in indices' elements.
- go: Compress
commutative: false
@ -74,31 +78,35 @@
documentation: !string |-
// NAME copies element zero of its (128-bit) input to all elements of
// the 512-bit output vector.
- go: PermuteOrZeroGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using indices:
- go: PermuteGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using indices:
- go: PermuteConstant
- go: permuteScalars
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a permutation of vector x using constant indices:
- go: PermuteConstantGrouped
- go: permuteScalarsGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using constant indices:
- go: PermuteConstantLo
- go: permuteScalarsLo
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a permutation of vector x using constant indices:
- go: PermuteConstantLoGrouped
- go: permuteScalarsLoGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using constant indices:
- go: PermuteConstantHi
- go: permuteScalarsHi
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a permutation of vector x using constant indices:
- go: PermuteConstantHiGrouped
- go: permuteScalarsHiGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using constant indices:
@ -218,8 +226,10 @@
- go: Select128FromPair
commutative: false
documentation: !string |-
// NAME selects the low and high 128-bit halves from the 128-bit halves
// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3.
// NAME treats the 256-bit vectors x and y as a single vector of four
// 128-bit elements, and returns a 256-bit result formed by
// concatenating the two elements specified by lo and hi.
// For example, {4,5}.NAME(3,0,{6,7}) returns {7,4}.
- go: ConcatShiftBytesRight
commutative: false

View file

@ -213,19 +213,75 @@
- *f64xN
- go: Permute
asm: "VPERM[BWDQ]|VPERMP[SD]"
asm: "VPERMQ|VPERMPD"
addDoc: !string |-
// The low 2 bits (values 0-3) of each element of indices is used
operandOrder: "21Type1"
in:
- &anyindices
go: $t
name: indices
overwriteBase: uint
- &any4
go: $t
lanes: 4
out:
- &any
go: $t
- go: Permute
asm: "VPERM[WDQ]|VPERMP[SD]"
addDoc: !string |-
// The low 3 bits (values 0-7) of each element of indices is used
operandOrder: "21Type1"
in:
- *anyindices
- &any8
go: $t
lanes: 8
out:
- *any
- go: Permute2
- go: Permute
asm: "VPERM[BWD]|VPERMPS"
addDoc: !string |-
// The low 4 bits (values 0-15) of each element of indices is used
operandOrder: "21Type1"
in:
- *anyindices
- &any16
go: $t
lanes: 16
out:
- *any
- go: Permute
asm: "VPERM[BW]"
addDoc: !string |-
// The low 5 bits (values 0-31) of each element of indices is used
operandOrder: "21Type1"
in:
- *anyindices
- &any32
go: $t
lanes: 32
out:
- *any
- go: Permute
asm: "VPERMB"
addDoc: !string |-
// The low 6 bits (values 0-63) of each element of indices is used
operandOrder: "21Type1"
in:
- *anyindices
- &any64
go: $t
lanes: 64
out:
- *any
- go: ConcatPermute
asm: "VPERMI2[BWDQ]|VPERMI2P[SD]"
# Because we are overwriting the receiver's type, we
# have to move the receiver to be a parameter so that
@ -403,113 +459,137 @@
base: $b
# VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX)
- go: Permute
- go: PermuteOrZero
asm: VPSHUFB
addDoc: !string |-
// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
// The lower four bits of each byte-sized index in indices select an element from x,
// unless the index's sign bit is set in which case zero is used instead.
in:
- &128any
bits: 128
go: $t
- bits: 128
go: $t
name: indices
base: int # always signed
out:
- *128any
- go: PermuteGrouped
- go: PermuteOrZeroGrouped
asm: VPSHUFB
addDoc: !string |-
// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
// Only the needed bits to represent the index of a group of x are used in indices' elements.
// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed.
// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
// unless the index's sign bit is set in which case zero is used instead.
// Each group is of size 128-bit.
in:
- &256Or512any
bits: "256|512"
go: $t
- bits: "256|512"
base: int
name: indices
out:
- *256Or512any
- go: permuteScalars
asm: VPSHUFD
addDoc: !string |-
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
in:
- *128any
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *128any
- go: permuteScalarsGrouped
asm: VPSHUFD
addDoc: !string |-
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
in:
- *256Or512any
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *256Or512any
- go: permuteScalarsLo
asm: VPSHUFLW
addDoc: !string |-
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
in:
- &128lanes8
bits: 128
go: $t
name: indices
out:
- *256Or512any
- go: PermuteConstant
asm: VPSHUFD
addDoc: !string |-
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
in:
- *128any
elemBits: 16
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *128any
- go: PermuteConstantGrouped
asm: VPSHUFD
- *128lanes8
- go: permuteScalarsLoGrouped
asm: VPSHUFLW
addDoc: !string |-
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
//
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
// x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
in:
- *256Or512any
- &256Or512lanes8
bits: "256|512"
go: $t
elemBits: 16
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *256Or512any
- *256Or512lanes8
- go: PermuteConstantLo
- go: permuteScalarsHi
asm: VPSHUFHW
addDoc: !string |-
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
in:
- *128any
- *128lanes8
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *128any
- go: PermuteConstantLoGrouped
- *128lanes8
- go: permuteScalarsHiGrouped
asm: VPSHUFHW
addDoc: !string |-
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
// result =
//
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
in:
- *256Or512any
- *256Or512lanes8
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *256Or512any
- go: PermuteConstantHi
asm: VPSHUFHW
addDoc: !string |-
// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
in:
- *128any
- class: immediate
immOffset: 0
name: indices
out:
- *128any
- go: PermuteConstantHiGrouped
asm: VPSHUFHW
addDoc: !string |-
// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
in:
- *256Or512any
- class: immediate
immOffset: 0
name: indices
out:
- *256Or512any
- *256Or512lanes8
- go: InterleaveHi
asm: VPUNPCKH(QDQ|DQ|WD|WB)

View file

@ -163,7 +163,20 @@ func TestPermute(t *testing.T) {
}
}
func TestPermute2(t *testing.T) {
// TestPermuteOrZero checks byte permutation of a Uint8x16 vector where a
// negative (sign-bit-set) index zeroes the corresponding result lane.
func TestPermuteOrZero(t *testing.T) {
	x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
	indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11}
	want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12}
	got := make([]uint8, len(x))
	simd.LoadUint8x16Slice(x).PermuteOrZero(simd.LoadInt8x16Slice(indices)).StoreSlice(got)
	// Check every lane, not just the first 8: lanes 8-15 are exactly the
	// ones exercising the sign-bit zeroing behavior under test.
	for i := range got {
		if want[i] != got[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
		}
	}
}
func TestConcatPermute(t *testing.T) {
if !simd.X86.AVX512() {
t.Skip("Test requires X86.AVX512, not available on this hardware")
return
@ -173,7 +186,7 @@ func TestPermute2(t *testing.T) {
indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
got := make([]int64, 8)
simd.LoadInt64x8Slice(x).Permute2(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got)
simd.LoadInt64x8Slice(x).ConcatPermute(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got)
for i := range 8 {
if want[i] != got[i] {
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
@ -1161,3 +1174,75 @@ func TestDotProductQuadruple(t *testing.T) {
}
}
}
// TestPermuteScalars checks a four-lane left rotation of an Int32x4
// vector via the packed-immediate PermuteScalars method.
func TestPermuteScalars(t *testing.T) {
	input := []int32{11, 12, 13, 14}
	expect := []int32{12, 13, 14, 11}
	result := make([]int32, 4)
	simd.LoadInt32x4Slice(input).PermuteScalars(1, 2, 3, 0).StoreSlice(result)
	for i := 0; i < 4; i++ {
		if expect[i] != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, expect[i], result[i])
		}
	}
}
// TestPermuteScalarsGrouped checks that the same four-lane rotation is
// applied independently to each 128-bit group of an Int32x8 vector.
func TestPermuteScalarsGrouped(t *testing.T) {
	input := []int32{11, 12, 13, 14, 21, 22, 23, 24}
	expect := []int32{12, 13, 14, 11, 22, 23, 24, 21}
	result := make([]int32, 8)
	simd.LoadInt32x8Slice(input).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(result)
	for i := 0; i < 8; i++ {
		if expect[i] != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, expect[i], result[i])
		}
	}
}
// TestPermuteScalarsHi checks that only the upper four lanes of an
// Int16x8 vector are permuted; the lower four pass through unchanged.
func TestPermuteScalarsHi(t *testing.T) {
	input := []int16{-1, -2, -3, -4, 11, 12, 13, 14}
	expect := []int16{-1, -2, -3, -4, 12, 13, 14, 11}
	result := make([]int16, len(input))
	simd.LoadInt16x8Slice(input).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(result)
	for i := 0; i < len(result); i++ {
		if expect[i] != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, expect[i], result[i])
		}
	}
}
// TestPermuteScalarsLo checks that only the lower four lanes of an
// Int16x8 vector are permuted; the upper four pass through unchanged.
func TestPermuteScalarsLo(t *testing.T) {
	input := []int16{11, 12, 13, 14, 4, 5, 6, 7}
	expect := []int16{12, 13, 14, 11, 4, 5, 6, 7}
	result := make([]int16, len(input))
	simd.LoadInt16x8Slice(input).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(result)
	for i := 0; i < len(result); i++ {
		if expect[i] != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, expect[i], result[i])
		}
	}
}
// TestPermuteScalarsHiGrouped checks that the upper-half permutation is
// applied to each 128-bit group of an Int16x16 vector, leaving the lower
// half of every group unchanged.
func TestPermuteScalarsHiGrouped(t *testing.T) {
	input := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114}
	expect := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111}
	result := make([]int16, len(input))
	simd.LoadInt16x16Slice(input).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(result)
	for i := 0; i < len(result); i++ {
		if expect[i] != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, expect[i], result[i])
		}
	}
}
// TestPermuteScalarsLoGrouped checks that the lower-half permutation is
// applied to each 128-bit group of an Int16x16 vector, leaving the upper
// half of every group unchanged.
func TestPermuteScalarsLoGrouped(t *testing.T) {
	input := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17}
	expect := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17}
	result := make([]int16, len(input))
	simd.LoadInt16x16Slice(input).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(result)
	for i := 0; i < len(result); i++ {
		if expect[i] != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, expect[i], result[i])
		}
	}
}

File diff suppressed because it is too large Load diff

View file

@ -338,6 +338,220 @@ func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x
// Asm: VSHUFPD, CPU Feature: AVX512
func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8
/* permuteScalars */
// permuteScalars performs a permutation of vector x using constant indices:
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Int32x4) permuteScalars(indices uint8) Int32x4
// permuteScalars performs a permutation of vector x using constant indices:
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Uint32x4) permuteScalars(indices uint8) Uint32x4
/* permuteScalarsGrouped */
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16
/* permuteScalarsHi */
// permuteScalarsHi performs a permutation of vector x using constant indices:
// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8
// permuteScalarsHi performs a permutation of vector x using constant indices:
// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8
/* permuteScalarsHiGrouped */
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32
/* permuteScalarsLo */
// permuteScalarsLo performs a permutation of vector x using constant indices:
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8
// permuteScalarsLo performs a permutation of vector x using constant indices:
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8
/* permuteScalarsLoGrouped */
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
// x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
// x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
// x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
// x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32
/* tern */
// tern performs a logical operation on three vectors based on the 8-bit truth table.

View file

@ -989,3 +989,280 @@ func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 {
}
panic("missing case, switch should be exhaustive")
}
/* PermuteScalars */
// PermuteScalars builds a vector whose four lanes are lanes a, b, c and d
// of x:
//
//	result = {x[a], x[b], x[c], x[d]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table may be generated.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Int32x4) PermuteScalars(a, b, c, d uint8) Int32x4 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalars(imm)
}
// PermuteScalars builds a vector whose four lanes are lanes a, b, c and d
// of x:
//
//	result = {x[a], x[b], x[c], x[d]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table may be generated.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Uint32x4) PermuteScalars(a, b, c, d uint8) Uint32x4 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalars(imm)
}
/* PermuteScalarsGrouped */
// PermuteScalarsGrouped applies the same four-lane permutation to each
// 128-bit group of x:
//
//	result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table may be generated.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Int32x8) PermuteScalarsGrouped(a, b, c, d uint8) Int32x8 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsGrouped(imm)
}
// PermuteScalarsGrouped applies the same four-lane permutation to each
// 128-bit group of x:
//
//	result =
//	{ x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
//	  x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table may be generated.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Int32x16) PermuteScalarsGrouped(a, b, c, d uint8) Int32x16 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsGrouped(imm)
}
// PermuteScalarsGrouped applies the same four-lane permutation to each
// 128-bit group of x:
//
//	result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Uint32x8) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x8 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsGrouped(imm)
}
// PermuteScalarsGrouped applies the same four-lane permutation to each
// 128-bit group of x:
//
//	result =
//	{ x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
//	  x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsGrouped(imm)
}
/* PermuteScalarsHi */
// PermuteScalarsHi permutes the upper four lanes of x with the supplied
// indices, passing the lower four lanes through unchanged:
//
//	result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 {
	// Pack the four 2-bit lane selectors into the immediate byte; d needs
	// no explicit mask because the uint8 shift discards its high bits.
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsHi(imm)
}
// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
//
//	result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// The low four elements pass through unchanged; only the high four are permuted.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x8) PermuteScalarsHi(a, b, c, d uint8) Uint16x8 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsHi(imm)
}
/* PermuteScalarsHiGrouped */
// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
//	   x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Int16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x16 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsHiGrouped(imm)
}
// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
//	   x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
//	   x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
//	   x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x32 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsHiGrouped(imm)
}
// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
//	   x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Uint16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x16 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsHiGrouped(imm)
}
// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  { x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
//	    x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
//	    x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
//	    x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x32 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsHiGrouped(imm)
}
/* PermuteScalarsLo */
// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
//
//	result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
//
// The high four elements pass through unchanged; only the low four are permuted.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x8) PermuteScalarsLo(a, b, c, d uint8) Int16x8 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsLo(imm)
}
// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
//
//	result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
//
// The high four elements pass through unchanged; only the low four are permuted.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x8) PermuteScalarsLo(a, b, c, d uint8) Uint16x8 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsLo(imm)
}
/* PermuteScalarsLoGrouped */
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	   x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Int16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x16 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsLoGrouped(imm)
}
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	   x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
//	   x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
//	   x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x32 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsLoGrouped(imm)
}
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	   x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsLoGrouped(imm)
}
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
//	result =
//	  {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	   x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
//	   x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
//	   x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
//
// Each group is of size 128-bit.
//
// Parameters a, b, c, and d should be between 0 and 3; only their low
// two bits are used.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
	// Pack the four 2-bit lane selectors into one immediate byte.
	imm := d << 6 // d's high bits shift out of the uint8, so no &3 mask is needed
	imm |= (c & 3) << 4
	imm |= (b & 3) << 2
	imm |= a & 3
	return x.permuteScalarsLoGrouped(imm)
}