[dev.simd] simd: fix signatures for PermuteConstant* methods

This moves the packed-immediate methods to package-private,
and adds exported versions with four parameters.

Rename PermuteConstant to PermuteScalars
Rename VPSHUFB Permute to PermuteOrZero
Rename Permute2 to ConcatPermute

Comments were repaired/enhanced.

Modified the generator to support an additional tag
"hideMaskMethods : true" that suppresses generation of the
method, intrinsic, generic, and generic-translation forms
for the mask-modified versions of such methods (this is
already true for exported methods).

Change-Id: I91e208c1fff1f28ebce4edb4e73d26003715018c
Reviewed-on: https://go-review.googlesource.com/c/go/+/721342
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
David Chase 2025-11-17 15:31:36 -05:00
parent e3d4645693
commit 4d26d66a49
18 changed files with 2614 additions and 1820 deletions

View file

@ -396,7 +396,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPOR256, ssa.OpAMD64VPOR256,
ssa.OpAMD64VPORD512, ssa.OpAMD64VPORD512,
ssa.OpAMD64VPORQ512, ssa.OpAMD64VPORQ512,
ssa.OpAMD64VPSHUFB128, ssa.OpAMD64VPERMB128,
ssa.OpAMD64VPERMB256, ssa.OpAMD64VPERMB256,
ssa.OpAMD64VPERMB512, ssa.OpAMD64VPERMB512,
ssa.OpAMD64VPERMW128, ssa.OpAMD64VPERMW128,
@ -410,6 +410,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPERMQ256, ssa.OpAMD64VPERMQ256,
ssa.OpAMD64VPERMPD512, ssa.OpAMD64VPERMPD512,
ssa.OpAMD64VPERMQ512, ssa.OpAMD64VPERMQ512,
ssa.OpAMD64VPSHUFB128,
ssa.OpAMD64VPSHUFB256, ssa.OpAMD64VPSHUFB256,
ssa.OpAMD64VPSHUFB512, ssa.OpAMD64VPSHUFB512,
ssa.OpAMD64VPROLVD128, ssa.OpAMD64VPROLVD128,
@ -672,9 +673,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPORQMasked128, ssa.OpAMD64VPORQMasked128,
ssa.OpAMD64VPORQMasked256, ssa.OpAMD64VPORQMasked256,
ssa.OpAMD64VPORQMasked512, ssa.OpAMD64VPORQMasked512,
ssa.OpAMD64VPSHUFBMasked256, ssa.OpAMD64VPERMBMasked128,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPERMBMasked256, ssa.OpAMD64VPERMBMasked256,
ssa.OpAMD64VPERMBMasked512, ssa.OpAMD64VPERMBMasked512,
ssa.OpAMD64VPERMWMasked128, ssa.OpAMD64VPERMWMasked128,
@ -688,6 +687,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPERMQMasked256, ssa.OpAMD64VPERMQMasked256,
ssa.OpAMD64VPERMPDMasked512, ssa.OpAMD64VPERMPDMasked512,
ssa.OpAMD64VPERMQMasked512, ssa.OpAMD64VPERMQMasked512,
ssa.OpAMD64VPSHUFBMasked256,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPROLVDMasked128, ssa.OpAMD64VPROLVDMasked128,
ssa.OpAMD64VPROLVDMasked256, ssa.OpAMD64VPROLVDMasked256,
ssa.OpAMD64VPROLVDMasked512, ssa.OpAMD64VPROLVDMasked512,
@ -1011,12 +1013,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VEXTRACTF64X4256, ssa.OpAMD64VEXTRACTF64X4256,
ssa.OpAMD64VEXTRACTI128128, ssa.OpAMD64VEXTRACTI128128,
ssa.OpAMD64VEXTRACTI64X4256, ssa.OpAMD64VEXTRACTI64X4256,
ssa.OpAMD64VPSHUFD128,
ssa.OpAMD64VPSHUFD256,
ssa.OpAMD64VPSHUFD512,
ssa.OpAMD64VPSHUFHW128,
ssa.OpAMD64VPSHUFHW256,
ssa.OpAMD64VPSHUFHW512,
ssa.OpAMD64VPROLD128, ssa.OpAMD64VPROLD128,
ssa.OpAMD64VPROLD256, ssa.OpAMD64VPROLD256,
ssa.OpAMD64VPROLD512, ssa.OpAMD64VPROLD512,
@ -1029,6 +1025,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPRORQ128, ssa.OpAMD64VPRORQ128,
ssa.OpAMD64VPRORQ256, ssa.OpAMD64VPRORQ256,
ssa.OpAMD64VPRORQ512, ssa.OpAMD64VPRORQ512,
ssa.OpAMD64VPSHUFD128,
ssa.OpAMD64VPSHUFD256,
ssa.OpAMD64VPSHUFD512,
ssa.OpAMD64VPSHUFHW128,
ssa.OpAMD64VPSHUFHW256,
ssa.OpAMD64VPSHUFHW512,
ssa.OpAMD64VPSHUFLW128,
ssa.OpAMD64VPSHUFLW256,
ssa.OpAMD64VPSHUFLW512,
ssa.OpAMD64VPSLLW128const, ssa.OpAMD64VPSLLW128const,
ssa.OpAMD64VPSLLW256const, ssa.OpAMD64VPSLLW256const,
ssa.OpAMD64VPSLLW512const, ssa.OpAMD64VPSLLW512const,
@ -1070,12 +1075,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VREDUCEPDMasked128, ssa.OpAMD64VREDUCEPDMasked128,
ssa.OpAMD64VREDUCEPDMasked256, ssa.OpAMD64VREDUCEPDMasked256,
ssa.OpAMD64VREDUCEPDMasked512, ssa.OpAMD64VREDUCEPDMasked512,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPROLDMasked128, ssa.OpAMD64VPROLDMasked128,
ssa.OpAMD64VPROLDMasked256, ssa.OpAMD64VPROLDMasked256,
ssa.OpAMD64VPROLDMasked512, ssa.OpAMD64VPROLDMasked512,
@ -1088,6 +1087,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPRORQMasked128, ssa.OpAMD64VPRORQMasked128,
ssa.OpAMD64VPRORQMasked256, ssa.OpAMD64VPRORQMasked256,
ssa.OpAMD64VPRORQMasked512, ssa.OpAMD64VPRORQMasked512,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFLWMasked256,
ssa.OpAMD64VPSHUFLWMasked512,
ssa.OpAMD64VPSHUFLWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPSLLWMasked128const, ssa.OpAMD64VPSLLWMasked128const,
ssa.OpAMD64VPSLLWMasked256const, ssa.OpAMD64VPSLLWMasked256const,
ssa.OpAMD64VPSLLWMasked512const, ssa.OpAMD64VPSLLWMasked512const,
@ -1209,6 +1217,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
case ssa.OpAMD64VPDPWSSD128, case ssa.OpAMD64VPDPWSSD128,
ssa.OpAMD64VPDPWSSD256, ssa.OpAMD64VPDPWSSD256,
ssa.OpAMD64VPDPWSSD512, ssa.OpAMD64VPDPWSSD512,
ssa.OpAMD64VPERMI2B128,
ssa.OpAMD64VPERMI2B256,
ssa.OpAMD64VPERMI2B512,
ssa.OpAMD64VPERMI2W128,
ssa.OpAMD64VPERMI2W256,
ssa.OpAMD64VPERMI2W512,
ssa.OpAMD64VPERMI2PS128,
ssa.OpAMD64VPERMI2D128,
ssa.OpAMD64VPERMI2PS256,
ssa.OpAMD64VPERMI2D256,
ssa.OpAMD64VPERMI2PS512,
ssa.OpAMD64VPERMI2D512,
ssa.OpAMD64VPERMI2PD128,
ssa.OpAMD64VPERMI2Q128,
ssa.OpAMD64VPERMI2PD256,
ssa.OpAMD64VPERMI2Q256,
ssa.OpAMD64VPERMI2PD512,
ssa.OpAMD64VPERMI2Q512,
ssa.OpAMD64VPDPBUSD128, ssa.OpAMD64VPDPBUSD128,
ssa.OpAMD64VPDPBUSD256, ssa.OpAMD64VPDPBUSD256,
ssa.OpAMD64VPDPBUSD512, ssa.OpAMD64VPDPBUSD512,
@ -1233,24 +1259,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VFMSUBADD213PD128, ssa.OpAMD64VFMSUBADD213PD128,
ssa.OpAMD64VFMSUBADD213PD256, ssa.OpAMD64VFMSUBADD213PD256,
ssa.OpAMD64VFMSUBADD213PD512, ssa.OpAMD64VFMSUBADD213PD512,
ssa.OpAMD64VPERMI2B128,
ssa.OpAMD64VPERMI2B256,
ssa.OpAMD64VPERMI2B512,
ssa.OpAMD64VPERMI2W128,
ssa.OpAMD64VPERMI2W256,
ssa.OpAMD64VPERMI2W512,
ssa.OpAMD64VPERMI2PS128,
ssa.OpAMD64VPERMI2D128,
ssa.OpAMD64VPERMI2PS256,
ssa.OpAMD64VPERMI2D256,
ssa.OpAMD64VPERMI2PS512,
ssa.OpAMD64VPERMI2D512,
ssa.OpAMD64VPERMI2PD128,
ssa.OpAMD64VPERMI2Q128,
ssa.OpAMD64VPERMI2PD256,
ssa.OpAMD64VPERMI2Q256,
ssa.OpAMD64VPERMI2PD512,
ssa.OpAMD64VPERMI2Q512,
ssa.OpAMD64VPSHLDVW128, ssa.OpAMD64VPSHLDVW128,
ssa.OpAMD64VPSHLDVW256, ssa.OpAMD64VPSHLDVW256,
ssa.OpAMD64VPSHLDVW512, ssa.OpAMD64VPSHLDVW512,
@ -1316,6 +1324,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPAVGWMasked128Merging, ssa.OpAMD64VPAVGWMasked128Merging,
ssa.OpAMD64VPAVGWMasked256Merging, ssa.OpAMD64VPAVGWMasked256Merging,
ssa.OpAMD64VPAVGWMasked512Merging, ssa.OpAMD64VPAVGWMasked512Merging,
ssa.OpAMD64VPERMI2BMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
ssa.OpAMD64VPERMI2WMasked128,
ssa.OpAMD64VPERMI2WMasked256,
ssa.OpAMD64VPERMI2WMasked512,
ssa.OpAMD64VPERMI2PSMasked128,
ssa.OpAMD64VPERMI2DMasked128,
ssa.OpAMD64VPERMI2PSMasked256,
ssa.OpAMD64VPERMI2DMasked256,
ssa.OpAMD64VPERMI2PSMasked512,
ssa.OpAMD64VPERMI2DMasked512,
ssa.OpAMD64VPERMI2PDMasked128,
ssa.OpAMD64VPERMI2QMasked128,
ssa.OpAMD64VPERMI2PDMasked256,
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPALIGNRMasked256Merging, ssa.OpAMD64VPALIGNRMasked256Merging,
ssa.OpAMD64VPALIGNRMasked512Merging, ssa.OpAMD64VPALIGNRMasked512Merging,
ssa.OpAMD64VPALIGNRMasked128Merging, ssa.OpAMD64VPALIGNRMasked128Merging,
@ -1451,24 +1477,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPORQMasked128Merging, ssa.OpAMD64VPORQMasked128Merging,
ssa.OpAMD64VPORQMasked256Merging, ssa.OpAMD64VPORQMasked256Merging,
ssa.OpAMD64VPORQMasked512Merging, ssa.OpAMD64VPORQMasked512Merging,
ssa.OpAMD64VPERMI2BMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
ssa.OpAMD64VPERMI2WMasked128,
ssa.OpAMD64VPERMI2WMasked256,
ssa.OpAMD64VPERMI2WMasked512,
ssa.OpAMD64VPERMI2PSMasked128,
ssa.OpAMD64VPERMI2DMasked128,
ssa.OpAMD64VPERMI2PSMasked256,
ssa.OpAMD64VPERMI2DMasked256,
ssa.OpAMD64VPERMI2PSMasked512,
ssa.OpAMD64VPERMI2DMasked512,
ssa.OpAMD64VPERMI2PDMasked128,
ssa.OpAMD64VPERMI2QMasked128,
ssa.OpAMD64VPERMI2PDMasked256,
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPSHUFBMasked256Merging, ssa.OpAMD64VPSHUFBMasked256Merging,
ssa.OpAMD64VPSHUFBMasked512Merging, ssa.OpAMD64VPSHUFBMasked512Merging,
ssa.OpAMD64VPSHUFBMasked128Merging, ssa.OpAMD64VPSHUFBMasked128Merging,
@ -1819,6 +1827,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
p = simdV21load(s, v) p = simdV21load(s, v)
case ssa.OpAMD64VPDPWSSD512load, case ssa.OpAMD64VPDPWSSD512load,
ssa.OpAMD64VPERMI2PS128load,
ssa.OpAMD64VPERMI2D128load,
ssa.OpAMD64VPERMI2PS256load,
ssa.OpAMD64VPERMI2D256load,
ssa.OpAMD64VPERMI2PS512load,
ssa.OpAMD64VPERMI2D512load,
ssa.OpAMD64VPERMI2PD128load,
ssa.OpAMD64VPERMI2Q128load,
ssa.OpAMD64VPERMI2PD256load,
ssa.OpAMD64VPERMI2Q256load,
ssa.OpAMD64VPERMI2PD512load,
ssa.OpAMD64VPERMI2Q512load,
ssa.OpAMD64VPDPBUSD512load, ssa.OpAMD64VPDPBUSD512load,
ssa.OpAMD64VPDPBUSDS512load, ssa.OpAMD64VPDPBUSDS512load,
ssa.OpAMD64VFMADD213PS128load, ssa.OpAMD64VFMADD213PS128load,
@ -1839,18 +1859,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VFMSUBADD213PD128load, ssa.OpAMD64VFMSUBADD213PD128load,
ssa.OpAMD64VFMSUBADD213PD256load, ssa.OpAMD64VFMSUBADD213PD256load,
ssa.OpAMD64VFMSUBADD213PD512load, ssa.OpAMD64VFMSUBADD213PD512load,
ssa.OpAMD64VPERMI2PS128load,
ssa.OpAMD64VPERMI2D128load,
ssa.OpAMD64VPERMI2PS256load,
ssa.OpAMD64VPERMI2D256load,
ssa.OpAMD64VPERMI2PS512load,
ssa.OpAMD64VPERMI2D512load,
ssa.OpAMD64VPERMI2PD128load,
ssa.OpAMD64VPERMI2Q128load,
ssa.OpAMD64VPERMI2PD256load,
ssa.OpAMD64VPERMI2Q256load,
ssa.OpAMD64VPERMI2PD512load,
ssa.OpAMD64VPERMI2Q512load,
ssa.OpAMD64VPSHLDVD128load, ssa.OpAMD64VPSHLDVD128load,
ssa.OpAMD64VPSHLDVD256load, ssa.OpAMD64VPSHLDVD256load,
ssa.OpAMD64VPSHLDVD512load, ssa.OpAMD64VPSHLDVD512load,
@ -1868,6 +1876,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
case ssa.OpAMD64VPDPWSSDMasked128load, case ssa.OpAMD64VPDPWSSDMasked128load,
ssa.OpAMD64VPDPWSSDMasked256load, ssa.OpAMD64VPDPWSSDMasked256load,
ssa.OpAMD64VPDPWSSDMasked512load, ssa.OpAMD64VPDPWSSDMasked512load,
ssa.OpAMD64VPERMI2PSMasked128load,
ssa.OpAMD64VPERMI2DMasked128load,
ssa.OpAMD64VPERMI2PSMasked256load,
ssa.OpAMD64VPERMI2DMasked256load,
ssa.OpAMD64VPERMI2PSMasked512load,
ssa.OpAMD64VPERMI2DMasked512load,
ssa.OpAMD64VPERMI2PDMasked128load,
ssa.OpAMD64VPERMI2QMasked128load,
ssa.OpAMD64VPERMI2PDMasked256load,
ssa.OpAMD64VPERMI2QMasked256load,
ssa.OpAMD64VPERMI2PDMasked512load,
ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPDPBUSDMasked128load, ssa.OpAMD64VPDPBUSDMasked128load,
ssa.OpAMD64VPDPBUSDMasked256load, ssa.OpAMD64VPDPBUSDMasked256load,
ssa.OpAMD64VPDPBUSDMasked512load, ssa.OpAMD64VPDPBUSDMasked512load,
@ -1892,18 +1912,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VFMSUBADD213PDMasked128load, ssa.OpAMD64VFMSUBADD213PDMasked128load,
ssa.OpAMD64VFMSUBADD213PDMasked256load, ssa.OpAMD64VFMSUBADD213PDMasked256load,
ssa.OpAMD64VFMSUBADD213PDMasked512load, ssa.OpAMD64VFMSUBADD213PDMasked512load,
ssa.OpAMD64VPERMI2PSMasked128load,
ssa.OpAMD64VPERMI2DMasked128load,
ssa.OpAMD64VPERMI2PSMasked256load,
ssa.OpAMD64VPERMI2DMasked256load,
ssa.OpAMD64VPERMI2PSMasked512load,
ssa.OpAMD64VPERMI2DMasked512load,
ssa.OpAMD64VPERMI2PDMasked128load,
ssa.OpAMD64VPERMI2QMasked128load,
ssa.OpAMD64VPERMI2PDMasked256load,
ssa.OpAMD64VPERMI2QMasked256load,
ssa.OpAMD64VPERMI2PDMasked512load,
ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPSHLDVDMasked128load, ssa.OpAMD64VPSHLDVDMasked128load,
ssa.OpAMD64VPSHLDVDMasked256load, ssa.OpAMD64VPSHLDVDMasked256load,
ssa.OpAMD64VPSHLDVDMasked512load, ssa.OpAMD64VPSHLDVDMasked512load,
@ -2124,7 +2132,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VREDUCEPD128load, ssa.OpAMD64VREDUCEPD128load,
ssa.OpAMD64VREDUCEPD256load, ssa.OpAMD64VREDUCEPD256load,
ssa.OpAMD64VREDUCEPD512load, ssa.OpAMD64VREDUCEPD512load,
ssa.OpAMD64VPSHUFD512load,
ssa.OpAMD64VPROLD128load, ssa.OpAMD64VPROLD128load,
ssa.OpAMD64VPROLD256load, ssa.OpAMD64VPROLD256load,
ssa.OpAMD64VPROLD512load, ssa.OpAMD64VPROLD512load,
@ -2137,6 +2144,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPRORQ128load, ssa.OpAMD64VPRORQ128load,
ssa.OpAMD64VPRORQ256load, ssa.OpAMD64VPRORQ256load,
ssa.OpAMD64VPRORQ512load, ssa.OpAMD64VPRORQ512load,
ssa.OpAMD64VPSHUFD512load,
ssa.OpAMD64VPSLLD512constload, ssa.OpAMD64VPSLLD512constload,
ssa.OpAMD64VPSLLQ512constload, ssa.OpAMD64VPSLLQ512constload,
ssa.OpAMD64VPSRLD512constload, ssa.OpAMD64VPSRLD512constload,
@ -2159,9 +2167,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VREDUCEPDMasked128load, ssa.OpAMD64VREDUCEPDMasked128load,
ssa.OpAMD64VREDUCEPDMasked256load, ssa.OpAMD64VREDUCEPDMasked256load,
ssa.OpAMD64VREDUCEPDMasked512load, ssa.OpAMD64VREDUCEPDMasked512load,
ssa.OpAMD64VPSHUFDMasked256load,
ssa.OpAMD64VPSHUFDMasked512load,
ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPROLDMasked128load, ssa.OpAMD64VPROLDMasked128load,
ssa.OpAMD64VPROLDMasked256load, ssa.OpAMD64VPROLDMasked256load,
ssa.OpAMD64VPROLDMasked512load, ssa.OpAMD64VPROLDMasked512load,
@ -2174,6 +2179,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPRORQMasked128load, ssa.OpAMD64VPRORQMasked128load,
ssa.OpAMD64VPRORQMasked256load, ssa.OpAMD64VPRORQMasked256load,
ssa.OpAMD64VPRORQMasked512load, ssa.OpAMD64VPRORQMasked512load,
ssa.OpAMD64VPSHUFDMasked256load,
ssa.OpAMD64VPSHUFDMasked512load,
ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPSLLDMasked128constload, ssa.OpAMD64VPSLLDMasked128constload,
ssa.OpAMD64VPSLLDMasked256constload, ssa.OpAMD64VPSLLDMasked256constload,
ssa.OpAMD64VPSLLDMasked512constload, ssa.OpAMD64VPSLLDMasked512constload,
@ -2447,12 +2455,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPOPCNTQMasked128Merging, ssa.OpAMD64VPOPCNTQMasked128Merging,
ssa.OpAMD64VPOPCNTQMasked256Merging, ssa.OpAMD64VPOPCNTQMasked256Merging,
ssa.OpAMD64VPOPCNTQMasked512Merging, ssa.OpAMD64VPOPCNTQMasked512Merging,
ssa.OpAMD64VPSHUFDMasked256Merging,
ssa.OpAMD64VPSHUFDMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked256Merging,
ssa.OpAMD64VPSHUFHWMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked128Merging,
ssa.OpAMD64VPSHUFDMasked128Merging,
ssa.OpAMD64VRCP14PSMasked128Merging, ssa.OpAMD64VRCP14PSMasked128Merging,
ssa.OpAMD64VRCP14PSMasked256Merging, ssa.OpAMD64VRCP14PSMasked256Merging,
ssa.OpAMD64VRCP14PSMasked512Merging, ssa.OpAMD64VRCP14PSMasked512Merging,
@ -2483,6 +2485,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VSQRTPDMasked128Merging, ssa.OpAMD64VSQRTPDMasked128Merging,
ssa.OpAMD64VSQRTPDMasked256Merging, ssa.OpAMD64VSQRTPDMasked256Merging,
ssa.OpAMD64VSQRTPDMasked512Merging, ssa.OpAMD64VSQRTPDMasked512Merging,
ssa.OpAMD64VPSHUFDMasked256Merging,
ssa.OpAMD64VPSHUFDMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked256Merging,
ssa.OpAMD64VPSHUFHWMasked512Merging,
ssa.OpAMD64VPSHUFHWMasked128Merging,
ssa.OpAMD64VPSHUFLWMasked256Merging,
ssa.OpAMD64VPSHUFLWMasked512Merging,
ssa.OpAMD64VPSHUFLWMasked128Merging,
ssa.OpAMD64VPSHUFDMasked128Merging,
ssa.OpAMD64VPSLLWMasked128constMerging, ssa.OpAMD64VPSLLWMasked128constMerging,
ssa.OpAMD64VPSLLWMasked256constMerging, ssa.OpAMD64VPSLLWMasked256constMerging,
ssa.OpAMD64VPSLLWMasked512constMerging, ssa.OpAMD64VPSLLWMasked512constMerging,
@ -2674,6 +2685,36 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPCOMPRESSQMasked128, ssa.OpAMD64VPCOMPRESSQMasked128,
ssa.OpAMD64VPCOMPRESSQMasked256, ssa.OpAMD64VPCOMPRESSQMasked256,
ssa.OpAMD64VPCOMPRESSQMasked512, ssa.OpAMD64VPCOMPRESSQMasked512,
ssa.OpAMD64VPERMI2BMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
ssa.OpAMD64VPERMI2WMasked128,
ssa.OpAMD64VPERMI2WMasked256,
ssa.OpAMD64VPERMI2WMasked512,
ssa.OpAMD64VPERMI2PSMasked128,
ssa.OpAMD64VPERMI2PSMasked128load,
ssa.OpAMD64VPERMI2DMasked128,
ssa.OpAMD64VPERMI2DMasked128load,
ssa.OpAMD64VPERMI2PSMasked256,
ssa.OpAMD64VPERMI2PSMasked256load,
ssa.OpAMD64VPERMI2DMasked256,
ssa.OpAMD64VPERMI2DMasked256load,
ssa.OpAMD64VPERMI2PSMasked512,
ssa.OpAMD64VPERMI2PSMasked512load,
ssa.OpAMD64VPERMI2DMasked512,
ssa.OpAMD64VPERMI2DMasked512load,
ssa.OpAMD64VPERMI2PDMasked128,
ssa.OpAMD64VPERMI2PDMasked128load,
ssa.OpAMD64VPERMI2QMasked128,
ssa.OpAMD64VPERMI2QMasked128load,
ssa.OpAMD64VPERMI2PDMasked256,
ssa.OpAMD64VPERMI2PDMasked256load,
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2QMasked256load,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2PDMasked512load,
ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPALIGNRMasked256, ssa.OpAMD64VPALIGNRMasked256,
ssa.OpAMD64VPALIGNRMasked512, ssa.OpAMD64VPALIGNRMasked512,
ssa.OpAMD64VPALIGNRMasked128, ssa.OpAMD64VPALIGNRMasked128,
@ -3061,48 +3102,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPORQMasked256load, ssa.OpAMD64VPORQMasked256load,
ssa.OpAMD64VPORQMasked512, ssa.OpAMD64VPORQMasked512,
ssa.OpAMD64VPORQMasked512load, ssa.OpAMD64VPORQMasked512load,
ssa.OpAMD64VPERMI2BMasked128, ssa.OpAMD64VPERMBMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
ssa.OpAMD64VPERMI2WMasked128,
ssa.OpAMD64VPERMI2WMasked256,
ssa.OpAMD64VPERMI2WMasked512,
ssa.OpAMD64VPERMI2PSMasked128,
ssa.OpAMD64VPERMI2PSMasked128load,
ssa.OpAMD64VPERMI2DMasked128,
ssa.OpAMD64VPERMI2DMasked128load,
ssa.OpAMD64VPERMI2PSMasked256,
ssa.OpAMD64VPERMI2PSMasked256load,
ssa.OpAMD64VPERMI2DMasked256,
ssa.OpAMD64VPERMI2DMasked256load,
ssa.OpAMD64VPERMI2PSMasked512,
ssa.OpAMD64VPERMI2PSMasked512load,
ssa.OpAMD64VPERMI2DMasked512,
ssa.OpAMD64VPERMI2DMasked512load,
ssa.OpAMD64VPERMI2PDMasked128,
ssa.OpAMD64VPERMI2PDMasked128load,
ssa.OpAMD64VPERMI2QMasked128,
ssa.OpAMD64VPERMI2QMasked128load,
ssa.OpAMD64VPERMI2PDMasked256,
ssa.OpAMD64VPERMI2PDMasked256load,
ssa.OpAMD64VPERMI2QMasked256,
ssa.OpAMD64VPERMI2QMasked256load,
ssa.OpAMD64VPERMI2PDMasked512,
ssa.OpAMD64VPERMI2PDMasked512load,
ssa.OpAMD64VPERMI2QMasked512,
ssa.OpAMD64VPERMI2QMasked512load,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked256load,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFDMasked512load,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPSHUFBMasked256,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VPERMBMasked256, ssa.OpAMD64VPERMBMasked256,
ssa.OpAMD64VPERMBMasked512, ssa.OpAMD64VPERMBMasked512,
ssa.OpAMD64VPERMWMasked128, ssa.OpAMD64VPERMWMasked128,
@ -3124,6 +3124,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPERMPDMasked512load, ssa.OpAMD64VPERMPDMasked512load,
ssa.OpAMD64VPERMQMasked512, ssa.OpAMD64VPERMQMasked512,
ssa.OpAMD64VPERMQMasked512load, ssa.OpAMD64VPERMQMasked512load,
ssa.OpAMD64VPSHUFBMasked256,
ssa.OpAMD64VPSHUFBMasked512,
ssa.OpAMD64VPSHUFBMasked128,
ssa.OpAMD64VRCP14PSMasked128, ssa.OpAMD64VRCP14PSMasked128,
ssa.OpAMD64VRCP14PSMasked128load, ssa.OpAMD64VRCP14PSMasked128load,
ssa.OpAMD64VRCP14PSMasked256, ssa.OpAMD64VRCP14PSMasked256,
@ -3418,6 +3421,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VMOVDQU64Masked128, ssa.OpAMD64VMOVDQU64Masked128,
ssa.OpAMD64VMOVDQU64Masked256, ssa.OpAMD64VMOVDQU64Masked256,
ssa.OpAMD64VMOVDQU64Masked512, ssa.OpAMD64VMOVDQU64Masked512,
ssa.OpAMD64VPSHUFDMasked256,
ssa.OpAMD64VPSHUFDMasked256load,
ssa.OpAMD64VPSHUFDMasked512,
ssa.OpAMD64VPSHUFDMasked512load,
ssa.OpAMD64VPSHUFHWMasked256,
ssa.OpAMD64VPSHUFHWMasked512,
ssa.OpAMD64VPSHUFHWMasked128,
ssa.OpAMD64VPSHUFLWMasked256,
ssa.OpAMD64VPSHUFLWMasked512,
ssa.OpAMD64VPSHUFLWMasked128,
ssa.OpAMD64VPSHUFDMasked128,
ssa.OpAMD64VPSHUFDMasked128load,
ssa.OpAMD64VPSLLWMasked128const, ssa.OpAMD64VPSLLWMasked128const,
ssa.OpAMD64VPSLLWMasked256const, ssa.OpAMD64VPSLLWMasked256const,
ssa.OpAMD64VPSLLWMasked512const, ssa.OpAMD64VPSLLWMasked512const,

View file

@ -216,6 +216,36 @@
(CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask)) (CompressUint64x2 x mask) => (VPCOMPRESSQMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
(CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask)) (CompressUint64x4 x mask) => (VPCOMPRESSQMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
(CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask)) (CompressUint64x8 x mask) => (VPCOMPRESSQMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
(ConcatPermuteFloat32x4 ...) => (VPERMI2PS128 ...)
(ConcatPermuteFloat32x8 ...) => (VPERMI2PS256 ...)
(ConcatPermuteFloat32x16 ...) => (VPERMI2PS512 ...)
(ConcatPermuteFloat64x2 ...) => (VPERMI2PD128 ...)
(ConcatPermuteFloat64x4 ...) => (VPERMI2PD256 ...)
(ConcatPermuteFloat64x8 ...) => (VPERMI2PD512 ...)
(ConcatPermuteInt8x16 ...) => (VPERMI2B128 ...)
(ConcatPermuteInt8x32 ...) => (VPERMI2B256 ...)
(ConcatPermuteInt8x64 ...) => (VPERMI2B512 ...)
(ConcatPermuteInt16x8 ...) => (VPERMI2W128 ...)
(ConcatPermuteInt16x16 ...) => (VPERMI2W256 ...)
(ConcatPermuteInt16x32 ...) => (VPERMI2W512 ...)
(ConcatPermuteInt32x4 ...) => (VPERMI2D128 ...)
(ConcatPermuteInt32x8 ...) => (VPERMI2D256 ...)
(ConcatPermuteInt32x16 ...) => (VPERMI2D512 ...)
(ConcatPermuteInt64x2 ...) => (VPERMI2Q128 ...)
(ConcatPermuteInt64x4 ...) => (VPERMI2Q256 ...)
(ConcatPermuteInt64x8 ...) => (VPERMI2Q512 ...)
(ConcatPermuteUint8x16 ...) => (VPERMI2B128 ...)
(ConcatPermuteUint8x32 ...) => (VPERMI2B256 ...)
(ConcatPermuteUint8x64 ...) => (VPERMI2B512 ...)
(ConcatPermuteUint16x8 ...) => (VPERMI2W128 ...)
(ConcatPermuteUint16x16 ...) => (VPERMI2W256 ...)
(ConcatPermuteUint16x32 ...) => (VPERMI2W512 ...)
(ConcatPermuteUint32x4 ...) => (VPERMI2D128 ...)
(ConcatPermuteUint32x8 ...) => (VPERMI2D256 ...)
(ConcatPermuteUint32x16 ...) => (VPERMI2D512 ...)
(ConcatPermuteUint64x2 ...) => (VPERMI2Q128 ...)
(ConcatPermuteUint64x4 ...) => (VPERMI2Q256 ...)
(ConcatPermuteUint64x8 ...) => (VPERMI2Q512 ...)
(ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...) (ConcatShiftBytesRightUint8x16 ...) => (VPALIGNR128 ...)
(ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...) (ConcatShiftBytesRightGroupedUint8x32 ...) => (VPALIGNR256 ...)
(ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...) (ConcatShiftBytesRightGroupedUint8x64 ...) => (VPALIGNR512 ...)
@ -794,7 +824,7 @@
(PermuteFloat32x16 ...) => (VPERMPS512 ...) (PermuteFloat32x16 ...) => (VPERMPS512 ...)
(PermuteFloat64x4 ...) => (VPERMPD256 ...) (PermuteFloat64x4 ...) => (VPERMPD256 ...)
(PermuteFloat64x8 ...) => (VPERMPD512 ...) (PermuteFloat64x8 ...) => (VPERMPD512 ...)
(PermuteInt8x16 ...) => (VPSHUFB128 ...) (PermuteInt8x16 ...) => (VPERMB128 ...)
(PermuteInt8x32 ...) => (VPERMB256 ...) (PermuteInt8x32 ...) => (VPERMB256 ...)
(PermuteInt8x64 ...) => (VPERMB512 ...) (PermuteInt8x64 ...) => (VPERMB512 ...)
(PermuteInt16x8 ...) => (VPERMW128 ...) (PermuteInt16x8 ...) => (VPERMW128 ...)
@ -804,7 +834,7 @@
(PermuteInt32x16 ...) => (VPERMD512 ...) (PermuteInt32x16 ...) => (VPERMD512 ...)
(PermuteInt64x4 ...) => (VPERMQ256 ...) (PermuteInt64x4 ...) => (VPERMQ256 ...)
(PermuteInt64x8 ...) => (VPERMQ512 ...) (PermuteInt64x8 ...) => (VPERMQ512 ...)
(PermuteUint8x16 ...) => (VPSHUFB128 ...) (PermuteUint8x16 ...) => (VPERMB128 ...)
(PermuteUint8x32 ...) => (VPERMB256 ...) (PermuteUint8x32 ...) => (VPERMB256 ...)
(PermuteUint8x64 ...) => (VPERMB512 ...) (PermuteUint8x64 ...) => (VPERMB512 ...)
(PermuteUint16x8 ...) => (VPERMW128 ...) (PermuteUint16x8 ...) => (VPERMW128 ...)
@ -814,62 +844,12 @@
(PermuteUint32x16 ...) => (VPERMD512 ...) (PermuteUint32x16 ...) => (VPERMD512 ...)
(PermuteUint64x4 ...) => (VPERMQ256 ...) (PermuteUint64x4 ...) => (VPERMQ256 ...)
(PermuteUint64x8 ...) => (VPERMQ512 ...) (PermuteUint64x8 ...) => (VPERMQ512 ...)
(Permute2Float32x4 ...) => (VPERMI2PS128 ...) (PermuteOrZeroInt8x16 ...) => (VPSHUFB128 ...)
(Permute2Float32x8 ...) => (VPERMI2PS256 ...) (PermuteOrZeroUint8x16 ...) => (VPSHUFB128 ...)
(Permute2Float32x16 ...) => (VPERMI2PS512 ...) (PermuteOrZeroGroupedInt8x32 ...) => (VPSHUFB256 ...)
(Permute2Float64x2 ...) => (VPERMI2PD128 ...) (PermuteOrZeroGroupedInt8x64 ...) => (VPSHUFB512 ...)
(Permute2Float64x4 ...) => (VPERMI2PD256 ...) (PermuteOrZeroGroupedUint8x32 ...) => (VPSHUFB256 ...)
(Permute2Float64x8 ...) => (VPERMI2PD512 ...) (PermuteOrZeroGroupedUint8x64 ...) => (VPSHUFB512 ...)
(Permute2Int8x16 ...) => (VPERMI2B128 ...)
(Permute2Int8x32 ...) => (VPERMI2B256 ...)
(Permute2Int8x64 ...) => (VPERMI2B512 ...)
(Permute2Int16x8 ...) => (VPERMI2W128 ...)
(Permute2Int16x16 ...) => (VPERMI2W256 ...)
(Permute2Int16x32 ...) => (VPERMI2W512 ...)
(Permute2Int32x4 ...) => (VPERMI2D128 ...)
(Permute2Int32x8 ...) => (VPERMI2D256 ...)
(Permute2Int32x16 ...) => (VPERMI2D512 ...)
(Permute2Int64x2 ...) => (VPERMI2Q128 ...)
(Permute2Int64x4 ...) => (VPERMI2Q256 ...)
(Permute2Int64x8 ...) => (VPERMI2Q512 ...)
(Permute2Uint8x16 ...) => (VPERMI2B128 ...)
(Permute2Uint8x32 ...) => (VPERMI2B256 ...)
(Permute2Uint8x64 ...) => (VPERMI2B512 ...)
(Permute2Uint16x8 ...) => (VPERMI2W128 ...)
(Permute2Uint16x16 ...) => (VPERMI2W256 ...)
(Permute2Uint16x32 ...) => (VPERMI2W512 ...)
(Permute2Uint32x4 ...) => (VPERMI2D128 ...)
(Permute2Uint32x8 ...) => (VPERMI2D256 ...)
(Permute2Uint32x16 ...) => (VPERMI2D512 ...)
(Permute2Uint64x2 ...) => (VPERMI2Q128 ...)
(Permute2Uint64x4 ...) => (VPERMI2Q256 ...)
(Permute2Uint64x8 ...) => (VPERMI2Q512 ...)
(PermuteConstantInt32x4 ...) => (VPSHUFD128 ...)
(PermuteConstantUint32x4 ...) => (VPSHUFD128 ...)
(PermuteConstantGroupedInt32x8 ...) => (VPSHUFD256 ...)
(PermuteConstantGroupedInt32x16 ...) => (VPSHUFD512 ...)
(PermuteConstantGroupedUint32x8 ...) => (VPSHUFD256 ...)
(PermuteConstantGroupedUint32x16 ...) => (VPSHUFD512 ...)
(PermuteConstantHiInt16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiInt32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiUint16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiUint32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
(PermuteConstantHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
(PermuteConstantLoInt16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoInt32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoUint16x8 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoUint32x4 ...) => (VPSHUFHW128 ...)
(PermuteConstantLoGroupedInt16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantLoGroupedInt16x32 ...) => (VPSHUFHW512 ...)
(PermuteConstantLoGroupedUint16x16 ...) => (VPSHUFHW256 ...)
(PermuteConstantLoGroupedUint16x32 ...) => (VPSHUFHW512 ...)
(PermuteGroupedInt8x32 ...) => (VPSHUFB256 ...)
(PermuteGroupedInt8x64 ...) => (VPSHUFB512 ...)
(PermuteGroupedUint8x32 ...) => (VPSHUFB256 ...)
(PermuteGroupedUint8x64 ...) => (VPSHUFB512 ...)
(ReciprocalFloat32x4 ...) => (VRCPPS128 ...) (ReciprocalFloat32x4 ...) => (VRCPPS128 ...)
(ReciprocalFloat32x8 ...) => (VRCPPS256 ...) (ReciprocalFloat32x8 ...) => (VRCPPS256 ...)
(ReciprocalFloat32x16 ...) => (VRCP14PS512 ...) (ReciprocalFloat32x16 ...) => (VRCP14PS512 ...)
@ -1324,6 +1304,24 @@
(concatSelectedConstantGroupedUint32x16 ...) => (VSHUFPS512 ...) (concatSelectedConstantGroupedUint32x16 ...) => (VSHUFPS512 ...)
(concatSelectedConstantGroupedUint64x4 ...) => (VSHUFPD256 ...) (concatSelectedConstantGroupedUint64x4 ...) => (VSHUFPD256 ...)
(concatSelectedConstantGroupedUint64x8 ...) => (VSHUFPD512 ...) (concatSelectedConstantGroupedUint64x8 ...) => (VSHUFPD512 ...)
(permuteScalarsInt32x4 ...) => (VPSHUFD128 ...)
(permuteScalarsUint32x4 ...) => (VPSHUFD128 ...)
(permuteScalarsGroupedInt32x8 ...) => (VPSHUFD256 ...)
(permuteScalarsGroupedInt32x16 ...) => (VPSHUFD512 ...)
(permuteScalarsGroupedUint32x8 ...) => (VPSHUFD256 ...)
(permuteScalarsGroupedUint32x16 ...) => (VPSHUFD512 ...)
(permuteScalarsHiInt16x8 ...) => (VPSHUFHW128 ...)
(permuteScalarsHiUint16x8 ...) => (VPSHUFHW128 ...)
(permuteScalarsHiGroupedInt16x16 ...) => (VPSHUFHW256 ...)
(permuteScalarsHiGroupedInt16x32 ...) => (VPSHUFHW512 ...)
(permuteScalarsHiGroupedUint16x16 ...) => (VPSHUFHW256 ...)
(permuteScalarsHiGroupedUint16x32 ...) => (VPSHUFHW512 ...)
(permuteScalarsLoInt16x8 ...) => (VPSHUFLW128 ...)
(permuteScalarsLoUint16x8 ...) => (VPSHUFLW128 ...)
(permuteScalarsLoGroupedInt16x16 ...) => (VPSHUFLW256 ...)
(permuteScalarsLoGroupedInt16x32 ...) => (VPSHUFLW512 ...)
(permuteScalarsLoGroupedUint16x16 ...) => (VPSHUFLW256 ...)
(permuteScalarsLoGroupedUint16x32 ...) => (VPSHUFLW512 ...)
(ternInt32x4 ...) => (VPTERNLOGD128 ...) (ternInt32x4 ...) => (VPTERNLOGD128 ...)
(ternInt32x8 ...) => (VPTERNLOGD256 ...) (ternInt32x8 ...) => (VPTERNLOGD256 ...)
(ternInt32x16 ...) => (VPTERNLOGD512 ...) (ternInt32x16 ...) => (VPTERNLOGD512 ...)
@ -1417,6 +1415,24 @@
(VMOVDQU64Masked128 (VREDUCEPD128 [a] x) mask) => (VREDUCEPDMasked128 [a] x mask) (VMOVDQU64Masked128 (VREDUCEPD128 [a] x) mask) => (VREDUCEPDMasked128 [a] x mask)
(VMOVDQU64Masked256 (VREDUCEPD256 [a] x) mask) => (VREDUCEPDMasked256 [a] x mask) (VMOVDQU64Masked256 (VREDUCEPD256 [a] x) mask) => (VREDUCEPDMasked256 [a] x mask)
(VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask) (VMOVDQU64Masked512 (VREDUCEPD512 [a] x) mask) => (VREDUCEPDMasked512 [a] x mask)
(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask)
(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask)
(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask)
(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask)
(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask)
(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask)
(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask)
(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask)
(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask)
(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask)
(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask)
(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask)
(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask)
(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask)
(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask)
(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
(VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) => (VPALIGNRMasked256 [a] x y mask) (VMOVDQU8Masked256 (VPALIGNR256 [a] x y) mask) => (VPALIGNRMasked256 [a] x y mask)
(VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512 [a] x y mask) (VMOVDQU8Masked512 (VPALIGNR512 [a] x y) mask) => (VPALIGNRMasked512 [a] x y mask)
(VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) => (VPALIGNRMasked128 [a] x y mask) (VMOVDQU8Masked128 (VPALIGNR128 [a] x y) mask) => (VPALIGNRMasked128 [a] x y mask)
@ -1668,33 +1684,7 @@
(VMOVDQU64Masked512 (VPOPCNTQ512 x) mask) => (VPOPCNTQMasked512 x mask) (VMOVDQU64Masked512 (VPOPCNTQ512 x) mask) => (VPOPCNTQMasked512 x mask)
(VMOVDQU32Masked512 (VPORD512 x y) mask) => (VPORDMasked512 x y mask) (VMOVDQU32Masked512 (VPORD512 x y) mask) => (VPORDMasked512 x y mask)
(VMOVDQU64Masked512 (VPORQ512 x y) mask) => (VPORQMasked512 x y mask) (VMOVDQU64Masked512 (VPORQ512 x y) mask) => (VPORQMasked512 x y mask)
(VMOVDQU8Masked128 (VPERMI2B128 x y z) mask) => (VPERMI2BMasked128 x y z mask) (VMOVDQU8Masked128 (VPERMB128 x y) mask) => (VPERMBMasked128 x y mask)
(VMOVDQU8Masked256 (VPERMI2B256 x y z) mask) => (VPERMI2BMasked256 x y z mask)
(VMOVDQU8Masked512 (VPERMI2B512 x y z) mask) => (VPERMI2BMasked512 x y z mask)
(VMOVDQU16Masked128 (VPERMI2W128 x y z) mask) => (VPERMI2WMasked128 x y z mask)
(VMOVDQU16Masked256 (VPERMI2W256 x y z) mask) => (VPERMI2WMasked256 x y z mask)
(VMOVDQU16Masked512 (VPERMI2W512 x y z) mask) => (VPERMI2WMasked512 x y z mask)
(VMOVDQU32Masked128 (VPERMI2PS128 x y z) mask) => (VPERMI2PSMasked128 x y z mask)
(VMOVDQU32Masked128 (VPERMI2D128 x y z) mask) => (VPERMI2DMasked128 x y z mask)
(VMOVDQU32Masked256 (VPERMI2PS256 x y z) mask) => (VPERMI2PSMasked256 x y z mask)
(VMOVDQU32Masked256 (VPERMI2D256 x y z) mask) => (VPERMI2DMasked256 x y z mask)
(VMOVDQU32Masked512 (VPERMI2PS512 x y z) mask) => (VPERMI2PSMasked512 x y z mask)
(VMOVDQU32Masked512 (VPERMI2D512 x y z) mask) => (VPERMI2DMasked512 x y z mask)
(VMOVDQU64Masked128 (VPERMI2PD128 x y z) mask) => (VPERMI2PDMasked128 x y z mask)
(VMOVDQU64Masked128 (VPERMI2Q128 x y z) mask) => (VPERMI2QMasked128 x y z mask)
(VMOVDQU64Masked256 (VPERMI2PD256 x y z) mask) => (VPERMI2PDMasked256 x y z mask)
(VMOVDQU64Masked256 (VPERMI2Q256 x y z) mask) => (VPERMI2QMasked256 x y z mask)
(VMOVDQU64Masked512 (VPERMI2PD512 x y z) mask) => (VPERMI2PDMasked512 x y z mask)
(VMOVDQU64Masked512 (VPERMI2Q512 x y z) mask) => (VPERMI2QMasked512 x y z mask)
(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask)
(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask)
(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask)
(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask)
(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask)
(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask)
(VMOVDQU8Masked256 (VPERMB256 x y) mask) => (VPERMBMasked256 x y mask) (VMOVDQU8Masked256 (VPERMB256 x y) mask) => (VPERMBMasked256 x y mask)
(VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask) (VMOVDQU8Masked512 (VPERMB512 x y) mask) => (VPERMBMasked512 x y mask)
(VMOVDQU16Masked128 (VPERMW128 x y) mask) => (VPERMWMasked128 x y mask) (VMOVDQU16Masked128 (VPERMW128 x y) mask) => (VPERMWMasked128 x y mask)
@ -1708,6 +1698,9 @@
(VMOVDQU64Masked256 (VPERMQ256 x y) mask) => (VPERMQMasked256 x y mask) (VMOVDQU64Masked256 (VPERMQ256 x y) mask) => (VPERMQMasked256 x y mask)
(VMOVDQU64Masked512 (VPERMPD512 x y) mask) => (VPERMPDMasked512 x y mask) (VMOVDQU64Masked512 (VPERMPD512 x y) mask) => (VPERMPDMasked512 x y mask)
(VMOVDQU64Masked512 (VPERMQ512 x y) mask) => (VPERMQMasked512 x y mask) (VMOVDQU64Masked512 (VPERMQ512 x y) mask) => (VPERMQMasked512 x y mask)
(VMOVDQU8Masked256 (VPSHUFB256 x y) mask) => (VPSHUFBMasked256 x y mask)
(VMOVDQU8Masked512 (VPSHUFB512 x y) mask) => (VPSHUFBMasked512 x y mask)
(VMOVDQU8Masked128 (VPSHUFB128 x y) mask) => (VPSHUFBMasked128 x y mask)
(VMOVDQU32Masked512 (VRCP14PS512 x) mask) => (VRCP14PSMasked512 x mask) (VMOVDQU32Masked512 (VRCP14PS512 x) mask) => (VRCP14PSMasked512 x mask)
(VMOVDQU64Masked128 (VRCP14PD128 x) mask) => (VRCP14PDMasked128 x mask) (VMOVDQU64Masked128 (VRCP14PD128 x) mask) => (VRCP14PDMasked128 x mask)
(VMOVDQU64Masked256 (VRCP14PD256 x) mask) => (VRCP14PDMasked256 x mask) (VMOVDQU64Masked256 (VRCP14PD256 x) mask) => (VRCP14PDMasked256 x mask)
@ -1874,6 +1867,15 @@
(VMOVDQU16Masked512 (VPSUBUSW512 x y) mask) => (VPSUBUSWMasked512 x y mask) (VMOVDQU16Masked512 (VPSUBUSW512 x y) mask) => (VPSUBUSWMasked512 x y mask)
(VMOVDQU32Masked512 (VPXORD512 x y) mask) => (VPXORDMasked512 x y mask) (VMOVDQU32Masked512 (VPXORD512 x y) mask) => (VPXORDMasked512 x y mask)
(VMOVDQU64Masked512 (VPXORQ512 x y) mask) => (VPXORQMasked512 x y mask) (VMOVDQU64Masked512 (VPXORQ512 x y) mask) => (VPXORQMasked512 x y mask)
(VMOVDQU32Masked256 (VPSHUFD256 [a] x) mask) => (VPSHUFDMasked256 [a] x mask)
(VMOVDQU32Masked512 (VPSHUFD512 [a] x) mask) => (VPSHUFDMasked512 [a] x mask)
(VMOVDQU16Masked256 (VPSHUFHW256 [a] x) mask) => (VPSHUFHWMasked256 [a] x mask)
(VMOVDQU16Masked512 (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512 [a] x mask)
(VMOVDQU16Masked128 (VPSHUFHW128 [a] x) mask) => (VPSHUFHWMasked128 [a] x mask)
(VMOVDQU16Masked256 (VPSHUFLW256 [a] x) mask) => (VPSHUFLWMasked256 [a] x mask)
(VMOVDQU16Masked512 (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512 [a] x mask)
(VMOVDQU16Masked128 (VPSHUFLW128 [a] x) mask) => (VPSHUFLWMasked128 [a] x mask)
(VMOVDQU32Masked128 (VPSHUFD128 [a] x) mask) => (VPSHUFDMasked128 [a] x mask)
(VMOVDQU16Masked128 (VPSLLW128const [a] x) mask) => (VPSLLWMasked128const [a] x mask) (VMOVDQU16Masked128 (VPSLLW128const [a] x) mask) => (VPSLLWMasked128const [a] x mask)
(VMOVDQU16Masked256 (VPSLLW256const [a] x) mask) => (VPSLLWMasked256const [a] x mask) (VMOVDQU16Masked256 (VPSLLW256const [a] x) mask) => (VPSLLWMasked256const [a] x mask)
(VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) => (VPSLLWMasked512const [a] x mask) (VMOVDQU16Masked512 (VPSLLW512const [a] x) mask) => (VPSLLWMasked512const [a] x mask)
@ -2021,6 +2023,7 @@
(VPBLENDMWMasked512 dst (VPSHLDW512 [a] x y) mask) => (VPSHLDWMasked512Merging dst [a] x y mask) (VPBLENDMWMasked512 dst (VPSHLDW512 [a] x y) mask) => (VPSHLDWMasked512Merging dst [a] x y mask)
(VPBLENDMWMasked512 dst (VPSHRDW512 [a] x y) mask) => (VPSHRDWMasked512Merging dst [a] x y mask) (VPBLENDMWMasked512 dst (VPSHRDW512 [a] x y) mask) => (VPSHRDWMasked512Merging dst [a] x y mask)
(VPBLENDMWMasked512 dst (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512Merging dst [a] x mask) (VPBLENDMWMasked512 dst (VPSHUFHW512 [a] x) mask) => (VPSHUFHWMasked512Merging dst [a] x mask)
(VPBLENDMWMasked512 dst (VPSHUFLW512 [a] x) mask) => (VPSHUFLWMasked512Merging dst [a] x mask)
(VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask) => (VPSLLVWMasked512Merging dst x y mask) (VPBLENDMWMasked512 dst (VPSLLVW512 x y) mask) => (VPSLLVWMasked512Merging dst x y mask)
(VPBLENDMWMasked512 dst (VPSLLW512const [a] x) mask) => (VPSLLWMasked512constMerging dst [a] x mask) (VPBLENDMWMasked512 dst (VPSLLW512const [a] x) mask) => (VPSLLWMasked512constMerging dst [a] x mask)
(VPBLENDMWMasked512 dst (VPSRAVW512 x y) mask) => (VPSRAVWMasked512Merging dst x y mask) (VPBLENDMWMasked512 dst (VPSRAVW512 x y) mask) => (VPSRAVWMasked512Merging dst x y mask)
@ -2170,6 +2173,7 @@
(VPBLENDVB128 dst (VPSHUFB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask)) (VPBLENDVB128 dst (VPSHUFB128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked128Merging dst x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFD128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked128Merging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask)) (VPBLENDVB128 dst (VPSHUFD128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked128Merging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFHW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask)) (VPBLENDVB128 dst (VPSHUFHW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSHUFLW128 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked128Merging dst [a] x (VPMOVVec16x8ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSLLD128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask)) (VPBLENDVB128 dst (VPSLLD128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked128constMerging dst [a] x (VPMOVVec32x4ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSLLQ128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked128constMerging dst [a] x (VPMOVVec64x2ToM <types.TypeMask> mask)) (VPBLENDVB128 dst (VPSLLQ128const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked128constMerging dst [a] x (VPMOVVec64x2ToM <types.TypeMask> mask))
(VPBLENDVB128 dst (VPSLLVD128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked128Merging dst x y (VPMOVVec32x4ToM <types.TypeMask> mask)) (VPBLENDVB128 dst (VPSLLVD128 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked128Merging dst x y (VPMOVVec32x4ToM <types.TypeMask> mask))
@ -2305,6 +2309,7 @@
(VPBLENDVB256 dst (VPSHUFB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPSHUFB256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFBMasked256Merging dst x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFD256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked256Merging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPSHUFD256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFDMasked256Merging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFHW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPSHUFHW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFHWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSHUFLW256 [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSHUFLWMasked256Merging dst [a] x (VPMOVVec16x16ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSLLD256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPSLLD256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLDMasked256constMerging dst [a] x (VPMOVVec32x8ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSLLQ256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked256constMerging dst [a] x (VPMOVVec64x4ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPSLLQ256const [a] x) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLQMasked256constMerging dst [a] x (VPMOVVec64x4ToM <types.TypeMask> mask))
(VPBLENDVB256 dst (VPSLLVD256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked256Merging dst x y (VPMOVVec32x8ToM <types.TypeMask> mask)) (VPBLENDVB256 dst (VPSLLVD256 x y) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (VPSLLVDMasked256Merging dst x y (VPMOVVec32x8ToM <types.TypeMask> mask))
@ -2410,6 +2415,30 @@
(VREDUCEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VREDUCEPDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VREDUCEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VREDUCEPDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VREDUCEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VREDUCEPDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VREDUCEPDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem)
(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem)
(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem)
(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem)
(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem)
(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem)
(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem)
(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem)
(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem)
(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem)
(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem)
(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem)
(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem)
(VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem) (VPACKSSDW512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDW512load {sym} [off] x ptr mem)
(VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem) (VPACKSSDWMasked128 x l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked128load {sym} [off] x ptr mask mem)
(VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem) (VPACKSSDWMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPACKSSDWMasked256load {sym} [off] x ptr mask mem)
@ -2636,34 +2665,6 @@
(VPERMQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ256load {sym} [off] x ptr mem) (VPERMQ256 x l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ256load {sym} [off] x ptr mem)
(VPERMPD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMPD512load {sym} [off] x ptr mem) (VPERMPD512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMPD512load {sym} [off] x ptr mem)
(VPERMQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ512load {sym} [off] x ptr mem) (VPERMQ512 x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMQ512load {sym} [off] x ptr mem)
(VPERMI2PS128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS128load {sym} [off] x y ptr mem)
(VPERMI2D128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D128load {sym} [off] x y ptr mem)
(VPERMI2PS256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS256load {sym} [off] x y ptr mem)
(VPERMI2D256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D256load {sym} [off] x y ptr mem)
(VPERMI2PS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PS512load {sym} [off] x y ptr mem)
(VPERMI2D512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2D512load {sym} [off] x y ptr mem)
(VPERMI2PD128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD128load {sym} [off] x y ptr mem)
(VPERMI2Q128 x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q128load {sym} [off] x y ptr mem)
(VPERMI2PD256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD256load {sym} [off] x y ptr mem)
(VPERMI2Q256 x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q256load {sym} [off] x y ptr mem)
(VPERMI2PD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PD512load {sym} [off] x y ptr mem)
(VPERMI2Q512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPERMI2Q512load {sym} [off] x y ptr mem)
(VPERMI2PSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2PSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2PSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PSMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2DMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2DMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked128load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked256load {sym} [off] x y ptr mask mem)
(VPERMI2PDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2PDMasked512load {sym} [off] x y ptr mask mem)
(VPERMI2QMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMI2QMasked512load {sym} [off] x y ptr mask mem)
(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPERMPSMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked256load {sym} [off] x ptr mask mem) (VPERMPSMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked256load {sym} [off] x ptr mask mem)
(VPERMDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMDMasked256load {sym} [off] x ptr mask mem) (VPERMDMasked256 x l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMDMasked256load {sym} [off] x ptr mask mem)
(VPERMPSMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked512load {sym} [off] x ptr mask mem) (VPERMPSMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPERMPSMasked512load {sym} [off] x ptr mask mem)
@ -2862,6 +2863,10 @@
(VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem) (VPBLENDMQMasked512 x l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPBLENDMQMasked512load {sym} [off] x ptr mask mem)
(VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) (VSHUFPS512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPS512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
(VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem) (VSHUFPD512 [c] x l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VSHUFPD512load {sym} [makeValAndOff(int32(int8(c)),off)] x ptr mem)
(VPSHUFD512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSHUFD512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSHUFDMasked256 [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked256load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSHUFDMasked512 [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked512load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSHUFDMasked128 [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSHUFDMasked128load {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)
(VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) (VPSLLD512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLD512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem) (VPSLLQ512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPSLLQ512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mem)
(VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VPSLLDMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSLLDMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem)

View file

@ -383,8 +383,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPDPWSSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPDPWSSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPDPWSSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPDPWSSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPDPWSSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPDPWSSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
@ -1310,6 +1312,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHUFHWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHUFHWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFHWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHUFHWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFHWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSHUFHWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHUFLW128", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFLW256", argLength: 1, reg: v11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFLW512", argLength: 1, reg: w11, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHUFLWMasked128", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHUFLWMasked256", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHUFLWMasked512", argLength: 2, reg: wkw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSLLD128const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSLLD128const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSLLD256const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSLLD256const", argLength: 1, reg: v11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSLLD512const", argLength: 1, reg: w11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSLLD512const", argLength: 1, reg: w11, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
@ -2392,6 +2400,9 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHUFHWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPSHUFHWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSHUFHWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPSHUFHWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSHUFHWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPSHUFHWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFHW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPSHUFLWMasked128Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSHUFLWMasked256Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSHUFLWMasked512Merging", argLength: 3, reg: w2kw, asm: "VPSHUFLW", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPSLLDMasked128constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPSLLDMasked128constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSLLDMasked256constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPSLLDMasked256constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPSLLDMasked512constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPSLLDMasked512constMerging", argLength: 3, reg: w2kw, asm: "VPSLLD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true},

View file

@ -207,6 +207,36 @@ func simdGenericOps() []opData {
{name: "CompressUint64x2", argLength: 2, commutative: false}, {name: "CompressUint64x2", argLength: 2, commutative: false},
{name: "CompressUint64x4", argLength: 2, commutative: false}, {name: "CompressUint64x4", argLength: 2, commutative: false},
{name: "CompressUint64x8", argLength: 2, commutative: false}, {name: "CompressUint64x8", argLength: 2, commutative: false},
{name: "ConcatPermuteFloat32x4", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat32x8", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat32x16", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat64x2", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat64x4", argLength: 3, commutative: false},
{name: "ConcatPermuteFloat64x8", argLength: 3, commutative: false},
{name: "ConcatPermuteInt8x16", argLength: 3, commutative: false},
{name: "ConcatPermuteInt8x32", argLength: 3, commutative: false},
{name: "ConcatPermuteInt8x64", argLength: 3, commutative: false},
{name: "ConcatPermuteInt16x8", argLength: 3, commutative: false},
{name: "ConcatPermuteInt16x16", argLength: 3, commutative: false},
{name: "ConcatPermuteInt16x32", argLength: 3, commutative: false},
{name: "ConcatPermuteInt32x4", argLength: 3, commutative: false},
{name: "ConcatPermuteInt32x8", argLength: 3, commutative: false},
{name: "ConcatPermuteInt32x16", argLength: 3, commutative: false},
{name: "ConcatPermuteInt64x2", argLength: 3, commutative: false},
{name: "ConcatPermuteInt64x4", argLength: 3, commutative: false},
{name: "ConcatPermuteInt64x8", argLength: 3, commutative: false},
{name: "ConcatPermuteUint8x16", argLength: 3, commutative: false},
{name: "ConcatPermuteUint8x32", argLength: 3, commutative: false},
{name: "ConcatPermuteUint8x64", argLength: 3, commutative: false},
{name: "ConcatPermuteUint16x8", argLength: 3, commutative: false},
{name: "ConcatPermuteUint16x16", argLength: 3, commutative: false},
{name: "ConcatPermuteUint16x32", argLength: 3, commutative: false},
{name: "ConcatPermuteUint32x4", argLength: 3, commutative: false},
{name: "ConcatPermuteUint32x8", argLength: 3, commutative: false},
{name: "ConcatPermuteUint32x16", argLength: 3, commutative: false},
{name: "ConcatPermuteUint64x2", argLength: 3, commutative: false},
{name: "ConcatPermuteUint64x4", argLength: 3, commutative: false},
{name: "ConcatPermuteUint64x8", argLength: 3, commutative: false},
{name: "ConvertToInt8Int16x8", argLength: 1, commutative: false}, {name: "ConvertToInt8Int16x8", argLength: 1, commutative: false},
{name: "ConvertToInt8Int16x16", argLength: 1, commutative: false}, {name: "ConvertToInt8Int16x16", argLength: 1, commutative: false},
{name: "ConvertToInt8Int16x32", argLength: 1, commutative: false}, {name: "ConvertToInt8Int16x32", argLength: 1, commutative: false},
@ -750,44 +780,10 @@ func simdGenericOps() []opData {
{name: "OrUint64x2", argLength: 2, commutative: true}, {name: "OrUint64x2", argLength: 2, commutative: true},
{name: "OrUint64x4", argLength: 2, commutative: true}, {name: "OrUint64x4", argLength: 2, commutative: true},
{name: "OrUint64x8", argLength: 2, commutative: true}, {name: "OrUint64x8", argLength: 2, commutative: true},
{name: "Permute2Float32x4", argLength: 3, commutative: false},
{name: "Permute2Float32x8", argLength: 3, commutative: false},
{name: "Permute2Float32x16", argLength: 3, commutative: false},
{name: "Permute2Float64x2", argLength: 3, commutative: false},
{name: "Permute2Float64x4", argLength: 3, commutative: false},
{name: "Permute2Float64x8", argLength: 3, commutative: false},
{name: "Permute2Int8x16", argLength: 3, commutative: false},
{name: "Permute2Int8x32", argLength: 3, commutative: false},
{name: "Permute2Int8x64", argLength: 3, commutative: false},
{name: "Permute2Int16x8", argLength: 3, commutative: false},
{name: "Permute2Int16x16", argLength: 3, commutative: false},
{name: "Permute2Int16x32", argLength: 3, commutative: false},
{name: "Permute2Int32x4", argLength: 3, commutative: false},
{name: "Permute2Int32x8", argLength: 3, commutative: false},
{name: "Permute2Int32x16", argLength: 3, commutative: false},
{name: "Permute2Int64x2", argLength: 3, commutative: false},
{name: "Permute2Int64x4", argLength: 3, commutative: false},
{name: "Permute2Int64x8", argLength: 3, commutative: false},
{name: "Permute2Uint8x16", argLength: 3, commutative: false},
{name: "Permute2Uint8x32", argLength: 3, commutative: false},
{name: "Permute2Uint8x64", argLength: 3, commutative: false},
{name: "Permute2Uint16x8", argLength: 3, commutative: false},
{name: "Permute2Uint16x16", argLength: 3, commutative: false},
{name: "Permute2Uint16x32", argLength: 3, commutative: false},
{name: "Permute2Uint32x4", argLength: 3, commutative: false},
{name: "Permute2Uint32x8", argLength: 3, commutative: false},
{name: "Permute2Uint32x16", argLength: 3, commutative: false},
{name: "Permute2Uint64x2", argLength: 3, commutative: false},
{name: "Permute2Uint64x4", argLength: 3, commutative: false},
{name: "Permute2Uint64x8", argLength: 3, commutative: false},
{name: "PermuteFloat32x8", argLength: 2, commutative: false}, {name: "PermuteFloat32x8", argLength: 2, commutative: false},
{name: "PermuteFloat32x16", argLength: 2, commutative: false}, {name: "PermuteFloat32x16", argLength: 2, commutative: false},
{name: "PermuteFloat64x4", argLength: 2, commutative: false}, {name: "PermuteFloat64x4", argLength: 2, commutative: false},
{name: "PermuteFloat64x8", argLength: 2, commutative: false}, {name: "PermuteFloat64x8", argLength: 2, commutative: false},
{name: "PermuteGroupedInt8x32", argLength: 2, commutative: false},
{name: "PermuteGroupedInt8x64", argLength: 2, commutative: false},
{name: "PermuteGroupedUint8x32", argLength: 2, commutative: false},
{name: "PermuteGroupedUint8x64", argLength: 2, commutative: false},
{name: "PermuteInt8x16", argLength: 2, commutative: false}, {name: "PermuteInt8x16", argLength: 2, commutative: false},
{name: "PermuteInt8x32", argLength: 2, commutative: false}, {name: "PermuteInt8x32", argLength: 2, commutative: false},
{name: "PermuteInt8x64", argLength: 2, commutative: false}, {name: "PermuteInt8x64", argLength: 2, commutative: false},
@ -798,6 +794,12 @@ func simdGenericOps() []opData {
{name: "PermuteInt32x16", argLength: 2, commutative: false}, {name: "PermuteInt32x16", argLength: 2, commutative: false},
{name: "PermuteInt64x4", argLength: 2, commutative: false}, {name: "PermuteInt64x4", argLength: 2, commutative: false},
{name: "PermuteInt64x8", argLength: 2, commutative: false}, {name: "PermuteInt64x8", argLength: 2, commutative: false},
{name: "PermuteOrZeroGroupedInt8x32", argLength: 2, commutative: false},
{name: "PermuteOrZeroGroupedInt8x64", argLength: 2, commutative: false},
{name: "PermuteOrZeroGroupedUint8x32", argLength: 2, commutative: false},
{name: "PermuteOrZeroGroupedUint8x64", argLength: 2, commutative: false},
{name: "PermuteOrZeroInt8x16", argLength: 2, commutative: false},
{name: "PermuteOrZeroUint8x16", argLength: 2, commutative: false},
{name: "PermuteUint8x16", argLength: 2, commutative: false}, {name: "PermuteUint8x16", argLength: 2, commutative: false},
{name: "PermuteUint8x32", argLength: 2, commutative: false}, {name: "PermuteUint8x32", argLength: 2, commutative: false},
{name: "PermuteUint8x64", argLength: 2, commutative: false}, {name: "PermuteUint8x64", argLength: 2, commutative: false},
@ -1151,28 +1153,6 @@ func simdGenericOps() []opData {
{name: "GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"}, {name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantHiUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantLoUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "PermuteConstantUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RotateAllLeftInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
@ -1292,6 +1272,24 @@ func simdGenericOps() []opData {
{name: "concatSelectedConstantInt64x2", argLength: 2, commutative: false, aux: "UInt8"}, {name: "concatSelectedConstantInt64x2", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantUint32x4", argLength: 2, commutative: false, aux: "UInt8"}, {name: "concatSelectedConstantUint32x4", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantUint64x2", argLength: 2, commutative: false, aux: "UInt8"}, {name: "concatSelectedConstantUint64x2", argLength: 2, commutative: false, aux: "UInt8"},
{name: "permuteScalarsGroupedInt32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsGroupedInt32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsGroupedUint32x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsGroupedUint32x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsHiUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsInt32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoGroupedInt16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoGroupedInt16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoGroupedUint16x16", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoGroupedUint16x32", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoInt16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsLoUint16x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "permuteScalarsUint32x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "ternInt32x4", argLength: 3, commutative: false, aux: "UInt8"}, {name: "ternInt32x4", argLength: 3, commutative: false, aux: "UInt8"},
{name: "ternInt32x8", argLength: 3, commutative: false, aux: "UInt8"}, {name: "ternInt32x8", argLength: 3, commutative: false, aux: "UInt8"},
{name: "ternInt32x16", argLength: 3, commutative: false, aux: "UInt8"}, {name: "ternInt32x16", argLength: 3, commutative: false, aux: "UInt8"},

View file

@ -1624,8 +1624,10 @@ const (
OpAMD64VPDPWSSDMasked128 OpAMD64VPDPWSSDMasked128
OpAMD64VPDPWSSDMasked256 OpAMD64VPDPWSSDMasked256
OpAMD64VPDPWSSDMasked512 OpAMD64VPDPWSSDMasked512
OpAMD64VPERMB128
OpAMD64VPERMB256 OpAMD64VPERMB256
OpAMD64VPERMB512 OpAMD64VPERMB512
OpAMD64VPERMBMasked128
OpAMD64VPERMBMasked256 OpAMD64VPERMBMasked256
OpAMD64VPERMBMasked512 OpAMD64VPERMBMasked512
OpAMD64VPERMD256 OpAMD64VPERMD256
@ -2551,6 +2553,12 @@ const (
OpAMD64VPSHUFHWMasked128 OpAMD64VPSHUFHWMasked128
OpAMD64VPSHUFHWMasked256 OpAMD64VPSHUFHWMasked256
OpAMD64VPSHUFHWMasked512 OpAMD64VPSHUFHWMasked512
OpAMD64VPSHUFLW128
OpAMD64VPSHUFLW256
OpAMD64VPSHUFLW512
OpAMD64VPSHUFLWMasked128
OpAMD64VPSHUFLWMasked256
OpAMD64VPSHUFLWMasked512
OpAMD64VPSLLD128const OpAMD64VPSLLD128const
OpAMD64VPSLLD256const OpAMD64VPSLLD256const
OpAMD64VPSLLD512const OpAMD64VPSLLD512const
@ -3633,6 +3641,9 @@ const (
OpAMD64VPSHUFHWMasked128Merging OpAMD64VPSHUFHWMasked128Merging
OpAMD64VPSHUFHWMasked256Merging OpAMD64VPSHUFHWMasked256Merging
OpAMD64VPSHUFHWMasked512Merging OpAMD64VPSHUFHWMasked512Merging
OpAMD64VPSHUFLWMasked128Merging
OpAMD64VPSHUFLWMasked256Merging
OpAMD64VPSHUFLWMasked512Merging
OpAMD64VPSLLDMasked128constMerging OpAMD64VPSLLDMasked128constMerging
OpAMD64VPSLLDMasked256constMerging OpAMD64VPSLLDMasked256constMerging
OpAMD64VPSLLDMasked512constMerging OpAMD64VPSLLDMasked512constMerging
@ -6155,6 +6166,36 @@ const (
OpCompressUint64x2 OpCompressUint64x2
OpCompressUint64x4 OpCompressUint64x4
OpCompressUint64x8 OpCompressUint64x8
OpConcatPermuteFloat32x4
OpConcatPermuteFloat32x8
OpConcatPermuteFloat32x16
OpConcatPermuteFloat64x2
OpConcatPermuteFloat64x4
OpConcatPermuteFloat64x8
OpConcatPermuteInt8x16
OpConcatPermuteInt8x32
OpConcatPermuteInt8x64
OpConcatPermuteInt16x8
OpConcatPermuteInt16x16
OpConcatPermuteInt16x32
OpConcatPermuteInt32x4
OpConcatPermuteInt32x8
OpConcatPermuteInt32x16
OpConcatPermuteInt64x2
OpConcatPermuteInt64x4
OpConcatPermuteInt64x8
OpConcatPermuteUint8x16
OpConcatPermuteUint8x32
OpConcatPermuteUint8x64
OpConcatPermuteUint16x8
OpConcatPermuteUint16x16
OpConcatPermuteUint16x32
OpConcatPermuteUint32x4
OpConcatPermuteUint32x8
OpConcatPermuteUint32x16
OpConcatPermuteUint64x2
OpConcatPermuteUint64x4
OpConcatPermuteUint64x8
OpConvertToInt8Int16x8 OpConvertToInt8Int16x8
OpConvertToInt8Int16x16 OpConvertToInt8Int16x16
OpConvertToInt8Int16x32 OpConvertToInt8Int16x32
@ -6698,44 +6739,10 @@ const (
OpOrUint64x2 OpOrUint64x2
OpOrUint64x4 OpOrUint64x4
OpOrUint64x8 OpOrUint64x8
OpPermute2Float32x4
OpPermute2Float32x8
OpPermute2Float32x16
OpPermute2Float64x2
OpPermute2Float64x4
OpPermute2Float64x8
OpPermute2Int8x16
OpPermute2Int8x32
OpPermute2Int8x64
OpPermute2Int16x8
OpPermute2Int16x16
OpPermute2Int16x32
OpPermute2Int32x4
OpPermute2Int32x8
OpPermute2Int32x16
OpPermute2Int64x2
OpPermute2Int64x4
OpPermute2Int64x8
OpPermute2Uint8x16
OpPermute2Uint8x32
OpPermute2Uint8x64
OpPermute2Uint16x8
OpPermute2Uint16x16
OpPermute2Uint16x32
OpPermute2Uint32x4
OpPermute2Uint32x8
OpPermute2Uint32x16
OpPermute2Uint64x2
OpPermute2Uint64x4
OpPermute2Uint64x8
OpPermuteFloat32x8 OpPermuteFloat32x8
OpPermuteFloat32x16 OpPermuteFloat32x16
OpPermuteFloat64x4 OpPermuteFloat64x4
OpPermuteFloat64x8 OpPermuteFloat64x8
OpPermuteGroupedInt8x32
OpPermuteGroupedInt8x64
OpPermuteGroupedUint8x32
OpPermuteGroupedUint8x64
OpPermuteInt8x16 OpPermuteInt8x16
OpPermuteInt8x32 OpPermuteInt8x32
OpPermuteInt8x64 OpPermuteInt8x64
@ -6746,6 +6753,12 @@ const (
OpPermuteInt32x16 OpPermuteInt32x16
OpPermuteInt64x4 OpPermuteInt64x4
OpPermuteInt64x8 OpPermuteInt64x8
OpPermuteOrZeroGroupedInt8x32
OpPermuteOrZeroGroupedInt8x64
OpPermuteOrZeroGroupedUint8x32
OpPermuteOrZeroGroupedUint8x64
OpPermuteOrZeroInt8x16
OpPermuteOrZeroUint8x16
OpPermuteUint8x16 OpPermuteUint8x16
OpPermuteUint8x32 OpPermuteUint8x32
OpPermuteUint8x64 OpPermuteUint8x64
@ -7099,28 +7112,6 @@ const (
OpGetElemUint16x8 OpGetElemUint16x8
OpGetElemUint32x4 OpGetElemUint32x4
OpGetElemUint64x2 OpGetElemUint64x2
OpPermuteConstantGroupedInt32x8
OpPermuteConstantGroupedInt32x16
OpPermuteConstantGroupedUint32x8
OpPermuteConstantGroupedUint32x16
OpPermuteConstantHiGroupedInt16x16
OpPermuteConstantHiGroupedInt16x32
OpPermuteConstantHiGroupedUint16x16
OpPermuteConstantHiGroupedUint16x32
OpPermuteConstantHiInt16x8
OpPermuteConstantHiInt32x4
OpPermuteConstantHiUint16x8
OpPermuteConstantHiUint32x4
OpPermuteConstantInt32x4
OpPermuteConstantLoGroupedInt16x16
OpPermuteConstantLoGroupedInt16x32
OpPermuteConstantLoGroupedUint16x16
OpPermuteConstantLoGroupedUint16x32
OpPermuteConstantLoInt16x8
OpPermuteConstantLoInt32x4
OpPermuteConstantLoUint16x8
OpPermuteConstantLoUint32x4
OpPermuteConstantUint32x4
OpRotateAllLeftInt32x4 OpRotateAllLeftInt32x4
OpRotateAllLeftInt32x8 OpRotateAllLeftInt32x8
OpRotateAllLeftInt32x16 OpRotateAllLeftInt32x16
@ -7240,6 +7231,24 @@ const (
OpconcatSelectedConstantInt64x2 OpconcatSelectedConstantInt64x2
OpconcatSelectedConstantUint32x4 OpconcatSelectedConstantUint32x4
OpconcatSelectedConstantUint64x2 OpconcatSelectedConstantUint64x2
OppermuteScalarsGroupedInt32x8
OppermuteScalarsGroupedInt32x16
OppermuteScalarsGroupedUint32x8
OppermuteScalarsGroupedUint32x16
OppermuteScalarsHiGroupedInt16x16
OppermuteScalarsHiGroupedInt16x32
OppermuteScalarsHiGroupedUint16x16
OppermuteScalarsHiGroupedUint16x32
OppermuteScalarsHiInt16x8
OppermuteScalarsHiUint16x8
OppermuteScalarsInt32x4
OppermuteScalarsLoGroupedInt16x16
OppermuteScalarsLoGroupedInt16x32
OppermuteScalarsLoGroupedUint16x16
OppermuteScalarsLoGroupedUint16x32
OppermuteScalarsLoInt16x8
OppermuteScalarsLoUint16x8
OppermuteScalarsUint32x4
OpternInt32x4 OpternInt32x4
OpternInt32x8 OpternInt32x8
OpternInt32x16 OpternInt32x16
@ -26142,6 +26151,20 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "VPERMB128",
argLen: 2,
asm: x86.AVPERMB,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{ {
name: "VPERMB256", name: "VPERMB256",
argLen: 2, argLen: 2,
@ -26170,6 +26193,21 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "VPERMBMasked128",
argLen: 3,
asm: x86.AVPERMB,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{ {
name: "VPERMBMasked256", name: "VPERMBMasked256",
argLen: 3, argLen: 3,
@ -39744,6 +39782,93 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "VPSHUFLW128",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLW256",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPSHUFLW512",
auxType: auxUInt8,
argLen: 1,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked128",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked256",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked512",
auxType: auxUInt8,
argLen: 2,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{ {
name: "VPSLLD128const", name: "VPSLLD128const",
auxType: auxUInt8, auxType: auxUInt8,
@ -57607,6 +57732,57 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "VPSHUFLWMasked128Merging",
auxType: auxUInt8,
argLen: 3,
resultInArg0: true,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked256Merging",
auxType: auxUInt8,
argLen: 3,
resultInArg0: true,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPSHUFLWMasked512Merging",
auxType: auxUInt8,
argLen: 3,
resultInArg0: true,
asm: x86.AVPSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{ {
name: "VPSLLDMasked128constMerging", name: "VPSLLDMasked128constMerging",
auxType: auxUInt8, auxType: auxUInt8,
@ -86874,6 +87050,156 @@ var opcodeTable = [...]opInfo{
argLen: 2, argLen: 2,
generic: true, generic: true,
}, },
{
name: "ConcatPermuteFloat32x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat32x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat32x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat64x2",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat64x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteFloat64x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt8x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt8x32",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt8x64",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt16x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt16x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt16x32",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt32x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt32x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt32x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt64x2",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt64x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteInt64x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint8x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint8x32",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint8x64",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint16x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint16x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint16x32",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint32x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint32x8",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint32x16",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint64x2",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint64x4",
argLen: 3,
generic: true,
},
{
name: "ConcatPermuteUint64x8",
argLen: 3,
generic: true,
},
{ {
name: "ConvertToInt8Int16x8", name: "ConvertToInt8Int16x8",
argLen: 1, argLen: 1,
@ -89757,156 +90083,6 @@ var opcodeTable = [...]opInfo{
commutative: true, commutative: true,
generic: true, generic: true,
}, },
{
name: "Permute2Float32x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Float32x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Float32x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Float64x2",
argLen: 3,
generic: true,
},
{
name: "Permute2Float64x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Float64x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Int8x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Int8x32",
argLen: 3,
generic: true,
},
{
name: "Permute2Int8x64",
argLen: 3,
generic: true,
},
{
name: "Permute2Int16x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Int16x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Int16x32",
argLen: 3,
generic: true,
},
{
name: "Permute2Int32x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Int32x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Int32x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Int64x2",
argLen: 3,
generic: true,
},
{
name: "Permute2Int64x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Int64x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint8x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint8x32",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint8x64",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint16x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint16x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint16x32",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint32x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint32x8",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint32x16",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint64x2",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint64x4",
argLen: 3,
generic: true,
},
{
name: "Permute2Uint64x8",
argLen: 3,
generic: true,
},
{ {
name: "PermuteFloat32x8", name: "PermuteFloat32x8",
argLen: 2, argLen: 2,
@ -89927,26 +90103,6 @@ var opcodeTable = [...]opInfo{
argLen: 2, argLen: 2,
generic: true, generic: true,
}, },
{
name: "PermuteGroupedInt8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedInt8x64",
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedUint8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteGroupedUint8x64",
argLen: 2,
generic: true,
},
{ {
name: "PermuteInt8x16", name: "PermuteInt8x16",
argLen: 2, argLen: 2,
@ -89997,6 +90153,36 @@ var opcodeTable = [...]opInfo{
argLen: 2, argLen: 2,
generic: true, generic: true,
}, },
{
name: "PermuteOrZeroGroupedInt8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroGroupedInt8x64",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroGroupedUint8x32",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroGroupedUint8x64",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroInt8x16",
argLen: 2,
generic: true,
},
{
name: "PermuteOrZeroUint8x16",
argLen: 2,
generic: true,
},
{ {
name: "PermuteUint8x16", name: "PermuteUint8x16",
argLen: 2, argLen: 2,
@ -91830,138 +92016,6 @@ var opcodeTable = [...]opInfo{
argLen: 1, argLen: 1,
generic: true, generic: true,
}, },
{
name: "PermuteConstantGroupedInt32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedInt32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedUint32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantGroupedUint32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantHiUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantLoUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "PermuteConstantUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{ {
name: "RotateAllLeftInt32x4", name: "RotateAllLeftInt32x4",
auxType: auxUInt8, auxType: auxUInt8,
@ -92676,6 +92730,114 @@ var opcodeTable = [...]opInfo{
argLen: 2, argLen: 2,
generic: true, generic: true,
}, },
{
name: "permuteScalarsGroupedInt32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsGroupedInt32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsGroupedUint32x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsGroupedUint32x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsHiUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsInt32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoGroupedInt16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoGroupedInt16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoGroupedUint16x16",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoGroupedUint16x32",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoInt16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsLoUint16x8",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{
name: "permuteScalarsUint32x4",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
{ {
name: "ternInt32x4", name: "ternInt32x4",
auxType: auxUInt8, auxType: auxUInt8,

File diff suppressed because it is too large Load diff

View file

@ -228,6 +228,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x2.Compress", opLen2(ssa.OpCompressUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x4.Compress", opLen2(ssa.OpCompressUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x8.Compress", opLen2(ssa.OpCompressUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x64.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x32.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x16.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x2.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x4.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.ConcatPermute", opLen3_231(ssa.OpConcatPermuteUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Uint8x16.ConcatShiftBytesRight", opLen2Imm8(ssa.OpConcatShiftBytesRightUint8x16, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Uint8x32.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x32, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64) addF(simdPackage, "Uint8x64.ConcatShiftBytesRightGrouped", opLen2Imm8(ssa.OpConcatShiftBytesRightGroupedUint8x64, types.TypeVec512, 0), sys.AMD64)
@ -802,8 +832,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.Permute", opLen2(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.Permute", opLen2(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64)
@ -826,62 +856,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float64x8.Permute", opLen2_21(ssa.OpPermuteFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x8.Permute", opLen2_21(ssa.OpPermuteFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Permute", opLen2_21(ssa.OpPermuteInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int64x8.Permute", opLen2_21(ssa.OpPermuteInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.Permute", opLen2_21(ssa.OpPermuteUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x8.Permute", opLen2_21(ssa.OpPermuteUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.Permute2", opLen3_231(ssa.OpPermute2Int8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x16.Permute2", opLen3_231(ssa.OpPermute2Uint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x16.PermuteOrZero", opLen2(ssa.OpPermuteOrZeroUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Permute2", opLen3_231(ssa.OpPermute2Int8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x32.Permute2", opLen3_231(ssa.OpPermute2Uint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x64.Permute2", opLen3_231(ssa.OpPermute2Int8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x32.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.Permute2", opLen3_231(ssa.OpPermute2Uint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x64.PermuteOrZeroGrouped", opLen2(ssa.OpPermuteOrZeroGroupedUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.Permute2", opLen3_231(ssa.OpPermute2Int16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x8.Permute2", opLen3_231(ssa.OpPermute2Uint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.Permute2", opLen3_231(ssa.OpPermute2Int16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x16.Permute2", opLen3_231(ssa.OpPermute2Uint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.Permute2", opLen3_231(ssa.OpPermute2Int16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x32.Permute2", opLen3_231(ssa.OpPermute2Uint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Permute2", opLen3_231(ssa.OpPermute2Float32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.Permute2", opLen3_231(ssa.OpPermute2Int32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.Permute2", opLen3_231(ssa.OpPermute2Uint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Permute2", opLen3_231(ssa.OpPermute2Float32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x8.Permute2", opLen3_231(ssa.OpPermute2Int32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x8.Permute2", opLen3_231(ssa.OpPermute2Uint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Permute2", opLen3_231(ssa.OpPermute2Float32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.Permute2", opLen3_231(ssa.OpPermute2Int32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x16.Permute2", opLen3_231(ssa.OpPermute2Uint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float64x2.Permute2", opLen3_231(ssa.OpPermute2Float64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x2.Permute2", opLen3_231(ssa.OpPermute2Int64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x2.Permute2", opLen3_231(ssa.OpPermute2Uint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.Permute2", opLen3_231(ssa.OpPermute2Float64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x4.Permute2", opLen3_231(ssa.OpPermute2Int64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x4.Permute2", opLen3_231(ssa.OpPermute2Uint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.Permute2", opLen3_231(ssa.OpPermute2Float64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Permute2", opLen3_231(ssa.OpPermute2Int64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.Permute2", opLen3_231(ssa.OpPermute2Uint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.PermuteConstant", opLen1Imm8(ssa.OpPermuteConstantUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint32x8.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint32x16.PermuteConstantGrouped", opLen1Imm8(ssa.OpPermuteConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.PermuteConstantHi", opLen1Imm8(ssa.OpPermuteConstantHiUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.PermuteConstantHiGrouped", opLen1Imm8(ssa.OpPermuteConstantHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.PermuteConstantLo", opLen1Imm8(ssa.OpPermuteConstantLoUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.PermuteConstantLoGrouped", opLen1Imm8(ssa.OpPermuteConstantLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x32.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.PermuteGrouped", opLen2(ssa.OpPermuteGroupedUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.Reciprocal", opLen1(ssa.OpReciprocalFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.Reciprocal", opLen1(ssa.OpReciprocalFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x16.Reciprocal", opLen1(ssa.OpReciprocalFloat32x16, types.TypeVec512), sys.AMD64)
@ -1300,6 +1280,24 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64) addF(simdPackage, "Uint32x16.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Uint64x4.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, 0), sys.AMD64) addF(simdPackage, "Uint64x8.concatSelectedConstantGrouped", opLen2Imm8(ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.permuteScalars", opLen1Imm8(ssa.OppermuteScalarsUint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedInt32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint32x8.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint32x16.permuteScalarsGrouped", opLen1Imm8(ssa.OppermuteScalarsGroupedUint32x16, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.permuteScalarsHi", opLen1Imm8(ssa.OppermuteScalarsHiUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.permuteScalarsHiGrouped", opLen1Imm8(ssa.OppermuteScalarsHiGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoInt16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint16x8.permuteScalarsLo", opLen1Imm8(ssa.OppermuteScalarsLoUint16x8, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedInt16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Uint16x16.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x16, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Uint16x32.permuteScalarsLoGrouped", opLen1Imm8(ssa.OppermuteScalarsLoGroupedUint16x32, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Int32x4.tern", opLen3Imm8(ssa.OpternInt32x4, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Int32x4.tern", opLen3Imm8(ssa.OpternInt32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x8.tern", opLen3Imm8(ssa.OpternInt32x8, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Int32x8.tern", opLen3Imm8(ssa.OpternInt32x8, types.TypeVec256, 0), sys.AMD64)
addF(simdPackage, "Int32x16.tern", opLen3Imm8(ssa.OpternInt32x16, types.TypeVec512, 0), sys.AMD64) addF(simdPackage, "Int32x16.tern", opLen3Imm8(ssa.OpternInt32x16, types.TypeVec512, 0), sys.AMD64)

View file

@ -46,6 +46,9 @@ func writeSIMDGenericOps(ops []Operation) *bytes.Buffer {
if op.NoGenericOps != nil && *op.NoGenericOps == "true" { if op.NoGenericOps != nil && *op.NoGenericOps == "true" {
continue continue
} }
if op.SkipMaskedMethod() {
continue
}
_, _, _, immType, gOp := op.shape() _, _, _, immType, gOp := op.shape()
gOpData := genericOpsData{gOp.GenericName(), len(gOp.In), op.Commutative} gOpData := genericOpsData{gOp.GenericName(), len(gOp.In), op.Commutative}
if immType == VarImm || immType == ConstVarImm { if immType == VarImm || immType == ConstVarImm {

View file

@ -107,6 +107,9 @@ func writeSIMDIntrinsics(ops []Operation, typeMap simdTypeMap) *bytes.Buffer {
if op.NoTypes != nil && *op.NoTypes == "true" { if op.NoTypes != nil && *op.NoTypes == "true" {
continue continue
} }
if op.SkipMaskedMethod() {
continue
}
if s, op, err := classifyOp(op); err == nil { if s, op, err := classifyOp(op); err == nil {
if err := t.ExecuteTemplate(buffer, s, op); err != nil { if err := t.ExecuteTemplate(buffer, s, op); err != nil {
panic(fmt.Errorf("failed to execute template %s for op %s: %w", s, op.Go, err)) panic(fmt.Errorf("failed to execute template %s for op %s: %w", s, op.Go, err))

View file

@ -604,6 +604,9 @@ func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) (f, fI *bytes.Buffer)
if op.NoTypes != nil && *op.NoTypes == "true" { if op.NoTypes != nil && *op.NoTypes == "true" {
continue continue
} }
if op.SkipMaskedMethod() {
continue
}
idxVecAsScalar, err := checkVecAsScalar(op) idxVecAsScalar, err := checkVecAsScalar(op)
if err != nil { if err != nil {
panic(err) panic(err)

View file

@ -345,7 +345,8 @@ func writeSIMDRules(ops []Operation) *bytes.Buffer {
data.ArgsOut = "..." data.ArgsOut = "..."
} }
data.tplName = tplName data.tplName = tplName
if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" { if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" ||
opr.SkipMaskedMethod() {
optData = append(optData, data) optData = append(optData, data)
continue continue
} }

View file

@ -73,6 +73,29 @@ type rawOperation struct {
NoGenericOps *string NoGenericOps *string
// If non-nil, this string will be attached to the machine ssa op name. E.g. "const" // If non-nil, this string will be attached to the machine ssa op name. E.g. "const"
SSAVariant *string SSAVariant *string
// If true, do not emit method declarations, generic ops, or intrinsics for masked variants
// DO emit the architecture-specific opcodes and optimizations.
HideMaskMethods *bool
}
func (o *Operation) IsMasked() bool {
if len(o.InVariant) == 0 {
return false
}
if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" {
return true
}
panic(fmt.Errorf("unknown inVariant"))
}
func (o *Operation) SkipMaskedMethod() bool {
if o.HideMaskMethods == nil {
return false
}
if *o.HideMaskMethods && o.IsMasked() {
return true
}
return false
} }
func (o *Operation) DecodeUnified(v *unify.Value) error { func (o *Operation) DecodeUnified(v *unify.Value) error {
@ -80,14 +103,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {
return err return err
} }
isMasked := false isMasked := o.IsMasked()
if len(o.InVariant) == 0 {
// No variant
} else if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" {
isMasked = true
} else {
return fmt.Errorf("unknown inVariant")
}
// Compute full Go method name. // Compute full Go method name.
o.Go = o.rawOperation.Go o.Go = o.rawOperation.Go
@ -104,6 +120,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error {
o.Documentation = regexp.MustCompile(`\bNAME\b`).ReplaceAllString(o.Documentation, o.Go) o.Documentation = regexp.MustCompile(`\bNAME\b`).ReplaceAllString(o.Documentation, o.Go)
if isMasked { if isMasked {
o.Documentation += "\n//\n// This operation is applied selectively under a write mask." o.Documentation += "\n//\n// This operation is applied selectively under a write mask."
// Suppress generic op and method declaration for exported methods, if a mask is present.
if unicode.IsUpper([]rune(o.Go)[0]) { if unicode.IsUpper([]rune(o.Go)[0]) {
trueVal := "true" trueVal := "true"
o.NoGenericOps = &trueVal o.NoGenericOps = &trueVal

View file

@ -27,18 +27,22 @@
constImm: 1 constImm: 1
documentation: !string |- documentation: !string |-
// NAME returns the upper half of x. // NAME returns the upper half of x.
- go: PermuteOrZero
commutative: false
documentation: !string |-
// NAME performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
- go: Permute - go: Permute
commutative: false commutative: false
documentation: !string |- documentation: !string |-
// NAME performs a full permutation of vector x using indices: // NAME performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. - go: ConcatPermute # ConcatPermute is only available on or after AVX512
- go: Permute2 # Permute2 is only available on or after AVX512
commutative: false commutative: false
documentation: !string |- documentation: !string |-
// NAME performs a full permutation of vector x, y using indices: // NAME performs a full permutation of vector x, y using indices:
// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]} // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
// where xy is x appending y. // where xy is the concatenation of x (lower half) and y (upper half).
// Only the needed bits to represent xy's index are used in indices' elements. // Only the needed bits to represent xy's index are used in indices' elements.
- go: Compress - go: Compress
commutative: false commutative: false
@ -74,31 +78,35 @@
documentation: !string |- documentation: !string |-
// NAME copies element zero of its (128-bit) input to all elements of // NAME copies element zero of its (128-bit) input to all elements of
// the 512-bit output vector. // the 512-bit output vector.
- go: PermuteOrZeroGrouped
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using indices:
- go: PermuteGrouped - go: PermuteGrouped
commutative: false commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops. documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using indices: // NAME performs a grouped permutation of vector x using indices:
- go: PermuteConstant - go: permuteScalars
commutative: false commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops. documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a permutation of vector x using constant indices: // NAME performs a permutation of vector x using constant indices:
- go: PermuteConstantGrouped - go: permuteScalarsGrouped
commutative: false commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops. documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using constant indices: // NAME performs a grouped permutation of vector x using constant indices:
- go: PermuteConstantLo - go: permuteScalarsLo
commutative: false commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops. documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a permutation of vector x using constant indices: // NAME performs a permutation of vector x using constant indices:
- go: PermuteConstantLoGrouped - go: permuteScalarsLoGrouped
commutative: false commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops. documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using constant indices: // NAME performs a grouped permutation of vector x using constant indices:
- go: PermuteConstantHi - go: permuteScalarsHi
commutative: false commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops. documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a permutation of vector x using constant indices: // NAME performs a permutation of vector x using constant indices:
- go: PermuteConstantHiGrouped - go: permuteScalarsHiGrouped
commutative: false commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops. documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using constant indices: // NAME performs a grouped permutation of vector x using constant indices:
@ -218,8 +226,10 @@
- go: Select128FromPair - go: Select128FromPair
commutative: false commutative: false
documentation: !string |- documentation: !string |-
// NAME selects the low and high 128-bit halves from the 128-bit halves // NAME treats the 256-bit vectors x and y as a single vector of four
// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. // 128-bit elements, and returns a 256-bit result formed by
// concatenating the two elements specified by lo and hi.
// For example, {4,5}.NAME(3,0,{6,7}) returns {7,4}.
- go: ConcatShiftBytesRight - go: ConcatShiftBytesRight
commutative: false commutative: false

View file

@ -213,19 +213,75 @@
- *f64xN - *f64xN
- go: Permute - go: Permute
asm: "VPERM[BWDQ]|VPERMP[SD]" asm: "VPERMQ|VPERMPD"
addDoc: !string |-
// The low 2 bits (values 0-3) of each element of indices is used
operandOrder: "21Type1" operandOrder: "21Type1"
in: in:
- &anyindices - &anyindices
go: $t go: $t
name: indices name: indices
overwriteBase: uint overwriteBase: uint
- &any4
go: $t
lanes: 4
out:
- &any - &any
go: $t go: $t
- go: Permute
asm: "VPERM[WDQ]|VPERMP[SD]"
addDoc: !string |-
// The low 3 bits (values 0-7) of each element of indices is used
operandOrder: "21Type1"
in:
- *anyindices
- &any8
go: $t
lanes: 8
out: out:
- *any - *any
- go: Permute2 - go: Permute
asm: "VPERM[BWD]|VPERMPS"
addDoc: !string |-
// The low 4 bits (values 0-15) of each element of indices is used
operandOrder: "21Type1"
in:
- *anyindices
- &any16
go: $t
lanes: 16
out:
- *any
- go: Permute
asm: "VPERM[BW]"
addDoc: !string |-
// The low 5 bits (values 0-31) of each element of indices is used
operandOrder: "21Type1"
in:
- *anyindices
- &any32
go: $t
lanes: 32
out:
- *any
- go: Permute
asm: "VPERMB"
addDoc: !string |-
// The low 6 bits (values 0-63) of each element of indices is used
operandOrder: "21Type1"
in:
- *anyindices
- &any64
go: $t
lanes: 64
out:
- *any
- go: ConcatPermute
asm: "VPERMI2[BWDQ]|VPERMI2P[SD]" asm: "VPERMI2[BWDQ]|VPERMI2P[SD]"
# Because we are overwriting the receiver's type, we # Because we are overwriting the receiver's type, we
# have to move the receiver to be a parameter so that # have to move the receiver to be a parameter so that
@ -403,113 +459,137 @@
base: $b base: $b
# VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX) # VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX)
- go: Permute - go: PermuteOrZero
asm: VPSHUFB asm: VPSHUFB
addDoc: !string |- addDoc: !string |-
// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. // The lower four bits of each byte-sized index in indices select an element from x,
// unless the index's sign bit is set in which case zero is used instead.
in: in:
- &128any - &128any
bits: 128 bits: 128
go: $t go: $t
- bits: 128 - bits: 128
go: $t
name: indices name: indices
base: int # always signed
out: out:
- *128any - *128any
- go: PermuteGrouped
- go: PermuteOrZeroGrouped
asm: VPSHUFB asm: VPSHUFB
addDoc: !string |- addDoc: !string |-
// result := {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...} // result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
// Only the needed bits to represent the index of a group of x are used in indices' elements. // The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
// However when the top bit is set, the low bits will be disregard and the respective element in the result vector will be zeroed. // unless the index's sign bit is set in which case zero is used instead.
// Each group is of size 128-bit. // Each group is of size 128-bit.
in: in:
- &256Or512any - &256Or512any
bits: "256|512" bits: "256|512"
go: $t go: $t
- bits: "256|512" - bits: "256|512"
go: $t base: int
name: indices name: indices
out: out:
- *256Or512any - *256Or512any
- go: PermuteConstant - go: permuteScalars
asm: VPSHUFD asm: VPSHUFD
addDoc: !string |- addDoc: !string |-
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
in: in:
- *128any - *128any
- class: immediate - class: immediate
immOffset: 0 immOffset: 0
name: indices name: indices
hideMaskMethods: true
out: out:
- *128any - *128any
- go: PermuteConstantGrouped
- go: permuteScalarsGrouped
asm: VPSHUFD asm: VPSHUFD
addDoc: !string |- addDoc: !string |-
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...} // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit. // Each group is of size 128-bit.
in: in:
- *256Or512any - *256Or512any
- class: immediate - class: immediate
immOffset: 0 immOffset: 0
name: indices name: indices
hideMaskMethods: true
out: out:
- *256Or512any - *256Or512any
- go: PermuteConstantLo - go: permuteScalarsLo
asm: VPSHUFHW asm: VPSHUFLW
addDoc: !string |- addDoc: !string |-
// result := {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]} // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
in: in:
- *128any - &128lanes8
bits: 128
go: $t
elemBits: 16
- class: immediate - class: immediate
immOffset: 0 immOffset: 0
name: indices name: indices
hideMaskMethods: true
out: out:
- *128any - *128lanes8
- go: PermuteConstantLoGrouped
asm: VPSHUFHW
addDoc: !string |-
// result := {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit.
in:
- *256Or512any
- class: immediate
immOffset: 0
name: indices
out:
- *256Or512any
- go: PermuteConstantHi - go: permuteScalarsLoGrouped
asm: VPSHUFHW asm: VPSHUFLW
addDoc: !string |- addDoc: !string |-
// result := {x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]} //
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index. // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
in: // x_group1[indices[0:2]], ...}
- *128any //
- class: immediate // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
immOffset: 0
name: indices
out:
- *128any
- go: PermuteConstantHiGrouped
asm: VPSHUFHW
addDoc: !string |-
// result := {x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4], x_group1[indices[0:2]+4], ...}
// Here indices are word-size unsigned index value packed together, e.g. indices[0:2] is the first index.
// Each group is of size 128-bit. // Each group is of size 128-bit.
in: in:
- *256Or512any - &256Or512lanes8
bits: "256|512"
go: $t
elemBits: 16
- class: immediate - class: immediate
immOffset: 0 immOffset: 0
name: indices name: indices
hideMaskMethods: true
out: out:
- *256Or512any - *256Or512lanes8
- go: permuteScalarsHi
asm: VPSHUFHW
addDoc: !string |-
// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
in:
- *128lanes8
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *128lanes8
- go: permuteScalarsHiGrouped
asm: VPSHUFHW
addDoc: !string |-
// result =
//
// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
in:
- *256Or512lanes8
- class: immediate
immOffset: 0
name: indices
hideMaskMethods: true
out:
- *256Or512lanes8
- go: InterleaveHi - go: InterleaveHi
asm: VPUNPCKH(QDQ|DQ|WD|WB) asm: VPUNPCKH(QDQ|DQ|WD|WB)

View file

@ -163,7 +163,20 @@ func TestPermute(t *testing.T) {
} }
} }
func TestPermute2(t *testing.T) { func TestPermuteOrZero(t *testing.T) {
x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11}
want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12}
got := make([]uint8, len(x))
simd.LoadUint8x16Slice(x).PermuteOrZero(simd.LoadInt8x16Slice(indices)).StoreSlice(got)
for i := range 8 {
if want[i] != got[i] {
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
}
}
}
func TestConcatPermute(t *testing.T) {
if !simd.X86.AVX512() { if !simd.X86.AVX512() {
t.Skip("Test requires X86.AVX512, not available on this hardware") t.Skip("Test requires X86.AVX512, not available on this hardware")
return return
@ -173,7 +186,7 @@ func TestPermute2(t *testing.T) {
indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0} indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
want := []int64{-8, 7, -6, 5, -4, 3, -2, 1} want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
got := make([]int64, 8) got := make([]int64, 8)
simd.LoadInt64x8Slice(x).Permute2(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got) simd.LoadInt64x8Slice(x).ConcatPermute(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got)
for i := range 8 { for i := range 8 {
if want[i] != got[i] { if want[i] != got[i] {
t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i]) t.Errorf("want and got differ at index %d, want=%d, got=%d", i, want[i], got[i])
@ -1161,3 +1174,75 @@ func TestDotProductQuadruple(t *testing.T) {
} }
} }
} }
// TestPermuteScalars verifies a rotate-left-by-one permutation of the four
// elements of an Int32x4 using constant index arguments.
func TestPermuteScalars(t *testing.T) {
	input := []int32{11, 12, 13, 14}
	expected := []int32{12, 13, 14, 11}
	result := make([]int32, 4)
	simd.LoadInt32x4Slice(input).PermuteScalars(1, 2, 3, 0).StoreSlice(result)
	for i, w := range expected {
		if w != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, w, result[i])
		}
	}
}
// TestPermuteScalarsGrouped verifies that the same rotate-by-one permutation
// is applied independently to each 128-bit group of an Int32x8.
func TestPermuteScalarsGrouped(t *testing.T) {
	input := []int32{11, 12, 13, 14, 21, 22, 23, 24}
	expected := []int32{12, 13, 14, 11, 22, 23, 24, 21}
	result := make([]int32, 8)
	simd.LoadInt32x8Slice(input).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(result)
	for i, w := range expected {
		if w != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, w, result[i])
		}
	}
}
// TestPermuteScalarsHi verifies that only the upper four int16 elements are
// permuted while the lower four pass through unchanged.
func TestPermuteScalarsHi(t *testing.T) {
	input := []int16{-1, -2, -3, -4, 11, 12, 13, 14}
	expected := []int16{-1, -2, -3, -4, 12, 13, 14, 11}
	result := make([]int16, len(input))
	simd.LoadInt16x8Slice(input).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(result)
	for i, w := range expected {
		if w != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, w, result[i])
		}
	}
}
// TestPermuteScalarsLo verifies that only the lower four int16 elements are
// permuted while the upper four pass through unchanged.
func TestPermuteScalarsLo(t *testing.T) {
	input := []int16{11, 12, 13, 14, 4, 5, 6, 7}
	expected := []int16{12, 13, 14, 11, 4, 5, 6, 7}
	result := make([]int16, len(input))
	simd.LoadInt16x8Slice(input).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(result)
	for i, w := range expected {
		if w != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, w, result[i])
		}
	}
}
// TestPermuteScalarsHiGrouped verifies the upper-half permutation applied
// independently to each 128-bit group of an Int16x16; lower halves of both
// groups are unchanged.
func TestPermuteScalarsHiGrouped(t *testing.T) {
	input := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114}
	expected := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111}
	result := make([]int16, len(input))
	simd.LoadInt16x16Slice(input).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(result)
	for i, w := range expected {
		if w != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, w, result[i])
		}
	}
}
// TestPermuteScalarsLoGrouped verifies the lower-half permutation applied
// independently to each 128-bit group of an Int16x16; upper halves of both
// groups are unchanged.
func TestPermuteScalarsLoGrouped(t *testing.T) {
	input := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17}
	expected := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17}
	result := make([]int16, len(input))
	simd.LoadInt16x16Slice(input).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(result)
	for i, w := range expected {
		if w != result[i] {
			t.Errorf("want and got differ at index %d, want=%d, got=%d", i, w, result[i])
		}
	}
}

File diff suppressed because it is too large Load diff

View file

@ -338,6 +338,220 @@ func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x
// Asm: VSHUFPD, CPU Feature: AVX512 // Asm: VSHUFPD, CPU Feature: AVX512
func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8 func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8
/* permuteScalars */
// permuteScalars performs a permutation of vector x using constant indices:
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// This unexported packed-immediate form backs the exported four-index PermuteScalars method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Int32x4) permuteScalars(indices uint8) Int32x4
// permuteScalars performs a permutation of vector x using constant indices:
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// This unexported packed-immediate form backs the exported four-index PermuteScalars method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Uint32x4) permuteScalars(indices uint8) Uint32x4
/* permuteScalarsGrouped */
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsGrouped method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsGrouped method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsGrouped method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8
// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsGrouped method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16
/* permuteScalarsHi */
// permuteScalarsHi performs a permutation of vector x using constant indices:
// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsHi method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8
// permuteScalarsHi performs a permutation of vector x using constant indices:
// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsHi method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8
/* permuteScalarsHiGrouped */
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
//	{x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
//	 x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsHiGrouped method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
//	{x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
//	 x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsHiGrouped method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
//	{x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
//	 x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsHiGrouped method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16
// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
// result =
//
//	{x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
//	 x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsHiGrouped method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32
/* permuteScalarsLo */
// permuteScalarsLo performs a permutation of vector x using constant indices:
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsLo method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8
// permuteScalarsLo performs a permutation of vector x using constant indices:
// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsLo method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8
/* permuteScalarsLoGrouped */
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
//	          x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsLoGrouped method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
//	          x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsLoGrouped method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
//	          x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsLoGrouped method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16
// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
//
//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
//	          x_group1[indices[0:2]], ...}
//
// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
// Each group is of size 128-bit.
// This unexported packed-immediate form backs the exported four-index PermuteScalarsLoGrouped method.
//
// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32
/* tern */ /* tern */
// tern performs a logical operation on three vectors based on the 8-bit truth table. // tern performs a logical operation on three vectors based on the 8-bit truth table.

View file

@ -989,3 +989,280 @@ func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 {
} }
panic("missing case, switch should be exhaustive") panic("missing case, switch should be exhaustive")
} }
/* PermuteScalars */

// PermuteScalars rearranges the four elements of x according to a, b, c, d:
//
//	result = {x[a], x[b], x[c], x[d]}
//
// Only the low two bits of each of a, b, c, and d are used, so each index
// is effectively in the range 0-3. Constant index arguments compile to a
// single instruction; non-constant arguments may be lowered to a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Int32x4) PermuteScalars(a, b, c, d uint8) Int32x4 {
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalars(imm)
}

// PermuteScalars rearranges the four elements of x according to a, b, c, d:
//
//	result = {x[a], x[b], x[c], x[d]}
//
// Only the low two bits of each of a, b, c, and d are used, so each index
// is effectively in the range 0-3. Constant index arguments compile to a
// single instruction; non-constant arguments may be lowered to a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX
func (x Uint32x4) PermuteScalars(a, b, c, d uint8) Uint32x4 {
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalars(imm)
}
/* PermuteScalarsGrouped */

// PermuteScalarsGrouped applies the same four-element permutation to each
// 128-bit group of x:
//
//	result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Only the low two bits of each of a, b, c, and d are used (values 0-3).
// Constant index arguments compile to a single instruction; non-constant
// arguments may be lowered to a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Int32x8) PermuteScalarsGrouped(a, b, c, d uint8) Int32x8 {
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsGrouped(imm)
}

// PermuteScalarsGrouped applies the same four-element permutation to each
// 128-bit group of x:
//
//	result =
//	{ x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
//	  x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Only the low two bits of each of a, b, c, and d are used (values 0-3).
// Constant index arguments compile to a single instruction; non-constant
// arguments may be lowered to a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Int32x16) PermuteScalarsGrouped(a, b, c, d uint8) Int32x16 {
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsGrouped(imm)
}

// PermuteScalarsGrouped applies the same four-element permutation to each
// 128-bit group of x:
//
//	result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Only the low two bits of each of a, b, c, and d are used (values 0-3).
// Constant index arguments compile to a single instruction; non-constant
// arguments may be lowered to a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX2
func (x Uint32x8) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x8 {
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsGrouped(imm)
}

// PermuteScalarsGrouped applies the same four-element permutation to each
// 128-bit group of x:
//
//	result =
//	{ x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
//	  x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Only the low two bits of each of a, b, c, and d are used (values 0-3).
// Constant index arguments compile to a single instruction; non-constant
// arguments may be lowered to a jump table.
//
// Asm: VPSHUFD, CPU Feature: AVX512
func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 {
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsGrouped(imm)
}
/* PermuteScalarsHi */

// PermuteScalarsHi permutes the upper four elements of x while passing the
// lower four through unchanged:
//
//	result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Only the low two bits of each of a, b, c, and d are used (values 0-3).
// Constant index arguments compile to a single instruction; non-constant
// arguments may be lowered to a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 {
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsHi(imm)
}

// PermuteScalarsHi permutes the upper four elements of x while passing the
// lower four through unchanged:
//
//	result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
//
// Only the low two bits of each of a, b, c, and d are used (values 0-3).
// Constant index arguments compile to a single instruction; non-constant
// arguments may be lowered to a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x8) PermuteScalarsHi(a, b, c, d uint8) Uint16x8 {
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsHi(imm)
}
/* PermuteScalarsHiGrouped */

// PermuteScalarsHiGrouped permutes the upper four elements of each 128-bit
// group of x while passing the lower four of each group through unchanged:
//
//	result =
//	{x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
//	 x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Only the low two bits of each of a, b, c, and d are used (values 0-3).
// Constant index arguments compile to a single instruction; non-constant
// arguments may be lowered to a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Int16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x16 {
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsHiGrouped(imm)
}

// PermuteScalarsHiGrouped permutes the upper four elements of each 128-bit
// group of x while passing the lower four of each group through unchanged:
//
//	result =
//	{x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
//	 x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
//	 x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
//	 x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
//
// Only the low two bits of each of a, b, c, and d are used (values 0-3).
// Constant index arguments compile to a single instruction; non-constant
// arguments may be lowered to a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Int16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x32 {
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsHiGrouped(imm)
}

// PermuteScalarsHiGrouped permutes the upper four elements of each 128-bit
// group of x while passing the lower four of each group through unchanged:
//
//	result =
//	{x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
//	 x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
//
// Only the low two bits of each of a, b, c, and d are used (values 0-3).
// Constant index arguments compile to a single instruction; non-constant
// arguments may be lowered to a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX2
func (x Uint16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x16 {
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsHiGrouped(imm)
}

// PermuteScalarsHiGrouped permutes the upper four elements of each 128-bit
// group of x while passing the lower four of each group through unchanged:
//
//	result =
//	{x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
//	 x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
//	 x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
//	 x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
//
// Only the low two bits of each of a, b, c, and d are used (values 0-3).
// Constant index arguments compile to a single instruction; non-constant
// arguments may be lowered to a jump table.
//
// Asm: VPSHUFHW, CPU Feature: AVX512
func (x Uint16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x32 {
	imm := a&3 | (b&3)<<2 | (c&3)<<4 | d<<6
	return x.permuteScalarsHiGrouped(imm)
}
/* PermuteScalarsLo */
// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
//
// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x8) PermuteScalarsLo(a, b, c, d uint8) Int16x8 {
return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
}
// PermuteScalarsLo permutes the low four 16-bit scalars of x, leaving the
// high four scalars unchanged:
//
//	result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x8) PermuteScalarsLo(a, b, c, d uint8) Uint16x8 {
	// Assemble the immediate byte from the four 2-bit selectors; d needs
	// no mask since uint8 left-shift by 6 drops its upper bits.
	imm := a & 3
	imm |= (b & 3) << 2
	imm |= (c & 3) << 4
	imm |= d << 6
	return x.permuteScalarsLo(imm)
}
/* PermuteScalarsLoGrouped */
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
// result =
//	{x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
//
// Each group is of size 128-bit.
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Int16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x16 {
	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
}
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
// result =
//	{x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
//	x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
//	x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
//
// Each group is of size 128-bit.
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Int16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x32 {
	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
}
// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
//
// result =
//	{x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
//
// Each group is of size 128-bit.
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX2
func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 {
	return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
}
// PermuteScalarsLoGrouped permutes the low four 16-bit scalars within each
// 128-bit group of x, leaving the high four scalars of each group unchanged:
//
//	result =
//	{x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
//	 x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
//	 x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
//	 x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
//
// Each group is of size 128-bit.
//
// Parameters a,b,c,d should have values between 0 and 3.
// If a through d are constants, then an instruction will be inlined, otherwise
// a jump table is generated.
//
// Asm: VPSHUFLW, CPU Feature: AVX512
func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
	// Fold the four 2-bit selectors into one immediate byte; the shift by 6
	// masks d implicitly (uint8 overflow drops the high bits).
	imm := a & 3
	imm |= (b & 3) << 2
	imm |= (c & 3) << 4
	imm |= d << 6
	return x.permuteScalarsLoGrouped(imm)
}