[dev.simd] cmd/compile: reorder stubs

This CL is generated by CL 682035.

Change-Id: I0a8b7382470afb5a6571ab7d4abe038de0ff239e
Reviewed-on: https://go-review.googlesource.com/c/go/+/682055
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Auto-Submit: Junyang Shao <shaojunyang@google.com>
2025-12-08 06:10:04 +00:00 · 2025-06-16 20:11:27 +00:00 · 2025-06-16 20:11:27 +00:00 · ee1d9f3f85
commit ee1d9f3f85
parent 6c50c8b892
8 changed files with 8944 additions and 8986 deletions
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@ -194,17 +194,17 @@
 (EqualFloat64x4 x y) => (VCMPPD256 [0] x y)
 (EqualFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [0] x y))
 (EqualInt16x16 ...) => (VPCMPEQW256 ...)
-(EqualInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPEQW512 x y))
+(EqualInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPW512 [0] x y))
 (EqualInt16x8 ...) => (VPCMPEQW128 ...)
-(EqualInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPEQD512 x y))
+(EqualInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPD512 [0] x y))
 (EqualInt32x4 ...) => (VPCMPEQD128 ...)
 (EqualInt32x8 ...) => (VPCMPEQD256 ...)
 (EqualInt64x2 ...) => (VPCMPEQQ128 ...)
 (EqualInt64x4 ...) => (VPCMPEQQ256 ...)
-(EqualInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
+(EqualInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPQ512 [0] x y))
 (EqualInt8x16 ...) => (VPCMPEQB128 ...)
 (EqualInt8x32 ...) => (VPCMPEQB256 ...)
-(EqualInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPEQB512 x y))
+(EqualInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPB512 [0] x y))
 (EqualUint16x16 x y) => (VPMOVMToVec16x16 (VPCMPUW256 [0] x y))
 (EqualUint16x32 x y) => (VPMOVMToVec16x32 (VPCMPUW512 [0] x y))
 (EqualUint16x8 x y) => (VPMOVMToVec16x8 (VPCMPUW128 [0] x y))
@ -348,17 +348,17 @@
 (GreaterFloat64x4 x y) => (VCMPPD256 [6] x y)
 (GreaterFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [6] x y))
 (GreaterInt16x16 ...) => (VPCMPGTW256 ...)
-(GreaterInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPGTW512 x y))
+(GreaterInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPW512 [6] x y))
 (GreaterInt16x8 ...) => (VPCMPGTW128 ...)
-(GreaterInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPGTD512 x y))
+(GreaterInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPD512 [6] x y))
 (GreaterInt32x4 ...) => (VPCMPGTD128 ...)
 (GreaterInt32x8 ...) => (VPCMPGTD256 ...)
-(GreaterInt64x2 x y) => (VPMOVMToVec64x2 (VPCMPGTQ128 x y))
+(GreaterInt64x2 x y) => (VPMOVMToVec64x2 (VPCMPQ128 [6] x y))
 (GreaterInt64x4 ...) => (VPCMPGTQ256 ...)
-(GreaterInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPGTQ512 x y))
+(GreaterInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPQ512 [6] x y))
 (GreaterInt8x16 ...) => (VPCMPGTB128 ...)
 (GreaterInt8x32 ...) => (VPCMPGTB256 ...)
-(GreaterInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPGTB512 x y))
+(GreaterInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPB512 [6] x y))
 (GreaterUint16x16 x y) => (VPMOVMToVec16x16 (VPCMPUW256 [6] x y))
 (GreaterUint16x32 x y) => (VPMOVMToVec16x32 (VPCMPUW512 [6] x y))
 (GreaterUint16x8 x y) => (VPMOVMToVec16x8 (VPCMPUW128 [6] x y))
@ -635,18 +635,18 @@
 (MaskedEqualFloat64x2 x y mask) => (VPMOVMToVec64x2 (VCMPPDMasked128 [0] x y (VPMOVVec64x2ToM <types.TypeMask> mask)))
 (MaskedEqualFloat64x4 x y mask) => (VPMOVMToVec64x4 (VCMPPDMasked256 [0] x y (VPMOVVec64x4ToM <types.TypeMask> mask)))
 (MaskedEqualFloat64x8 x y mask) => (VPMOVMToVec64x8 (VCMPPDMasked512 [0] x y (VPMOVVec64x8ToM <types.TypeMask> mask)))
-(MaskedEqualInt16x16 x y mask) => (VPMOVMToVec16x16 (VPCMPEQWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask)))
+(MaskedEqualInt16x16 x y mask) => (VPMOVMToVec16x16 (VPCMPWMasked256 [0] x y (VPMOVVec16x16ToM <types.TypeMask> mask)))
-(MaskedEqualInt16x32 x y mask) => (VPMOVMToVec16x32 (VPCMPEQWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask)))
+(MaskedEqualInt16x32 x y mask) => (VPMOVMToVec16x32 (VPCMPWMasked512 [0] x y (VPMOVVec16x32ToM <types.TypeMask> mask)))
-(MaskedEqualInt16x8 x y mask) => (VPMOVMToVec16x8 (VPCMPEQWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask)))
+(MaskedEqualInt16x8 x y mask) => (VPMOVMToVec16x8 (VPCMPWMasked128 [0] x y (VPMOVVec16x8ToM <types.TypeMask> mask)))
-(MaskedEqualInt32x16 x y mask) => (VPMOVMToVec32x16 (VPCMPEQDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask)))
+(MaskedEqualInt32x16 x y mask) => (VPMOVMToVec32x16 (VPCMPDMasked512 [0] x y (VPMOVVec32x16ToM <types.TypeMask> mask)))
-(MaskedEqualInt32x4 x y mask) => (VPMOVMToVec32x4 (VPCMPEQDMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask)))
+(MaskedEqualInt32x4 x y mask) => (VPMOVMToVec32x4 (VPCMPDMasked128 [0] x y (VPMOVVec32x4ToM <types.TypeMask> mask)))
-(MaskedEqualInt32x8 x y mask) => (VPMOVMToVec32x8 (VPCMPEQDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask)))
+(MaskedEqualInt32x8 x y mask) => (VPMOVMToVec32x8 (VPCMPDMasked256 [0] x y (VPMOVVec32x8ToM <types.TypeMask> mask)))
-(MaskedEqualInt64x2 x y mask) => (VPMOVMToVec64x2 (VPCMPEQQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask)))
+(MaskedEqualInt64x2 x y mask) => (VPMOVMToVec64x2 (VPCMPQMasked128 [0] x y (VPMOVVec64x2ToM <types.TypeMask> mask)))
-(MaskedEqualInt64x4 x y mask) => (VPMOVMToVec64x4 (VPCMPEQQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask)))
+(MaskedEqualInt64x4 x y mask) => (VPMOVMToVec64x4 (VPCMPQMasked256 [0] x y (VPMOVVec64x4ToM <types.TypeMask> mask)))
-(MaskedEqualInt64x8 x y mask) => (VPMOVMToVec64x8 (VPCMPEQQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask)))
+(MaskedEqualInt64x8 x y mask) => (VPMOVMToVec64x8 (VPCMPQMasked512 [0] x y (VPMOVVec64x8ToM <types.TypeMask> mask)))
-(MaskedEqualInt8x16 x y mask) => (VPMOVMToVec8x16 (VPCMPEQBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask)))
+(MaskedEqualInt8x16 x y mask) => (VPMOVMToVec8x16 (VPCMPBMasked128 [0] x y (VPMOVVec8x16ToM <types.TypeMask> mask)))
-(MaskedEqualInt8x32 x y mask) => (VPMOVMToVec8x32 (VPCMPEQBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask)))
+(MaskedEqualInt8x32 x y mask) => (VPMOVMToVec8x32 (VPCMPBMasked256 [0] x y (VPMOVVec8x32ToM <types.TypeMask> mask)))
-(MaskedEqualInt8x64 x y mask) => (VPMOVMToVec8x64 (VPCMPEQBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask)))
+(MaskedEqualInt8x64 x y mask) => (VPMOVMToVec8x64 (VPCMPBMasked512 [0] x y (VPMOVVec8x64ToM <types.TypeMask> mask)))
 (MaskedEqualUint16x16 x y mask) => (VPMOVMToVec16x16 (VPCMPUWMasked256 [0] x y (VPMOVVec16x16ToM <types.TypeMask> mask)))
 (MaskedEqualUint16x32 x y mask) => (VPMOVMToVec16x32 (VPCMPUWMasked512 [0] x y (VPMOVVec16x32ToM <types.TypeMask> mask)))
 (MaskedEqualUint16x8 x y mask) => (VPMOVMToVec16x8 (VPCMPUWMasked128 [0] x y (VPMOVVec16x8ToM <types.TypeMask> mask)))
@ -785,18 +785,18 @@
 (MaskedGreaterFloat64x2 x y mask) => (VPMOVMToVec64x2 (VCMPPDMasked128 [6] x y (VPMOVVec64x2ToM <types.TypeMask> mask)))
 (MaskedGreaterFloat64x4 x y mask) => (VPMOVMToVec64x4 (VCMPPDMasked256 [6] x y (VPMOVVec64x4ToM <types.TypeMask> mask)))
 (MaskedGreaterFloat64x8 x y mask) => (VPMOVMToVec64x8 (VCMPPDMasked512 [6] x y (VPMOVVec64x8ToM <types.TypeMask> mask)))
-(MaskedGreaterInt16x16 x y mask) => (VPMOVMToVec16x16 (VPCMPGTWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask)))
+(MaskedGreaterInt16x16 x y mask) => (VPMOVMToVec16x16 (VPCMPWMasked256 [6] x y (VPMOVVec16x16ToM <types.TypeMask> mask)))
-(MaskedGreaterInt16x32 x y mask) => (VPMOVMToVec16x32 (VPCMPGTWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask)))
+(MaskedGreaterInt16x32 x y mask) => (VPMOVMToVec16x32 (VPCMPWMasked512 [6] x y (VPMOVVec16x32ToM <types.TypeMask> mask)))
-(MaskedGreaterInt16x8 x y mask) => (VPMOVMToVec16x8 (VPCMPGTWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask)))
+(MaskedGreaterInt16x8 x y mask) => (VPMOVMToVec16x8 (VPCMPWMasked128 [6] x y (VPMOVVec16x8ToM <types.TypeMask> mask)))
-(MaskedGreaterInt32x16 x y mask) => (VPMOVMToVec32x16 (VPCMPGTDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask)))
+(MaskedGreaterInt32x16 x y mask) => (VPMOVMToVec32x16 (VPCMPDMasked512 [6] x y (VPMOVVec32x16ToM <types.TypeMask> mask)))
-(MaskedGreaterInt32x4 x y mask) => (VPMOVMToVec32x4 (VPCMPGTDMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask)))
+(MaskedGreaterInt32x4 x y mask) => (VPMOVMToVec32x4 (VPCMPDMasked128 [6] x y (VPMOVVec32x4ToM <types.TypeMask> mask)))
-(MaskedGreaterInt32x8 x y mask) => (VPMOVMToVec32x8 (VPCMPGTDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask)))
+(MaskedGreaterInt32x8 x y mask) => (VPMOVMToVec32x8 (VPCMPDMasked256 [6] x y (VPMOVVec32x8ToM <types.TypeMask> mask)))
-(MaskedGreaterInt64x2 x y mask) => (VPMOVMToVec64x2 (VPCMPGTQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask)))
+(MaskedGreaterInt64x2 x y mask) => (VPMOVMToVec64x2 (VPCMPQMasked128 [6] x y (VPMOVVec64x2ToM <types.TypeMask> mask)))
-(MaskedGreaterInt64x4 x y mask) => (VPMOVMToVec64x4 (VPCMPGTQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask)))
+(MaskedGreaterInt64x4 x y mask) => (VPMOVMToVec64x4 (VPCMPQMasked256 [6] x y (VPMOVVec64x4ToM <types.TypeMask> mask)))
-(MaskedGreaterInt64x8 x y mask) => (VPMOVMToVec64x8 (VPCMPGTQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask)))
+(MaskedGreaterInt64x8 x y mask) => (VPMOVMToVec64x8 (VPCMPQMasked512 [6] x y (VPMOVVec64x8ToM <types.TypeMask> mask)))
-(MaskedGreaterInt8x16 x y mask) => (VPMOVMToVec8x16 (VPCMPGTBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask)))
+(MaskedGreaterInt8x16 x y mask) => (VPMOVMToVec8x16 (VPCMPBMasked128 [6] x y (VPMOVVec8x16ToM <types.TypeMask> mask)))
-(MaskedGreaterInt8x32 x y mask) => (VPMOVMToVec8x32 (VPCMPGTBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask)))
+(MaskedGreaterInt8x32 x y mask) => (VPMOVMToVec8x32 (VPCMPBMasked256 [6] x y (VPMOVVec8x32ToM <types.TypeMask> mask)))
-(MaskedGreaterInt8x64 x y mask) => (VPMOVMToVec8x64 (VPCMPGTBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask)))
+(MaskedGreaterInt8x64 x y mask) => (VPMOVMToVec8x64 (VPCMPBMasked512 [6] x y (VPMOVVec8x64ToM <types.TypeMask> mask)))
 (MaskedGreaterUint16x16 x y mask) => (VPMOVMToVec16x16 (VPCMPUWMasked256 [6] x y (VPMOVVec16x16ToM <types.TypeMask> mask)))
 (MaskedGreaterUint16x32 x y mask) => (VPMOVMToVec16x32 (VPCMPUWMasked512 [6] x y (VPMOVVec16x32ToM <types.TypeMask> mask)))
 (MaskedGreaterUint16x8 x y mask) => (VPMOVMToVec16x8 (VPCMPUWMasked128 [6] x y (VPMOVVec16x8ToM <types.TypeMask> mask)))
@ -1130,12 +1130,12 @@
 (MaskedSqrtFloat64x2 x mask) => (VSQRTPDMasked128 x (VPMOVVec64x2ToM <types.TypeMask> mask))
 (MaskedSqrtFloat64x4 x mask) => (VSQRTPDMasked256 x (VPMOVVec64x4ToM <types.TypeMask> mask))
 (MaskedSqrtFloat64x8 x mask) => (VSQRTPDMasked512 x (VPMOVVec64x8ToM <types.TypeMask> mask))
-(MaskedSubFloat32x16 x y mask) => (VADDPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+(MaskedSubFloat32x16 x y mask) => (VSUBPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
-(MaskedSubFloat32x4 x y mask) => (VADDPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
+(MaskedSubFloat32x4 x y mask) => (VSUBPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
-(MaskedSubFloat32x8 x y mask) => (VADDPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+(MaskedSubFloat32x8 x y mask) => (VSUBPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
-(MaskedSubFloat64x2 x y mask) => (VADDPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
+(MaskedSubFloat64x2 x y mask) => (VSUBPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
-(MaskedSubFloat64x4 x y mask) => (VADDPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+(MaskedSubFloat64x4 x y mask) => (VSUBPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
-(MaskedSubFloat64x8 x y mask) => (VADDPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(MaskedSubFloat64x8 x y mask) => (VSUBPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
 (MaskedSubInt16x16 x y mask) => (VPSUBWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
 (MaskedSubInt16x32 x y mask) => (VPSUBWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
 (MaskedSubInt16x8 x y mask) => (VPSUBWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
@ -1473,12 +1473,12 @@
 (SqrtFloat64x2 ...) => (VSQRTPD128 ...)
 (SqrtFloat64x4 ...) => (VSQRTPD256 ...)
 (SqrtFloat64x8 ...) => (VSQRTPD512 ...)
-(SubFloat32x16 ...) => (VADDPS512 ...)
+(SubFloat32x16 ...) => (VSUBPS512 ...)
-(SubFloat32x4 ...) => (VADDPS128 ...)
+(SubFloat32x4 ...) => (VSUBPS128 ...)
-(SubFloat32x8 ...) => (VADDPS256 ...)
+(SubFloat32x8 ...) => (VSUBPS256 ...)
-(SubFloat64x2 ...) => (VADDPD128 ...)
+(SubFloat64x2 ...) => (VSUBPD128 ...)
-(SubFloat64x4 ...) => (VADDPD256 ...)
+(SubFloat64x4 ...) => (VSUBPD256 ...)
-(SubFloat64x8 ...) => (VADDPD512 ...)
+(SubFloat64x8 ...) => (VSUBPD512 ...)
 (SubInt16x16 ...) => (VPSUBW256 ...)
 (SubInt16x32 ...) => (VPSUBW512 ...)
 (SubInt16x8 ...) => (VPSUBW128 ...)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@ -57,6 +57,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VSCALEFPSMasked512", argLength: 3, reg: fp2k1fp1, asm: "VSCALEFPS", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VORPSMasked512", argLength: 3, reg: fp2k1fp1, asm: "VORPS", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VSQRTPSMasked512", argLength: 2, reg: fp1k1fp1, asm: "VSQRTPS", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VSUBPSMasked512", argLength: 3, reg: fp2k1fp1, asm: "VSUBPS", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VXORPSMasked512", argLength: 3, reg: fp2k1fp1, asm: "VXORPS", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VMAXPS512", argLength: 2, reg: fp21, asm: "VMAXPS", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VMINPS512", argLength: 2, reg: fp21, asm: "VMINPS", commutative: true, typ: "Vec512", resultInArg0: false},
@ -64,6 +65,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VSCALEFPS512", argLength: 2, reg: fp21, asm: "VSCALEFPS", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VORPS512", argLength: 2, reg: fp21, asm: "VORPS", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VSQRTPS512", argLength: 1, reg: fp11, asm: "VSQRTPS", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VSUBPS512", argLength: 2, reg: fp21, asm: "VSUBPS", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VXORPS512", argLength: 2, reg: fp21, asm: "VXORPS", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VADDPS128", argLength: 2, reg: fp21, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VADDSUBPS128", argLength: 2, reg: fp21, asm: "VADDSUBPS", commutative: false, typ: "Vec128", resultInArg0: false},
@ -120,6 +122,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VSCALEFPSMasked128", argLength: 3, reg: fp2k1fp1, asm: "VSCALEFPS", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VORPSMasked128", argLength: 3, reg: fp2k1fp1, asm: "VORPS", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VSQRTPSMasked128", argLength: 2, reg: fp1k1fp1, asm: "VSQRTPS", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VSUBPSMasked128", argLength: 3, reg: fp2k1fp1, asm: "VSUBPS", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VXORPSMasked128", argLength: 3, reg: fp2k1fp1, asm: "VXORPS", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VMAXPS128", argLength: 2, reg: fp21, asm: "VMAXPS", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VMINPS128", argLength: 2, reg: fp21, asm: "VMINPS", commutative: true, typ: "Vec128", resultInArg0: false},
@ -129,6 +132,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VHADDPS128", argLength: 2, reg: fp21, asm: "VHADDPS", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VHSUBPS128", argLength: 2, reg: fp21, asm: "VHSUBPS", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VSQRTPS128", argLength: 1, reg: fp11, asm: "VSQRTPS", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VSUBPS128", argLength: 2, reg: fp21, asm: "VSUBPS", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VXORPS128", argLength: 2, reg: fp21, asm: "VXORPS", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VADDPS256", argLength: 2, reg: fp21, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VADDSUBPS256", argLength: 2, reg: fp21, asm: "VADDSUBPS", commutative: false, typ: "Vec256", resultInArg0: false},
@ -185,6 +189,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VSCALEFPSMasked256", argLength: 3, reg: fp2k1fp1, asm: "VSCALEFPS", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VORPSMasked256", argLength: 3, reg: fp2k1fp1, asm: "VORPS", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VSQRTPSMasked256", argLength: 2, reg: fp1k1fp1, asm: "VSQRTPS", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VSUBPSMasked256", argLength: 3, reg: fp2k1fp1, asm: "VSUBPS", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VXORPSMasked256", argLength: 3, reg: fp2k1fp1, asm: "VXORPS", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VMAXPS256", argLength: 2, reg: fp21, asm: "VMAXPS", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VMINPS256", argLength: 2, reg: fp21, asm: "VMINPS", commutative: true, typ: "Vec256", resultInArg0: false},
@ -194,6 +199,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VHADDPS256", argLength: 2, reg: fp21, asm: "VHADDPS", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VHSUBPS256", argLength: 2, reg: fp21, asm: "VHSUBPS", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VSQRTPS256", argLength: 1, reg: fp11, asm: "VSQRTPS", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VSUBPS256", argLength: 2, reg: fp21, asm: "VSUBPS", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VXORPS256", argLength: 2, reg: fp21, asm: "VXORPS", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VADDPD128", argLength: 2, reg: fp21, asm: "VADDPD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VADDSUBPD128", argLength: 2, reg: fp21, asm: "VADDSUBPD", commutative: false, typ: "Vec128", resultInArg0: false},
@ -250,6 +256,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VSCALEFPDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VSCALEFPD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VORPDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VORPD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VSQRTPDMasked128", argLength: 2, reg: fp1k1fp1, asm: "VSQRTPD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VSUBPDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VSUBPD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VXORPDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VXORPD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VMAXPD128", argLength: 2, reg: fp21, asm: "VMAXPD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VMINPD128", argLength: 2, reg: fp21, asm: "VMINPD", commutative: true, typ: "Vec128", resultInArg0: false},
@ -259,6 +266,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VHADDPD128", argLength: 2, reg: fp21, asm: "VHADDPD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VHSUBPD128", argLength: 2, reg: fp21, asm: "VHSUBPD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VSQRTPD128", argLength: 1, reg: fp11, asm: "VSQRTPD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VSUBPD128", argLength: 2, reg: fp21, asm: "VSUBPD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VXORPD128", argLength: 2, reg: fp21, asm: "VXORPD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VADDPD256", argLength: 2, reg: fp21, asm: "VADDPD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VADDSUBPD256", argLength: 2, reg: fp21, asm: "VADDSUBPD", commutative: false, typ: "Vec256", resultInArg0: false},
@ -315,6 +323,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VSCALEFPDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VSCALEFPD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VORPDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VORPD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VSQRTPDMasked256", argLength: 2, reg: fp1k1fp1, asm: "VSQRTPD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VSUBPDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VSUBPD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VXORPDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VXORPD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VMAXPD256", argLength: 2, reg: fp21, asm: "VMAXPD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VMINPD256", argLength: 2, reg: fp21, asm: "VMINPD", commutative: true, typ: "Vec256", resultInArg0: false},
@ -324,6 +333,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VHADDPD256", argLength: 2, reg: fp21, asm: "VHADDPD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VHSUBPD256", argLength: 2, reg: fp21, asm: "VHSUBPD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VSQRTPD256", argLength: 1, reg: fp11, asm: "VSQRTPD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VSUBPD256", argLength: 2, reg: fp21, asm: "VSUBPD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VXORPD256", argLength: 2, reg: fp21, asm: "VXORPD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VADDPD512", argLength: 2, reg: fp21, asm: "VADDPD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VANDPD512", argLength: 2, reg: fp21, asm: "VANDPD", commutative: true, typ: "Vec512", resultInArg0: false},
@ -379,6 +389,7 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VSCALEFPDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VSCALEFPD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VORPDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VORPD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VSQRTPDMasked512", argLength: 2, reg: fp1k1fp1, asm: "VSQRTPD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VSUBPDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VSUBPD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VXORPDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VXORPD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VMAXPD512", argLength: 2, reg: fp21, asm: "VMAXPD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VMINPD512", argLength: 2, reg: fp21, asm: "VMINPD", commutative: true, typ: "Vec512", resultInArg0: false},
@ -386,17 +397,14 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VSCALEFPD512", argLength: 2, reg: fp21, asm: "VSCALEFPD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VORPD512", argLength: 2, reg: fp21, asm: "VORPD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VSQRTPD512", argLength: 1, reg: fp11, asm: "VSQRTPD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VSUBPD512", argLength: 2, reg: fp21, asm: "VSUBPD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VXORPD512", argLength: 2, reg: fp21, asm: "VXORPD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPABSW256", argLength: 1, reg: fp11, asm: "VPABSW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPADDW256", argLength: 2, reg: fp21, asm: "VPADDW", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPAND256", argLength: 2, reg: fp21, asm: "VPAND", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPANDN256", argLength: 2, reg: fp21, asm: "VPANDN", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPCMPEQW256", argLength: 2, reg: fp21, asm: "VPCMPEQW", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPCMPGTW256", argLength: 2, reg: fp21, asm: "VPCMPGTW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPABSWMasked256", argLength: 2, reg: fp1k1fp1, asm: "VPABSW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPADDWMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPADDW", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPCMPEQWMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPEQW", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTWMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPGTW", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPMAXSWMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMAXSW", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMINSWMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMINSW", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMULHWMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMULHW", commutative: true, typ: "Vec256", resultInArg0: false},
@ -410,7 +418,6 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VPMINSW256", argLength: 2, reg: fp21, asm: "VPMINSW", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMULHW256", argLength: 2, reg: fp21, asm: "VPMULHW", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMULLW256", argLength: 2, reg: fp21, asm: "VPMULLW", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPOR256", argLength: 2, reg: fp21, asm: "VPOR", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMADDWD256", argLength: 2, reg: fp21, asm: "VPMADDWD", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPHADDW256", argLength: 2, reg: fp21, asm: "VPHADDW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPHSUBW256", argLength: 2, reg: fp21, asm: "VPHSUBW", commutative: false, typ: "Vec256", resultInArg0: false},
@ -421,15 +428,10 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VPSUBSW256", argLength: 2, reg: fp21, asm: "VPSUBSW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSIGNW256", argLength: 2, reg: fp21, asm: "VPSIGNW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSUBW256", argLength: 2, reg: fp21, asm: "VPSUBW", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPXOR256", argLength: 2, reg: fp21, asm: "VPXOR", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPABSW512", argLength: 1, reg: fp11, asm: "VPABSW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPADDW512", argLength: 2, reg: fp21, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPCMPEQW512", argLength: 2, reg: fp2k1, asm: "VPCMPEQW", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTW512", argLength: 2, reg: fp2k1, asm: "VPCMPGTW", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPABSWMasked512", argLength: 2, reg: fp1k1fp1, asm: "VPABSW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPADDWMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPCMPEQWMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPEQW", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTWMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPGTW", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPMAXSWMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMAXSW", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMINSWMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMINSW", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMULHWMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMULHW", commutative: true, typ: "Vec512", resultInArg0: false},
@ -450,14 +452,10 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VPSUBW512", argLength: 2, reg: fp21, asm: "VPSUBW", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPABSW128", argLength: 1, reg: fp11, asm: "VPABSW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPADDW128", argLength: 2, reg: fp21, asm: "VPADDW", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPAND128", argLength: 2, reg: fp21, asm: "VPAND", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPANDN128", argLength: 2, reg: fp21, asm: "VPANDN", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPCMPEQW128", argLength: 2, reg: fp21, asm: "VPCMPEQW", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPCMPGTW128", argLength: 2, reg: fp21, asm: "VPCMPGTW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPABSWMasked128", argLength: 2, reg: fp1k1fp1, asm: "VPABSW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPADDWMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPADDW", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPCMPEQWMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPEQW", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTWMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPGTW", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPMAXSWMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMAXSW", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMINSWMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMINSW", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMULHWMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMULHW", commutative: true, typ: "Vec128", resultInArg0: false},
@ -471,7 +469,6 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VPMINSW128", argLength: 2, reg: fp21, asm: "VPMINSW", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMULHW128", argLength: 2, reg: fp21, asm: "VPMULHW", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMULLW128", argLength: 2, reg: fp21, asm: "VPMULLW", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPOR128", argLength: 2, reg: fp21, asm: "VPOR", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMADDWD128", argLength: 2, reg: fp21, asm: "VPMADDWD", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPHADDW128", argLength: 2, reg: fp21, asm: "VPHADDW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPHSUBW128", argLength: 2, reg: fp21, asm: "VPHSUBW", commutative: false, typ: "Vec128", resultInArg0: false},
@ -482,19 +479,14 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VPSUBSW128", argLength: 2, reg: fp21, asm: "VPSUBSW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSIGNW128", argLength: 2, reg: fp21, asm: "VPSIGNW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSUBW128", argLength: 2, reg: fp21, asm: "VPSUBW", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPXOR128", argLength: 2, reg: fp21, asm: "VPXOR", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPABSD512", argLength: 1, reg: fp11, asm: "VPABSD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPADDD512", argLength: 2, reg: fp21, asm: "VPADDD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPANDD512", argLength: 2, reg: fp21, asm: "VPANDD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPANDND512", argLength: 2, reg: fp21, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPCMPEQD512", argLength: 2, reg: fp2k1, asm: "VPCMPEQD", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTD512", argLength: 2, reg: fp2k1, asm: "VPCMPGTD", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPABSDMasked512", argLength: 2, reg: fp1k1fp1, asm: "VPABSD", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPADDDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPADDD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPANDDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPANDD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPANDNDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPCMPEQDMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPEQD", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTDMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPGTD", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPMAXSDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMINSDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMINSD", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMULLDMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMULLD", commutative: true, typ: "Vec512", resultInArg0: false},
@ -525,8 +517,6 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VPADDDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPADDD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPANDDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPANDD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPANDNDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPANDND", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPCMPEQDMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPEQD", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTDMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPGTD", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPMAXSDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMAXSD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMINSDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMINSD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMULLDMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMULLD", commutative: true, typ: "Vec128", resultInArg0: false},
@ -559,8 +549,6 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VPADDDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPADDD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPANDDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPANDD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPANDNDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPANDND", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPCMPEQDMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPEQD", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTDMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPGTD", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPMAXSDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMAXSD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMINSDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMINSD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMULLDMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMULLD", commutative: true, typ: "Vec256", resultInArg0: false},
@ -588,13 +576,10 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VPABSQ128", argLength: 1, reg: fp11, asm: "VPABSQ", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPADDQ128", argLength: 2, reg: fp21, asm: "VPADDQ", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPCMPEQQ128", argLength: 2, reg: fp21, asm: "VPCMPEQQ", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPCMPGTQ128", argLength: 2, reg: fp2k1, asm: "VPCMPGTQ", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPABSQMasked128", argLength: 2, reg: fp1k1fp1, asm: "VPABSQ", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPADDQMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPADDQ", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPANDQMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPANDQ", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPANDNQMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPANDNQ", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPCMPEQQMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPEQQ", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTQMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPGTQ", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPMAXSQMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMAXSQ", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMINSQMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMINSQ", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMULDQMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMULDQ", commutative: true, typ: "Vec128", resultInArg0: false},
@ -616,8 +601,6 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VPADDQMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPADDQ", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPANDQMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPANDQ", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPANDNQMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPANDNQ", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPCMPEQQMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPEQQ", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTQMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPGTQ", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPMAXSQMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMAXSQ", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMINSQMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMINSQ", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMULDQMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMULDQ", commutative: true, typ: "Vec256", resultInArg0: false},
@ -635,14 +618,10 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VPADDQ512", argLength: 2, reg: fp21, asm: "VPADDQ", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPANDQ512", argLength: 2, reg: fp21, asm: "VPANDQ", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPANDNQ512", argLength: 2, reg: fp21, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPCMPEQQ512", argLength: 2, reg: fp2k1, asm: "VPCMPEQQ", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTQ512", argLength: 2, reg: fp2k1, asm: "VPCMPGTQ", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPABSQMasked512", argLength: 2, reg: fp1k1fp1, asm: "VPABSQ", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPADDQMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPADDQ", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPANDQMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPANDQ", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPANDNQMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPCMPEQQMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPEQQ", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTQMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPGTQ", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPMAXSQMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMAXSQ", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMINSQMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMINSQ", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMULDQMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMULDQ", commutative: true, typ: "Vec512", resultInArg0: false},
@ -661,12 +640,12 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VPXORQ512", argLength: 2, reg: fp21, asm: "VPXORQ", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPABSB128", argLength: 1, reg: fp11, asm: "VPABSB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPADDB128", argLength: 2, reg: fp21, asm: "VPADDB", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPAND128", argLength: 2, reg: fp21, asm: "VPAND", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPANDN128", argLength: 2, reg: fp21, asm: "VPANDN", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPCMPEQB128", argLength: 2, reg: fp21, asm: "VPCMPEQB", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPCMPGTB128", argLength: 2, reg: fp21, asm: "VPCMPGTB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPABSBMasked128", argLength: 2, reg: fp1k1fp1, asm: "VPABSB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPADDBMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPADDB", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPCMPEQBMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPEQB", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTBMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPGTB", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPMAXSBMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMAXSB", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMINSBMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPMINSB", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPOPCNTBMasked128", argLength: 2, reg: fp1k1fp1, asm: "VPOPCNTB", commutative: false, typ: "Vec128", resultInArg0: false},
@ -675,19 +654,21 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VPSUBBMasked128", argLength: 3, reg: fp2k1fp1, asm: "VPSUBB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPMAXSB128", argLength: 2, reg: fp21, asm: "VPMAXSB", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPMINSB128", argLength: 2, reg: fp21, asm: "VPMINSB", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPOR128", argLength: 2, reg: fp21, asm: "VPOR", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPOPCNTB128", argLength: 1, reg: fp11, asm: "VPOPCNTB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPADDSB128", argLength: 2, reg: fp21, asm: "VPADDSB", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPSUBSB128", argLength: 2, reg: fp21, asm: "VPSUBSB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSIGNB128", argLength: 2, reg: fp21, asm: "VPSIGNB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPSUBB128", argLength: 2, reg: fp21, asm: "VPSUBB", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPXOR128", argLength: 2, reg: fp21, asm: "VPXOR", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VPABSB256", argLength: 1, reg: fp11, asm: "VPABSB", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPADDB256", argLength: 2, reg: fp21, asm: "VPADDB", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPAND256", argLength: 2, reg: fp21, asm: "VPAND", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPANDN256", argLength: 2, reg: fp21, asm: "VPANDN", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPCMPEQB256", argLength: 2, reg: fp21, asm: "VPCMPEQB", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPCMPGTB256", argLength: 2, reg: fp21, asm: "VPCMPGTB", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPABSBMasked256", argLength: 2, reg: fp1k1fp1, asm: "VPABSB", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPADDBMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPADDB", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPCMPEQBMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPEQB", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTBMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPGTB", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPMAXSBMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMAXSB", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMINSBMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPMINSB", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPOPCNTBMasked256", argLength: 2, reg: fp1k1fp1, asm: "VPOPCNTB", commutative: false, typ: "Vec256", resultInArg0: false},
@ -696,19 +677,17 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VPSUBBMasked256", argLength: 3, reg: fp2k1fp1, asm: "VPSUBB", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPMAXSB256", argLength: 2, reg: fp21, asm: "VPMAXSB", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPMINSB256", argLength: 2, reg: fp21, asm: "VPMINSB", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPOR256", argLength: 2, reg: fp21, asm: "VPOR", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPOPCNTB256", argLength: 1, reg: fp11, asm: "VPOPCNTB", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPADDSB256", argLength: 2, reg: fp21, asm: "VPADDSB", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPSUBSB256", argLength: 2, reg: fp21, asm: "VPSUBSB", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSIGNB256", argLength: 2, reg: fp21, asm: "VPSIGNB", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPSUBB256", argLength: 2, reg: fp21, asm: "VPSUBB", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPXOR256", argLength: 2, reg: fp21, asm: "VPXOR", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VPABSB512", argLength: 1, reg: fp11, asm: "VPABSB", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPADDB512", argLength: 2, reg: fp21, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPCMPEQB512", argLength: 2, reg: fp2k1, asm: "VPCMPEQB", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTB512", argLength: 2, reg: fp2k1, asm: "VPCMPGTB", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPABSBMasked512", argLength: 2, reg: fp1k1fp1, asm: "VPABSB", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VPADDBMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPCMPEQBMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPEQB", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPGTBMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPGTB", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPMAXSBMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMAXSB", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPMINSBMasked512", argLength: 3, reg: fp2k1fp1, asm: "VPMINSB", commutative: true, typ: "Vec512", resultInArg0: false},
 		{name: "VPOPCNTBMasked512", argLength: 2, reg: fp1k1fp1, asm: "VPOPCNTB", commutative: false, typ: "Vec512", resultInArg0: false},
@ -841,29 +820,29 @@ func simdAMD64Ops(fp11, fp21, fp2k1, fp1k1fp1, fp2k1fp1, fp2k1k1, fp31, fp3k1fp1
 		{name: "VREDUCEPDMasked512", argLength: 2, reg: fp1k1fp1, asm: "VREDUCEPD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
 		{name: "VCMPPDMasked512", argLength: 3, reg: fp2k1k1, asm: "VCMPPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPW256", argLength: 2, reg: fp2k1, asm: "VPCMPW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPWMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPWMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPW512", argLength: 2, reg: fp2k1, asm: "VPCMPW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPW512", argLength: 2, reg: fp2k1, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPWMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPWMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPW128", argLength: 2, reg: fp2k1, asm: "VPCMPW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPWMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPWMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPD512", argLength: 2, reg: fp2k1, asm: "VPCMPD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPD512", argLength: 2, reg: fp2k1, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPDMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPDMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPD128", argLength: 2, reg: fp2k1, asm: "VPCMPD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPDMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPDMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPD256", argLength: 2, reg: fp2k1, asm: "VPCMPD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPDMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPDMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPQ128", argLength: 2, reg: fp2k1, asm: "VPCMPQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPQMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPQMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPQ256", argLength: 2, reg: fp2k1, asm: "VPCMPQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPQMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPQMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPQ512", argLength: 2, reg: fp2k1, asm: "VPCMPQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPQ512", argLength: 2, reg: fp2k1, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPQMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPQMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPB128", argLength: 2, reg: fp2k1, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPBMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPBMasked128", argLength: 3, reg: fp2k1k1, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPB256", argLength: 2, reg: fp2k1, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPBMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPBMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPB512", argLength: 2, reg: fp2k1, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPB512", argLength: 2, reg: fp2k1, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
-		{name: "VPCMPBMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
+		{name: "VPCMPBMasked512", argLength: 3, reg: fp2k1k1, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPUW256", argLength: 2, reg: fp2k1, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPUWMasked256", argLength: 3, reg: fp2k1k1, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPUW512", argLength: 2, reg: fp2k1, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@ -4584,22 +4584,22 @@ func rewriteValueAMD64(v *Value) bool {
 		v.Op = OpAMD64SUBL
 		return true
 	case OpSubFloat32x16:
-		v.Op = OpAMD64VADDPS512
+		v.Op = OpAMD64VSUBPS512
 		return true
 	case OpSubFloat32x4:
-		v.Op = OpAMD64VADDPS128
+		v.Op = OpAMD64VSUBPS128
 		return true
 	case OpSubFloat32x8:
-		v.Op = OpAMD64VADDPS256
+		v.Op = OpAMD64VSUBPS256
 		return true
 	case OpSubFloat64x2:
-		v.Op = OpAMD64VADDPD128
+		v.Op = OpAMD64VSUBPD128
 		return true
 	case OpSubFloat64x4:
-		v.Op = OpAMD64VADDPD256
+		v.Op = OpAMD64VSUBPD256
 		return true
 	case OpSubFloat64x8:
-		v.Op = OpAMD64VADDPD512
+		v.Op = OpAMD64VSUBPD512
 		return true
 	case OpSubInt16x16:
 		v.Op = OpAMD64VPSUBW256
@ -30476,12 +30476,13 @@ func rewriteValueAMD64_OpEqualInt16x32(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (EqualInt16x32 x y)
-	// result: (VPMOVMToVec16x32 (VPCMPEQW512 x y))
+	// result: (VPMOVMToVec16x32 (VPCMPW512 [0] x y))
 	for {
 		x := v_0
 		y := v_1
 		v.reset(OpAMD64VPMOVMToVec16x32)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQW512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPW512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v0.AddArg2(x, y)
 		v.AddArg(v0)
 		return true
@ -30493,12 +30494,13 @@ func rewriteValueAMD64_OpEqualInt32x16(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (EqualInt32x16 x y)
-	// result: (VPMOVMToVec32x16 (VPCMPEQD512 x y))
+	// result: (VPMOVMToVec32x16 (VPCMPD512 [0] x y))
 	for {
 		x := v_0
 		y := v_1
 		v.reset(OpAMD64VPMOVMToVec32x16)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQD512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPD512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v0.AddArg2(x, y)
 		v.AddArg(v0)
 		return true
@ -30510,12 +30512,13 @@ func rewriteValueAMD64_OpEqualInt64x8(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (EqualInt64x8 x y)
-	// result: (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
+	// result: (VPMOVMToVec64x8 (VPCMPQ512 [0] x y))
 	for {
 		x := v_0
 		y := v_1
 		v.reset(OpAMD64VPMOVMToVec64x8)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQQ512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQ512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v0.AddArg2(x, y)
 		v.AddArg(v0)
 		return true
@ -30527,12 +30530,13 @@ func rewriteValueAMD64_OpEqualInt8x64(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (EqualInt8x64 x y)
-	// result: (VPMOVMToVec8x64 (VPCMPEQB512 x y))
+	// result: (VPMOVMToVec8x64 (VPCMPB512 [0] x y))
 	for {
 		x := v_0
 		y := v_1
 		v.reset(OpAMD64VPMOVMToVec8x64)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQB512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPB512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v0.AddArg2(x, y)
 		v.AddArg(v0)
 		return true
@ -31623,12 +31627,13 @@ func rewriteValueAMD64_OpGreaterInt16x32(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (GreaterInt16x32 x y)
-	// result: (VPMOVMToVec16x32 (VPCMPGTW512 x y))
+	// result: (VPMOVMToVec16x32 (VPCMPW512 [6] x y))
 	for {
 		x := v_0
 		y := v_1
 		v.reset(OpAMD64VPMOVMToVec16x32)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTW512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPW512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v0.AddArg2(x, y)
 		v.AddArg(v0)
 		return true
@ -31640,12 +31645,13 @@ func rewriteValueAMD64_OpGreaterInt32x16(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (GreaterInt32x16 x y)
-	// result: (VPMOVMToVec32x16 (VPCMPGTD512 x y))
+	// result: (VPMOVMToVec32x16 (VPCMPD512 [6] x y))
 	for {
 		x := v_0
 		y := v_1
 		v.reset(OpAMD64VPMOVMToVec32x16)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTD512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPD512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v0.AddArg2(x, y)
 		v.AddArg(v0)
 		return true
@ -31657,12 +31663,13 @@ func rewriteValueAMD64_OpGreaterInt64x2(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (GreaterInt64x2 x y)
-	// result: (VPMOVMToVec64x2 (VPCMPGTQ128 x y))
+	// result: (VPMOVMToVec64x2 (VPCMPQ128 [6] x y))
 	for {
 		x := v_0
 		y := v_1
 		v.reset(OpAMD64VPMOVMToVec64x2)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTQ128, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQ128, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v0.AddArg2(x, y)
 		v.AddArg(v0)
 		return true
@ -31674,12 +31681,13 @@ func rewriteValueAMD64_OpGreaterInt64x8(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (GreaterInt64x8 x y)
-	// result: (VPMOVMToVec64x8 (VPCMPGTQ512 x y))
+	// result: (VPMOVMToVec64x8 (VPCMPQ512 [6] x y))
 	for {
 		x := v_0
 		y := v_1
 		v.reset(OpAMD64VPMOVMToVec64x8)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTQ512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQ512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v0.AddArg2(x, y)
 		v.AddArg(v0)
 		return true
@ -31691,12 +31699,13 @@ func rewriteValueAMD64_OpGreaterInt8x64(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (GreaterInt8x64 x y)
-	// result: (VPMOVMToVec8x64 (VPCMPGTB512 x y))
+	// result: (VPMOVMToVec8x64 (VPCMPB512 [6] x y))
 	for {
 		x := v_0
 		y := v_1
 		v.reset(OpAMD64VPMOVMToVec8x64)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTB512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPB512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v0.AddArg2(x, y)
 		v.AddArg(v0)
 		return true
@ -37259,13 +37268,14 @@ func rewriteValueAMD64_OpMaskedEqualInt16x16(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedEqualInt16x16 x y mask)
-	// result: (VPMOVMToVec16x16 (VPCMPEQWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec16x16 (VPCMPWMasked256 [0] x y (VPMOVVec16x16ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec16x16)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQWMasked256, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPWMasked256, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -37280,13 +37290,14 @@ func rewriteValueAMD64_OpMaskedEqualInt16x32(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedEqualInt16x32 x y mask)
-	// result: (VPMOVMToVec16x32 (VPCMPEQWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec16x32 (VPCMPWMasked512 [0] x y (VPMOVVec16x32ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec16x32)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQWMasked512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPWMasked512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -37301,13 +37312,14 @@ func rewriteValueAMD64_OpMaskedEqualInt16x8(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedEqualInt16x8 x y mask)
-	// result: (VPMOVMToVec16x8 (VPCMPEQWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec16x8 (VPCMPWMasked128 [0] x y (VPMOVVec16x8ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec16x8)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQWMasked128, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPWMasked128, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -37322,13 +37334,14 @@ func rewriteValueAMD64_OpMaskedEqualInt32x16(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedEqualInt32x16 x y mask)
-	// result: (VPMOVMToVec32x16 (VPCMPEQDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec32x16 (VPCMPDMasked512 [0] x y (VPMOVVec32x16ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec32x16)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQDMasked512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPDMasked512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -37343,13 +37356,14 @@ func rewriteValueAMD64_OpMaskedEqualInt32x4(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedEqualInt32x4 x y mask)
-	// result: (VPMOVMToVec32x4 (VPCMPEQDMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec32x4 (VPCMPDMasked128 [0] x y (VPMOVVec32x4ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec32x4)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQDMasked128, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPDMasked128, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -37364,13 +37378,14 @@ func rewriteValueAMD64_OpMaskedEqualInt32x8(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedEqualInt32x8 x y mask)
-	// result: (VPMOVMToVec32x8 (VPCMPEQDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec32x8 (VPCMPDMasked256 [0] x y (VPMOVVec32x8ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec32x8)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQDMasked256, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPDMasked256, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -37385,13 +37400,14 @@ func rewriteValueAMD64_OpMaskedEqualInt64x2(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedEqualInt64x2 x y mask)
-	// result: (VPMOVMToVec64x2 (VPCMPEQQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec64x2 (VPCMPQMasked128 [0] x y (VPMOVVec64x2ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec64x2)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQQMasked128, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQMasked128, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -37406,13 +37422,14 @@ func rewriteValueAMD64_OpMaskedEqualInt64x4(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedEqualInt64x4 x y mask)
-	// result: (VPMOVMToVec64x4 (VPCMPEQQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec64x4 (VPCMPQMasked256 [0] x y (VPMOVVec64x4ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec64x4)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQQMasked256, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQMasked256, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -37427,13 +37444,14 @@ func rewriteValueAMD64_OpMaskedEqualInt64x8(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedEqualInt64x8 x y mask)
-	// result: (VPMOVMToVec64x8 (VPCMPEQQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec64x8 (VPCMPQMasked512 [0] x y (VPMOVVec64x8ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec64x8)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQQMasked512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQMasked512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -37448,13 +37466,14 @@ func rewriteValueAMD64_OpMaskedEqualInt8x16(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedEqualInt8x16 x y mask)
-	// result: (VPMOVMToVec8x16 (VPCMPEQBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec8x16 (VPCMPBMasked128 [0] x y (VPMOVVec8x16ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec8x16)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQBMasked128, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPBMasked128, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -37469,13 +37488,14 @@ func rewriteValueAMD64_OpMaskedEqualInt8x32(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedEqualInt8x32 x y mask)
-	// result: (VPMOVMToVec8x32 (VPCMPEQBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec8x32 (VPCMPBMasked256 [0] x y (VPMOVVec8x32ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec8x32)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQBMasked256, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPBMasked256, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -37490,13 +37510,14 @@ func rewriteValueAMD64_OpMaskedEqualInt8x64(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedEqualInt8x64 x y mask)
-	// result: (VPMOVMToVec8x64 (VPCMPEQBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec8x64 (VPCMPBMasked512 [0] x y (VPMOVVec8x64ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec8x64)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQBMasked512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPBMasked512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(0)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -40943,13 +40964,14 @@ func rewriteValueAMD64_OpMaskedGreaterInt16x16(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedGreaterInt16x16 x y mask)
-	// result: (VPMOVMToVec16x16 (VPCMPGTWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec16x16 (VPCMPWMasked256 [6] x y (VPMOVVec16x16ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec16x16)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTWMasked256, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPWMasked256, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -40964,13 +40986,14 @@ func rewriteValueAMD64_OpMaskedGreaterInt16x32(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedGreaterInt16x32 x y mask)
-	// result: (VPMOVMToVec16x32 (VPCMPGTWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec16x32 (VPCMPWMasked512 [6] x y (VPMOVVec16x32ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec16x32)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTWMasked512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPWMasked512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -40985,13 +41008,14 @@ func rewriteValueAMD64_OpMaskedGreaterInt16x8(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedGreaterInt16x8 x y mask)
-	// result: (VPMOVMToVec16x8 (VPCMPGTWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec16x8 (VPCMPWMasked128 [6] x y (VPMOVVec16x8ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec16x8)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTWMasked128, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPWMasked128, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -41006,13 +41030,14 @@ func rewriteValueAMD64_OpMaskedGreaterInt32x16(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedGreaterInt32x16 x y mask)
-	// result: (VPMOVMToVec32x16 (VPCMPGTDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec32x16 (VPCMPDMasked512 [6] x y (VPMOVVec32x16ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec32x16)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTDMasked512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPDMasked512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -41027,13 +41052,14 @@ func rewriteValueAMD64_OpMaskedGreaterInt32x4(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedGreaterInt32x4 x y mask)
-	// result: (VPMOVMToVec32x4 (VPCMPGTDMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec32x4 (VPCMPDMasked128 [6] x y (VPMOVVec32x4ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec32x4)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTDMasked128, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPDMasked128, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -41048,13 +41074,14 @@ func rewriteValueAMD64_OpMaskedGreaterInt32x8(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedGreaterInt32x8 x y mask)
-	// result: (VPMOVMToVec32x8 (VPCMPGTDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec32x8 (VPCMPDMasked256 [6] x y (VPMOVVec32x8ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec32x8)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTDMasked256, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPDMasked256, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -41069,13 +41096,14 @@ func rewriteValueAMD64_OpMaskedGreaterInt64x2(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedGreaterInt64x2 x y mask)
-	// result: (VPMOVMToVec64x2 (VPCMPGTQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec64x2 (VPCMPQMasked128 [6] x y (VPMOVVec64x2ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec64x2)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTQMasked128, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQMasked128, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -41090,13 +41118,14 @@ func rewriteValueAMD64_OpMaskedGreaterInt64x4(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedGreaterInt64x4 x y mask)
-	// result: (VPMOVMToVec64x4 (VPCMPGTQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec64x4 (VPCMPQMasked256 [6] x y (VPMOVVec64x4ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec64x4)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTQMasked256, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQMasked256, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -41111,13 +41140,14 @@ func rewriteValueAMD64_OpMaskedGreaterInt64x8(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedGreaterInt64x8 x y mask)
-	// result: (VPMOVMToVec64x8 (VPCMPGTQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec64x8 (VPCMPQMasked512 [6] x y (VPMOVVec64x8ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec64x8)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTQMasked512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQMasked512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -41132,13 +41162,14 @@ func rewriteValueAMD64_OpMaskedGreaterInt8x16(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedGreaterInt8x16 x y mask)
-	// result: (VPMOVMToVec8x16 (VPCMPGTBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec8x16 (VPCMPBMasked128 [6] x y (VPMOVVec8x16ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec8x16)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTBMasked128, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPBMasked128, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -41153,13 +41184,14 @@ func rewriteValueAMD64_OpMaskedGreaterInt8x32(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedGreaterInt8x32 x y mask)
-	// result: (VPMOVMToVec8x32 (VPCMPGTBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec8x32 (VPCMPBMasked256 [6] x y (VPMOVVec8x32ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec8x32)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTBMasked256, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPBMasked256, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -41174,13 +41206,14 @@ func rewriteValueAMD64_OpMaskedGreaterInt8x64(v *Value) bool {
 	b := v.Block
 	typ := &b.Func.Config.Types
 	// match: (MaskedGreaterInt8x64 x y mask)
-	// result: (VPMOVMToVec8x64 (VPCMPGTBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask)))
+	// result: (VPMOVMToVec8x64 (VPCMPBMasked512 [6] x y (VPMOVVec8x64ToM <types.TypeMask> mask)))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
 		v.reset(OpAMD64VPMOVMToVec8x64)
-		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTBMasked512, typ.Mask)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPCMPBMasked512, typ.Mask)
 		v0.AuxInt = int8ToAuxInt(6)
 		v1 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask)
 		v1.AddArg(mask)
 		v0.AddArg3(x, y, v1)
@ -47044,12 +47077,12 @@ func rewriteValueAMD64_OpMaskedSubFloat32x16(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
 	// match: (MaskedSubFloat32x16 x y mask)
-	// result: (VADDPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+	// result: (VSUBPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
-		v.reset(OpAMD64VADDPSMasked512)
+		v.reset(OpAMD64VSUBPSMasked512)
 		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
 		v0.AddArg(mask)
 		v.AddArg3(x, y, v0)
@ -47062,12 +47095,12 @@ func rewriteValueAMD64_OpMaskedSubFloat32x4(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
 	// match: (MaskedSubFloat32x4 x y mask)
-	// result: (VADDPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
+	// result: (VSUBPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
-		v.reset(OpAMD64VADDPSMasked128)
+		v.reset(OpAMD64VSUBPSMasked128)
 		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
 		v0.AddArg(mask)
 		v.AddArg3(x, y, v0)
@ -47080,12 +47113,12 @@ func rewriteValueAMD64_OpMaskedSubFloat32x8(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
 	// match: (MaskedSubFloat32x8 x y mask)
-	// result: (VADDPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+	// result: (VSUBPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
-		v.reset(OpAMD64VADDPSMasked256)
+		v.reset(OpAMD64VSUBPSMasked256)
 		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
 		v0.AddArg(mask)
 		v.AddArg3(x, y, v0)
@ -47098,12 +47131,12 @@ func rewriteValueAMD64_OpMaskedSubFloat64x2(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
 	// match: (MaskedSubFloat64x2 x y mask)
-	// result: (VADDPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
+	// result: (VSUBPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
-		v.reset(OpAMD64VADDPDMasked128)
+		v.reset(OpAMD64VSUBPDMasked128)
 		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask)
 		v0.AddArg(mask)
 		v.AddArg3(x, y, v0)
@ -47116,12 +47149,12 @@ func rewriteValueAMD64_OpMaskedSubFloat64x4(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
 	// match: (MaskedSubFloat64x4 x y mask)
-	// result: (VADDPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+	// result: (VSUBPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
-		v.reset(OpAMD64VADDPDMasked256)
+		v.reset(OpAMD64VSUBPDMasked256)
 		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask)
 		v0.AddArg(mask)
 		v.AddArg3(x, y, v0)
@ -47134,12 +47167,12 @@ func rewriteValueAMD64_OpMaskedSubFloat64x8(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
 	// match: (MaskedSubFloat64x8 x y mask)
-	// result: (VADDPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+	// result: (VSUBPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
 	for {
 		x := v_0
 		y := v_1
 		mask := v_2
-		v.reset(OpAMD64VADDPDMasked512)
+		v.reset(OpAMD64VSUBPDMasked512)
 		v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask)
 		v0.AddArg(mask)
 		v.AddArg3(x, y, v0)
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@ -1370,195 +1370,195 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint32x4.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint32x8.MaskedSaturatedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x8.MaskedUnsignedSignedQuadDotProdAccumulate", opLen4(ssa.OpMaskedUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float32x16.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.CeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpCeilSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithCeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.DiffWithCeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.DiffWithCeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithCeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.DiffWithCeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.DiffWithCeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.DiffWithCeilSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithFloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.DiffWithFloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.DiffWithFloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithFloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.DiffWithFloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.DiffWithFloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.DiffWithFloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithRoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.DiffWithRoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.DiffWithRoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithRoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.DiffWithRoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.DiffWithRoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.DiffWithRoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithTruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.DiffWithTruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.DiffWithTruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithTruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.DiffWithTruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.DiffWithTruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.DiffWithTruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.FloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpFloorSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.FloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpFloorSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.FloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpFloorSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.FloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpFloorSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.FloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpFloorSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.FloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpFloorSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.FloorSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpFloorSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.RoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpRoundSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.RoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpRoundSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.RoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpRoundSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.RoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpRoundSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.RoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpRoundSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.RoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpRoundSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.RoundSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpRoundSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.TruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpTruncSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.TruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpTruncSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.TruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpTruncSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.TruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpTruncSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.TruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpTruncSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.TruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpTruncSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.TruncSuppressExceptionWithPrecision", opLen1Imm8(ssa.OpTruncSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedCeilSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedCeilWithPrecision", opLen2Imm8(ssa.OpMaskedCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedDiffWithCeilSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithCeilWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedDiffWithCeilWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedDiffWithCeilWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithCeilWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedDiffWithCeilWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedDiffWithCeilWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedDiffWithCeilWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedDiffWithFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedDiffWithFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedDiffWithFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedDiffWithFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedDiffWithFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithFloorWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedDiffWithFloorWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedDiffWithFloorWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithFloorWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedDiffWithFloorWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedDiffWithFloorWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedDiffWithFloorWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedDiffWithRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedDiffWithRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedDiffWithRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedDiffWithRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedDiffWithRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithRoundWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedDiffWithRoundWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedDiffWithRoundWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithRoundWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedDiffWithRoundWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedDiffWithRoundWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedDiffWithRoundWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedDiffWithTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedDiffWithTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedDiffWithTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedDiffWithTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedDiffWithTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithTruncWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedDiffWithTruncWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedDiffWithTruncWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedDiffWithTruncWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedDiffWithTruncWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedDiffWithTruncWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedDiffWithTruncWithPrecision", opLen2Imm8(ssa.OpMaskedDiffWithTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedFloorSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedFloorSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedFloorSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedFloorSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedFloorSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedFloorSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedFloorSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedFloorSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedFloorWithPrecision", opLen2Imm8(ssa.OpMaskedFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedFloorWithPrecision", opLen2Imm8(ssa.OpMaskedFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedFloorWithPrecision", opLen2Imm8(ssa.OpMaskedFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedFloorWithPrecision", opLen2Imm8(ssa.OpMaskedFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedFloorWithPrecision", opLen2Imm8(ssa.OpMaskedFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedFloorWithPrecision", opLen2Imm8(ssa.OpMaskedFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedFloorWithPrecision", opLen2Imm8(ssa.OpMaskedFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedRoundSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedRoundSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedRoundSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedRoundSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedRoundSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedRoundSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedRoundSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedRoundSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedRoundWithPrecision", opLen2Imm8(ssa.OpMaskedRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedRoundWithPrecision", opLen2Imm8(ssa.OpMaskedRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedRoundWithPrecision", opLen2Imm8(ssa.OpMaskedRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedRoundWithPrecision", opLen2Imm8(ssa.OpMaskedRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedRoundWithPrecision", opLen2Imm8(ssa.OpMaskedRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedRoundWithPrecision", opLen2Imm8(ssa.OpMaskedRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedRoundWithPrecision", opLen2Imm8(ssa.OpMaskedRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedTruncSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedTruncSuppressExceptionWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedTruncSuppressExceptionWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedTruncSuppressExceptionWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedTruncSuppressExceptionWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedTruncSuppressExceptionWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedTruncSuppressExceptionWithPrecision", opLen2Imm8(ssa.OpMaskedTruncSuppressExceptionWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedTruncWithPrecision", opLen2Imm8(ssa.OpMaskedTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.MaskedTruncWithPrecision", opLen2Imm8(ssa.OpMaskedTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float32x8.MaskedTruncWithPrecision", opLen2Imm8(ssa.OpMaskedTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float32x16.MaskedTruncWithPrecision", opLen2Imm8(ssa.OpMaskedTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float64x2.MaskedTruncWithPrecision", opLen2Imm8(ssa.OpMaskedTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.MaskedTruncWithPrecision", opLen2Imm8(ssa.OpMaskedTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.MaskedTruncWithPrecision", opLen2Imm8(ssa.OpMaskedTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
@ -1832,12 +1832,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint8x64.AsUint16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Uint8x64.AsUint32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Uint8x64.AsUint64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "LoadInt8x16", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Int8x16.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadInt16x8", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Int16x8.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadInt32x4", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Int32x4.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadInt8x16", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Int8x16.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadInt64x2", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Int64x2.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask64x2", simdLoad(), sys.AMD64)
@ -1846,26 +1846,26 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Float32x4.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadFloat64x2", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Float64x2.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint8x16", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint8x16.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint16x8", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint16x8.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint32x4", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint32x4.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint64x2", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint64x2.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint8x16", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint8x16.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask32x4", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Mask32x4.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask16x8", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Mask16x8.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask8x16", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Mask8x16.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask16x8", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Mask16x8.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadInt8x32", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Int8x32.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadInt16x16", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Int16x16.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadInt32x8", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Int32x8.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadInt8x32", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Int8x32.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadInt64x4", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Int64x4.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask64x4", simdLoad(), sys.AMD64)
@ -1874,20 +1874,24 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Float32x8.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadFloat64x4", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Float64x4.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint8x32", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint8x32.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint16x16", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint16x16.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint32x8", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint32x8.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint64x4", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint64x4.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint8x32", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint8x32.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask32x8", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Mask32x8.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask16x16", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Mask16x16.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask8x32", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Mask8x32.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask16x16", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Mask16x16.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadInt8x64", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Int8x64.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask8x64", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Mask8x64.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadInt16x32", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Int16x32.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask16x32", simdLoad(), sys.AMD64)
@ -1900,22 +1904,18 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Int64x8.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask64x8", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Mask64x8.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadInt8x64", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Int8x64.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadMask8x64", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Mask8x64.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadFloat32x16", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Float32x16.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadFloat64x8", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Float64x8.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint8x64", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint8x64.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint16x32", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint16x32.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint32x16", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint32x16.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint64x8", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint64x8.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint8x64", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint8x64.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "Mask16x16.AsInt16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int16x16.AsMask16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask16x16.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
--- a/src/simd/stubs_amd64.go
+++ b/src/simd/stubs_amd64.go
--- a/src/simd/types_amd64.go
+++ b/src/simd/types_amd64.go
@ -9,6 +9,25 @@ type v128 struct {
 	_128 struct{}
 }
 // Int8x16 is a 128-bit SIMD vector of 16 int8 elements.
 //
 // The int8x16 field has the tag type v128, and vals is a [16]int8 array
 // with one entry per element.
 // NOTE(review): v128 is presumably the 128-bit counterpart of the v256
 // tag type that marks a struct as SIMD for the compiler; confirm.
 type Int8x16 struct {
 	int8x16 v128
 	vals    [16]int8
 }
 // Len returns the number of elements in an Int8x16, which is always 16.
 func (x Int8x16) Len() int { return 16 }
 // LoadInt8x16 loads an Int8x16 from the 16-element int8 array that y
 // points to.
 //
 // The declaration has no Go body; the compiler's simdIntrinsics table
 // registers "LoadInt8x16" for AMD64.
 //
 //go:noescape
 func LoadInt8x16(y *[16]int8) Int8x16
 // Store stores the Int8x16 x to the 16-element int8 array that y points to.
 //
 // The declaration has no Go body; the compiler's simdIntrinsics table
 // registers "Int8x16.Store" for AMD64.
 //
 //go:noescape
 func (x Int8x16) Store(y *[16]int8)
 // Int16x8 is a 128-bit SIMD vector of 8 int16
 type Int16x8 struct {
 	int16x8 v128
@ -47,25 +66,6 @@ func LoadInt32x4(y *[4]int32) Int32x4
 //go:noescape
 func (x Int32x4) Store(y *[4]int32)
 // Int8x16 is a 128-bit SIMD vector of 16 int8 elements.
 //
 // The int8x16 field has the tag type v128, and vals is a [16]int8 array
 // with one entry per element.
 // NOTE(review): v128 is presumably the 128-bit counterpart of the v256
 // tag type that marks a struct as SIMD for the compiler; confirm.
 type Int8x16 struct {
 	int8x16 v128
 	vals    [16]int8
 }
 // Len returns the number of elements in an Int8x16, which is always 16.
 func (x Int8x16) Len() int { return 16 }
 // LoadInt8x16 loads an Int8x16 from the 16-element int8 array that y
 // points to.
 //
 // The declaration has no Go body; the compiler's simdIntrinsics table
 // registers "LoadInt8x16" for AMD64.
 //
 //go:noescape
 func LoadInt8x16(y *[16]int8) Int8x16
 // Store stores the Int8x16 x to the 16-element int8 array that y points to.
 //
 // The declaration has no Go body; the compiler's simdIntrinsics table
 // registers "Int8x16.Store" for AMD64.
 //
 //go:noescape
 func (x Int8x16) Store(y *[16]int8)
 // Int64x2 is a 128-bit SIMD vector of 2 int64
 type Int64x2 struct {
 	int64x2 v128
@ -129,6 +129,25 @@ func LoadFloat64x2(y *[2]float64) Float64x2
 //go:noescape
 func (x Float64x2) Store(y *[2]float64)
 // Uint8x16 is a 128-bit SIMD vector of 16 uint8 elements.
 //
 // The uint8x16 field has the tag type v128, and vals is a [16]uint8 array
 // with one entry per element.
 // NOTE(review): v128 is presumably the 128-bit counterpart of the v256
 // tag type that marks a struct as SIMD for the compiler; confirm.
 type Uint8x16 struct {
 	uint8x16 v128
 	vals     [16]uint8
 }
 // Len returns the number of elements in a Uint8x16, which is always 16.
 func (x Uint8x16) Len() int { return 16 }
 // LoadUint8x16 loads a Uint8x16 from the 16-element uint8 array that y
 // points to.
 //
 // The declaration has no Go body; the compiler's simdIntrinsics table
 // registers "LoadUint8x16" for AMD64.
 //
 //go:noescape
 func LoadUint8x16(y *[16]uint8) Uint8x16
 // Store stores the Uint8x16 x to the 16-element uint8 array that y points to.
 //
 // The declaration has no Go body; the compiler's simdIntrinsics table
 // registers "Uint8x16.Store" for AMD64.
 //
 //go:noescape
 func (x Uint8x16) Store(y *[16]uint8)
 // Uint16x8 is a 128-bit SIMD vector of 8 uint16
 type Uint16x8 struct {
 	uint16x8 v128
@ -186,48 +205,48 @@ func LoadUint64x2(y *[2]uint64) Uint64x2
 //go:noescape
 func (x Uint64x2) Store(y *[2]uint64)
 // Uint8x16 is a 128-bit SIMD vector of 16 uint8.
 //
 // The uint8x16 field carries the v128 tag type and vals is a [16]uint8, which
 // accounts for the 16 bytes (128 bits) of element storage.
 // NOTE(review): presumably the compiler handles this type specially because
 // of the v128 tag — confirm against the v128 declaration.
 type Uint8x16 struct {
 	uint8x16 v128
 	vals     [16]uint8
 }
 // Len returns the number of elements in a Uint8x16, which is always 16.
 func (x Uint8x16) Len() int { return 16 }
 // LoadUint8x16 loads a Uint8x16 from the 16-element uint8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func LoadUint8x16(y *[16]uint8) Uint8x16
 // Store stores x into the 16-element uint8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func (x Uint8x16) Store(y *[16]uint8)
 // Mask32x4 is a 128-bit SIMD vector of 4 int32.
 //
 // Its fields are a v128 tag (int32x4) and a [4]int32 (vals), i.e. 16 bytes
 // of element storage.
 // NOTE(review): the name suggests a per-lane mask; how lane values encode
 // true/false is not visible here — confirm before relying on it.
 type Mask32x4 struct {
 	int32x4 v128
 	vals    [4]int32
 }
 // Mask16x8 is a 128-bit SIMD vector of 8 int16.
 //
 // Its fields are a v128 tag (int16x8) and a [8]int16 (vals), i.e. 16 bytes
 // of element storage.
 // NOTE(review): the name suggests a per-lane mask; how lane values encode
 // true/false is not visible here — confirm before relying on it.
 type Mask16x8 struct {
 	int16x8 v128
 	vals    [8]int16
 }
 // Mask8x16 is a 128-bit SIMD vector of 16 int8.
 //
 // Its fields are a v128 tag (int8x16) and a [16]int8 (vals), i.e. 16 bytes
 // of element storage.
 // NOTE(review): the name suggests a per-lane mask; how lane values encode
 // true/false is not visible here — confirm before relying on it.
 type Mask8x16 struct {
 	int8x16 v128
 	vals    [16]int8
 }
 // Mask16x8 is a 128-bit SIMD vector of 8 int16.
 //
 // Its fields are a v128 tag (int16x8) and a [8]int16 (vals), i.e. 16 bytes
 // of element storage.
 // NOTE(review): the name suggests a per-lane mask; how lane values encode
 // true/false is not visible here — confirm before relying on it.
 type Mask16x8 struct {
 	int16x8 v128
 	vals    [8]int16
 }
 // v256 is a tag type that tells the compiler that this is really 256-bit SIMD.
 //
 // Its only field, _256, is an empty struct, so v256 itself adds no storage
 // to the vector types that embed it as a field.
 type v256 struct {
 	_256 struct{}
 }
 // Int8x32 is a 256-bit SIMD vector of 32 int8.
 //
 // The int8x32 field carries the v256 tag type and vals is a [32]int8, which
 // accounts for the 32 bytes (256 bits) of element storage.
 type Int8x32 struct {
 	int8x32 v256
 	vals    [32]int8
 }
 // Len returns the number of elements in an Int8x32, which is always 32.
 func (x Int8x32) Len() int { return 32 }
 // LoadInt8x32 loads an Int8x32 from the 32-element int8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func LoadInt8x32(y *[32]int8) Int8x32
 // Store stores x into the 32-element int8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func (x Int8x32) Store(y *[32]int8)
 // Int16x16 is a 256-bit SIMD vector of 16 int16
 type Int16x16 struct {
 	int16x16 v256
@ -266,25 +285,6 @@ func LoadInt32x8(y *[8]int32) Int32x8
 //go:noescape
 func (x Int32x8) Store(y *[8]int32)
 // Int8x32 is a 256-bit SIMD vector of 32 int8.
 //
 // The int8x32 field carries the v256 tag type and vals is a [32]int8, which
 // accounts for the 32 bytes (256 bits) of element storage.
 type Int8x32 struct {
 	int8x32 v256
 	vals    [32]int8
 }
 // Len returns the number of elements in an Int8x32, which is always 32.
 func (x Int8x32) Len() int { return 32 }
 // LoadInt8x32 loads an Int8x32 from the 32-element int8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func LoadInt8x32(y *[32]int8) Int8x32
 // Store stores x into the 32-element int8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func (x Int8x32) Store(y *[32]int8)
 // Int64x4 is a 256-bit SIMD vector of 4 int64
 type Int64x4 struct {
 	int64x4 v256
@ -348,6 +348,25 @@ func LoadFloat64x4(y *[4]float64) Float64x4
 //go:noescape
 func (x Float64x4) Store(y *[4]float64)
 // Uint8x32 is a 256-bit SIMD vector of 32 uint8.
 //
 // The uint8x32 field carries the v256 tag type and vals is a [32]uint8, which
 // accounts for the 32 bytes (256 bits) of element storage.
 type Uint8x32 struct {
 	uint8x32 v256
 	vals     [32]uint8
 }
 // Len returns the number of elements in a Uint8x32, which is always 32.
 func (x Uint8x32) Len() int { return 32 }
 // LoadUint8x32 loads a Uint8x32 from the 32-element uint8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func LoadUint8x32(y *[32]uint8) Uint8x32
 // Store stores x into the 32-element uint8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func (x Uint8x32) Store(y *[32]uint8)
 // Uint16x16 is a 256-bit SIMD vector of 16 uint16
 type Uint16x16 struct {
 	uint16x16 v256
@ -405,48 +424,54 @@ func LoadUint64x4(y *[4]uint64) Uint64x4
 //go:noescape
 func (x Uint64x4) Store(y *[4]uint64)
 // Uint8x32 is a 256-bit SIMD vector of 32 uint8.
 //
 // The uint8x32 field carries the v256 tag type and vals is a [32]uint8, which
 // accounts for the 32 bytes (256 bits) of element storage.
 type Uint8x32 struct {
 	uint8x32 v256
 	vals     [32]uint8
 }
 // Len returns the number of elements in a Uint8x32, which is always 32.
 func (x Uint8x32) Len() int { return 32 }
 // LoadUint8x32 loads a Uint8x32 from the 32-element uint8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func LoadUint8x32(y *[32]uint8) Uint8x32
 // Store stores x into the 32-element uint8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func (x Uint8x32) Store(y *[32]uint8)
 // Mask32x8 is a 256-bit SIMD vector of 8 int32.
 //
 // Its fields are a v256 tag (int32x8) and a [8]int32 (vals), i.e. 32 bytes
 // of element storage.
 // NOTE(review): the name suggests a per-lane mask; how lane values encode
 // true/false is not visible here — confirm before relying on it.
 type Mask32x8 struct {
 	int32x8 v256
 	vals    [8]int32
 }
 // Mask16x16 is a 256-bit SIMD vector of 16 int16.
 //
 // Its fields are a v256 tag (int16x16) and a [16]int16 (vals), i.e. 32 bytes
 // of element storage.
 // NOTE(review): the name suggests a per-lane mask; how lane values encode
 // true/false is not visible here — confirm before relying on it.
 type Mask16x16 struct {
 	int16x16 v256
 	vals     [16]int16
 }
 // Mask8x32 is a 256-bit SIMD vector of 32 int8.
 //
 // Its fields are a v256 tag (int8x32) and a [32]int8 (vals), i.e. 32 bytes
 // of element storage.
 // NOTE(review): the name suggests a per-lane mask; how lane values encode
 // true/false is not visible here — confirm before relying on it.
 type Mask8x32 struct {
 	int8x32 v256
 	vals    [32]int8
 }
 // Mask16x16 is a 256-bit SIMD vector of 16 int16.
 //
 // Its fields are a v256 tag (int16x16) and a [16]int16 (vals), i.e. 32 bytes
 // of element storage.
 // NOTE(review): the name suggests a per-lane mask; how lane values encode
 // true/false is not visible here — confirm before relying on it.
 type Mask16x16 struct {
 	int16x16 v256
 	vals     [16]int16
 }
 // v512 is a tag type that tells the compiler that this is really 512-bit SIMD.
 //
 // Its only field, _512, is an empty struct, so v512 itself adds no storage
 // to the vector types that embed it as a field.
 type v512 struct {
 	_512 struct{}
 }
 // Int8x64 is a 512-bit SIMD vector of 64 int8.
 //
 // The int8x64 field carries the v512 tag type and vals is a [64]int8, which
 // accounts for the 64 bytes (512 bits) of element storage.
 type Int8x64 struct {
 	int8x64 v512
 	vals    [64]int8
 }
 // Len returns the number of elements in an Int8x64, which is always 64.
 func (x Int8x64) Len() int { return 64 }
 // LoadInt8x64 loads an Int8x64 from the 64-element int8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func LoadInt8x64(y *[64]int8) Int8x64
 // Store stores x into the 64-element int8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func (x Int8x64) Store(y *[64]int8)
 // Mask8x64 is a 512-bit SIMD vector of 64 int8.
 //
 // Its fields are a v512 tag (int8x64) and a [64]int8 (vals), i.e. 64 bytes
 // of element storage.
 // NOTE(review): the name suggests a per-lane mask; how lane values encode
 // true/false is not visible here — confirm before relying on it.
 type Mask8x64 struct {
 	int8x64 v512
 	vals    [64]int8
 }
 // Int16x32 is a 512-bit SIMD vector of 32 int16
 type Int16x32 struct {
 	int16x32 v512
@ -522,31 +547,6 @@ type Mask64x8 struct {
 	vals    [8]int64
 }
 // Int8x64 is a 512-bit SIMD vector of 64 int8.
 //
 // The int8x64 field carries the v512 tag type and vals is a [64]int8, which
 // accounts for the 64 bytes (512 bits) of element storage.
 type Int8x64 struct {
 	int8x64 v512
 	vals    [64]int8
 }
 // Len returns the number of elements in an Int8x64, which is always 64.
 func (x Int8x64) Len() int { return 64 }
 // LoadInt8x64 loads an Int8x64 from the 64-element int8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func LoadInt8x64(y *[64]int8) Int8x64
 // Store stores x into the 64-element int8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func (x Int8x64) Store(y *[64]int8)
 // Mask8x64 is a 512-bit SIMD vector of 64 int8.
 //
 // Its fields are a v512 tag (int8x64) and a [64]int8 (vals), i.e. 64 bytes
 // of element storage.
 // NOTE(review): the name suggests a per-lane mask; how lane values encode
 // true/false is not visible here — confirm before relying on it.
 type Mask8x64 struct {
 	int8x64 v512
 	vals    [64]int8
 }
 // Float32x16 is a 512-bit SIMD vector of 16 float32
 type Float32x16 struct {
 	float32x16 v512
@ -585,6 +585,25 @@ func LoadFloat64x8(y *[8]float64) Float64x8
 //go:noescape
 func (x Float64x8) Store(y *[8]float64)
 // Uint8x64 is a 512-bit SIMD vector of 64 uint8.
 //
 // The uint8x64 field carries the v512 tag type and vals is a [64]uint8, which
 // accounts for the 64 bytes (512 bits) of element storage.
 type Uint8x64 struct {
 	uint8x64 v512
 	vals     [64]uint8
 }
 // Len returns the number of elements in a Uint8x64, which is always 64.
 func (x Uint8x64) Len() int { return 64 }
 // LoadUint8x64 loads a Uint8x64 from the 64-element uint8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func LoadUint8x64(y *[64]uint8) Uint8x64
 // Store stores x into the 64-element uint8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func (x Uint8x64) Store(y *[64]uint8)
 // Uint16x32 is a 512-bit SIMD vector of 32 uint16
 type Uint16x32 struct {
 	uint16x32 v512
@ -641,22 +660,3 @@ func LoadUint64x8(y *[8]uint64) Uint64x8
 //
 //go:noescape
 func (x Uint64x8) Store(y *[8]uint64)
 // Uint8x64 is a 512-bit SIMD vector of 64 uint8.
 //
 // The uint8x64 field carries the v512 tag type and vals is a [64]uint8, which
 // accounts for the 64 bytes (512 bits) of element storage.
 type Uint8x64 struct {
 	uint8x64 v512
 	vals     [64]uint8
 }
 // Len returns the number of elements in a Uint8x64, which is always 64.
 func (x Uint8x64) Len() int { return 64 }
 // LoadUint8x64 loads a Uint8x64 from the 64-element uint8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func LoadUint8x64(y *[64]uint8) Uint8x64
 // Store stores x into the 64-element uint8 array pointed to by y.
 // It is declared without a Go body; the go:noescape directive below tells
 // the compiler that y does not escape.
 //
 //go:noescape
 func (x Uint8x64) Store(y *[64]uint8)