diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index 15ffbf66fa7..76ef42576d3 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -80,6 +80,22 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPADDQ128,
 		ssa.OpAMD64VPADDQ256,
 		ssa.OpAMD64VPADDQ512,
+		ssa.OpAMD64VHADDPS128,
+		ssa.OpAMD64VHADDPS256,
+		ssa.OpAMD64VHADDPD128,
+		ssa.OpAMD64VHADDPD256,
+		ssa.OpAMD64VPHADDW128,
+		ssa.OpAMD64VPHADDW256,
+		ssa.OpAMD64VPHADDD128,
+		ssa.OpAMD64VPHADDD256,
+		ssa.OpAMD64VPHADDSW128,
+		ssa.OpAMD64VPHADDSW256,
+		ssa.OpAMD64VPADDSB128,
+		ssa.OpAMD64VPADDSB256,
+		ssa.OpAMD64VPADDSB512,
+		ssa.OpAMD64VPADDSW128,
+		ssa.OpAMD64VPADDSW256,
+		ssa.OpAMD64VPADDSW512,
 		ssa.OpAMD64VADDSUBPS128,
 		ssa.OpAMD64VADDSUBPS256,
 		ssa.OpAMD64VADDSUBPD128,
@@ -189,12 +205,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VMULPD128,
 		ssa.OpAMD64VMULPD256,
 		ssa.OpAMD64VMULPD512,
-		ssa.OpAMD64VSCALEFPS128,
-		ssa.OpAMD64VSCALEFPS256,
-		ssa.OpAMD64VSCALEFPS512,
-		ssa.OpAMD64VSCALEFPD128,
-		ssa.OpAMD64VSCALEFPD256,
-		ssa.OpAMD64VSCALEFPD512,
+		ssa.OpAMD64VPMULLW128,
+		ssa.OpAMD64VPMULLW256,
+		ssa.OpAMD64VPMULLW512,
+		ssa.OpAMD64VPMULLD128,
+		ssa.OpAMD64VPMULLD256,
+		ssa.OpAMD64VPMULLD512,
+		ssa.OpAMD64VPMULLQ128,
+		ssa.OpAMD64VPMULLQ256,
+		ssa.OpAMD64VPMULLQ512,
 		ssa.OpAMD64VPMULDQ128,
 		ssa.OpAMD64VPMULDQ256,
 		ssa.OpAMD64VPMULDQ512,
@@ -207,15 +226,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULHUW128,
 		ssa.OpAMD64VPMULHUW256,
 		ssa.OpAMD64VPMULHUW512,
-		ssa.OpAMD64VPMULLW128,
-		ssa.OpAMD64VPMULLW256,
-		ssa.OpAMD64VPMULLW512,
-		ssa.OpAMD64VPMULLD128,
-		ssa.OpAMD64VPMULLD256,
-		ssa.OpAMD64VPMULLD512,
-		ssa.OpAMD64VPMULLQ128,
-		ssa.OpAMD64VPMULLQ256,
-		ssa.OpAMD64VPMULLQ512,
 		ssa.OpAMD64VPOR128,
 		ssa.OpAMD64VPOR256,
 		ssa.OpAMD64VPORD512,
@@ -223,22 +233,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMADDWD128,
 		ssa.OpAMD64VPMADDWD256,
 		ssa.OpAMD64VPMADDWD512,
-		ssa.OpAMD64VHADDPS128,
-		ssa.OpAMD64VHADDPS256,
-		ssa.OpAMD64VHADDPD128,
-		ssa.OpAMD64VHADDPD256,
-		ssa.OpAMD64VPHADDW128,
-		ssa.OpAMD64VPHADDW256,
-		ssa.OpAMD64VPHADDD128,
-		ssa.OpAMD64VPHADDD256,
-		ssa.OpAMD64VHSUBPS128,
-		ssa.OpAMD64VHSUBPS256,
-		ssa.OpAMD64VHSUBPD128,
-		ssa.OpAMD64VHSUBPD256,
-		ssa.OpAMD64VPHSUBW128,
-		ssa.OpAMD64VPHSUBW256,
-		ssa.OpAMD64VPHSUBD128,
-		ssa.OpAMD64VPHSUBD256,
 		ssa.OpAMD64VPERMB128,
 		ssa.OpAMD64VPERMB256,
 		ssa.OpAMD64VPERMB512,
@@ -265,25 +259,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPRORVQ128,
 		ssa.OpAMD64VPRORVQ256,
 		ssa.OpAMD64VPRORVQ512,
-		ssa.OpAMD64VPADDSB128,
-		ssa.OpAMD64VPADDSB256,
-		ssa.OpAMD64VPADDSB512,
-		ssa.OpAMD64VPADDSW128,
-		ssa.OpAMD64VPADDSW256,
-		ssa.OpAMD64VPADDSW512,
-		ssa.OpAMD64VPHADDSW128,
-		ssa.OpAMD64VPHADDSW256,
-		ssa.OpAMD64VPHSUBSW128,
-		ssa.OpAMD64VPHSUBSW256,
-		ssa.OpAMD64VPSUBSB128,
-		ssa.OpAMD64VPSUBSB256,
-		ssa.OpAMD64VPSUBSB512,
-		ssa.OpAMD64VPSUBSW128,
-		ssa.OpAMD64VPSUBSW256,
-		ssa.OpAMD64VPSUBSW512,
 		ssa.OpAMD64VPMADDUBSW128,
 		ssa.OpAMD64VPMADDUBSW256,
 		ssa.OpAMD64VPMADDUBSW512,
+		ssa.OpAMD64VSCALEFPS128,
+		ssa.OpAMD64VSCALEFPS256,
+		ssa.OpAMD64VSCALEFPS512,
+		ssa.OpAMD64VSCALEFPD128,
+		ssa.OpAMD64VSCALEFPD256,
+		ssa.OpAMD64VSCALEFPD512,
 		ssa.OpAMD64VPSLLVW128,
 		ssa.OpAMD64VPSLLVW256,
 		ssa.OpAMD64VPSLLVW512,
@@ -335,6 +319,22 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPSUBQ128,
 		ssa.OpAMD64VPSUBQ256,
 		ssa.OpAMD64VPSUBQ512,
+		ssa.OpAMD64VHSUBPS128,
+		ssa.OpAMD64VHSUBPS256,
+		ssa.OpAMD64VHSUBPD128,
+		ssa.OpAMD64VHSUBPD256,
+		ssa.OpAMD64VPHSUBW128,
+		ssa.OpAMD64VPHSUBW256,
+		ssa.OpAMD64VPHSUBD128,
+		ssa.OpAMD64VPHSUBD256,
+		ssa.OpAMD64VPHSUBSW128,
+		ssa.OpAMD64VPHSUBSW256,
+		ssa.OpAMD64VPSUBSB128,
+		ssa.OpAMD64VPSUBSB256,
+		ssa.OpAMD64VPSUBSB512,
+		ssa.OpAMD64VPSUBSW128,
+		ssa.OpAMD64VPSUBSW256,
+		ssa.OpAMD64VPSUBSW512,
 		ssa.OpAMD64VPXOR128,
 		ssa.OpAMD64VPXOR256,
 		ssa.OpAMD64VPXORD512,
@@ -369,6 +369,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPADDQMasked128,
 		ssa.OpAMD64VPADDQMasked256,
 		ssa.OpAMD64VPADDQMasked512,
+		ssa.OpAMD64VPADDSBMasked128,
+		ssa.OpAMD64VPADDSBMasked256,
+		ssa.OpAMD64VPADDSBMasked512,
+		ssa.OpAMD64VPADDSWMasked128,
+		ssa.OpAMD64VPADDSWMasked256,
+		ssa.OpAMD64VPADDSWMasked512,
 		ssa.OpAMD64VPANDDMasked128,
 		ssa.OpAMD64VPANDDMasked256,
 		ssa.OpAMD64VPANDDMasked512,
@@ -456,12 +462,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMINUQMasked128,
 		ssa.OpAMD64VPMINUQMasked256,
 		ssa.OpAMD64VPMINUQMasked512,
-		ssa.OpAMD64VSCALEFPSMasked128,
-		ssa.OpAMD64VSCALEFPSMasked256,
-		ssa.OpAMD64VSCALEFPSMasked512,
-		ssa.OpAMD64VSCALEFPDMasked128,
-		ssa.OpAMD64VSCALEFPDMasked256,
-		ssa.OpAMD64VSCALEFPDMasked512,
 		ssa.OpAMD64VPMULDQMasked128,
 		ssa.OpAMD64VPMULDQMasked256,
 		ssa.OpAMD64VPMULDQMasked512,
@@ -474,6 +474,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULHUWMasked128,
 		ssa.OpAMD64VPMULHUWMasked256,
 		ssa.OpAMD64VPMULHUWMasked512,
+		ssa.OpAMD64VMULPSMasked128,
+		ssa.OpAMD64VMULPSMasked256,
+		ssa.OpAMD64VMULPSMasked512,
+		ssa.OpAMD64VMULPDMasked128,
+		ssa.OpAMD64VMULPDMasked256,
+		ssa.OpAMD64VMULPDMasked512,
 		ssa.OpAMD64VPMULLWMasked128,
 		ssa.OpAMD64VPMULLWMasked256,
 		ssa.OpAMD64VPMULLWMasked512,
@@ -483,12 +489,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULLQMasked128,
 		ssa.OpAMD64VPMULLQMasked256,
 		ssa.OpAMD64VPMULLQMasked512,
-		ssa.OpAMD64VMULPSMasked128,
-		ssa.OpAMD64VMULPSMasked256,
-		ssa.OpAMD64VMULPSMasked512,
-		ssa.OpAMD64VMULPDMasked128,
-		ssa.OpAMD64VMULPDMasked256,
-		ssa.OpAMD64VMULPDMasked512,
 		ssa.OpAMD64VPORDMasked128,
 		ssa.OpAMD64VPORDMasked256,
 		ssa.OpAMD64VPORDMasked512,
@@ -524,21 +524,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPRORVQMasked128,
 		ssa.OpAMD64VPRORVQMasked256,
 		ssa.OpAMD64VPRORVQMasked512,
-		ssa.OpAMD64VPADDSBMasked128,
-		ssa.OpAMD64VPADDSBMasked256,
-		ssa.OpAMD64VPADDSBMasked512,
-		ssa.OpAMD64VPADDSWMasked128,
-		ssa.OpAMD64VPADDSWMasked256,
-		ssa.OpAMD64VPADDSWMasked512,
-		ssa.OpAMD64VPSUBSBMasked128,
-		ssa.OpAMD64VPSUBSBMasked256,
-		ssa.OpAMD64VPSUBSBMasked512,
-		ssa.OpAMD64VPSUBSWMasked128,
-		ssa.OpAMD64VPSUBSWMasked256,
-		ssa.OpAMD64VPSUBSWMasked512,
 		ssa.OpAMD64VPMADDUBSWMasked128,
 		ssa.OpAMD64VPMADDUBSWMasked256,
 		ssa.OpAMD64VPMADDUBSWMasked512,
+		ssa.OpAMD64VSCALEFPSMasked128,
+		ssa.OpAMD64VSCALEFPSMasked256,
+		ssa.OpAMD64VSCALEFPSMasked512,
+		ssa.OpAMD64VSCALEFPDMasked128,
+		ssa.OpAMD64VSCALEFPDMasked256,
+		ssa.OpAMD64VSCALEFPDMasked512,
 		ssa.OpAMD64VPSLLVWMasked128,
 		ssa.OpAMD64VPSLLVWMasked256,
 		ssa.OpAMD64VPSLLVWMasked512,
@@ -584,6 +578,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPSUBQMasked128,
 		ssa.OpAMD64VPSUBQMasked256,
 		ssa.OpAMD64VPSUBQMasked512,
+		ssa.OpAMD64VPSUBSBMasked128,
+		ssa.OpAMD64VPSUBSBMasked256,
+		ssa.OpAMD64VPSUBSBMasked512,
+		ssa.OpAMD64VPSUBSWMasked128,
+		ssa.OpAMD64VPSUBSWMasked256,
+		ssa.OpAMD64VPSUBSWMasked512,
 		ssa.OpAMD64VPXORDMasked128,
 		ssa.OpAMD64VPXORDMasked256,
 		ssa.OpAMD64VPXORDMasked512,
@@ -1085,6 +1085,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPADDQMasked128,
 		ssa.OpAMD64VPADDQMasked256,
 		ssa.OpAMD64VPADDQMasked512,
+		ssa.OpAMD64VPADDSBMasked128,
+		ssa.OpAMD64VPADDSBMasked256,
+		ssa.OpAMD64VPADDSBMasked512,
+		ssa.OpAMD64VPADDSWMasked128,
+		ssa.OpAMD64VPADDSWMasked256,
+		ssa.OpAMD64VPADDSWMasked512,
 		ssa.OpAMD64VPANDDMasked128,
 		ssa.OpAMD64VPANDDMasked256,
 		ssa.OpAMD64VPANDDMasked512,
@@ -1121,6 +1127,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VRNDSCALEPDMasked128,
 		ssa.OpAMD64VRNDSCALEPDMasked256,
 		ssa.OpAMD64VRNDSCALEPDMasked512,
+		ssa.OpAMD64VREDUCEPSMasked128,
+		ssa.OpAMD64VREDUCEPSMasked256,
+		ssa.OpAMD64VREDUCEPSMasked512,
+		ssa.OpAMD64VREDUCEPDMasked128,
+		ssa.OpAMD64VREDUCEPDMasked256,
+		ssa.OpAMD64VREDUCEPDMasked512,
 		ssa.OpAMD64VCOMPRESSPSMasked128,
 		ssa.OpAMD64VCOMPRESSPSMasked256,
 		ssa.OpAMD64VCOMPRESSPSMasked512,
@@ -1145,12 +1157,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VCVTPS2UDQMasked128,
 		ssa.OpAMD64VCVTPS2UDQMasked256,
 		ssa.OpAMD64VCVTPS2UDQMasked512,
-		ssa.OpAMD64VREDUCEPSMasked128,
-		ssa.OpAMD64VREDUCEPSMasked256,
-		ssa.OpAMD64VREDUCEPSMasked512,
-		ssa.OpAMD64VREDUCEPDMasked128,
-		ssa.OpAMD64VREDUCEPDMasked256,
-		ssa.OpAMD64VREDUCEPDMasked512,
 		ssa.OpAMD64VDIVPSMasked128,
 		ssa.OpAMD64VDIVPSMasked256,
 		ssa.OpAMD64VDIVPSMasked512,
@@ -1244,12 +1250,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMINUQMasked128,
 		ssa.OpAMD64VPMINUQMasked256,
 		ssa.OpAMD64VPMINUQMasked512,
-		ssa.OpAMD64VSCALEFPSMasked128,
-		ssa.OpAMD64VSCALEFPSMasked256,
-		ssa.OpAMD64VSCALEFPSMasked512,
-		ssa.OpAMD64VSCALEFPDMasked128,
-		ssa.OpAMD64VSCALEFPDMasked256,
-		ssa.OpAMD64VSCALEFPDMasked512,
 		ssa.OpAMD64VPMULDQMasked128,
 		ssa.OpAMD64VPMULDQMasked256,
 		ssa.OpAMD64VPMULDQMasked512,
@@ -1262,6 +1262,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULHUWMasked128,
 		ssa.OpAMD64VPMULHUWMasked256,
 		ssa.OpAMD64VPMULHUWMasked512,
+		ssa.OpAMD64VMULPSMasked128,
+		ssa.OpAMD64VMULPSMasked256,
+		ssa.OpAMD64VMULPSMasked512,
+		ssa.OpAMD64VMULPDMasked128,
+		ssa.OpAMD64VMULPDMasked256,
+		ssa.OpAMD64VMULPDMasked512,
 		ssa.OpAMD64VPMULLWMasked128,
 		ssa.OpAMD64VPMULLWMasked256,
 		ssa.OpAMD64VPMULLWMasked512,
@@ -1271,12 +1277,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULLQMasked128,
 		ssa.OpAMD64VPMULLQMasked256,
 		ssa.OpAMD64VPMULLQMasked512,
-		ssa.OpAMD64VMULPSMasked128,
-		ssa.OpAMD64VMULPSMasked256,
-		ssa.OpAMD64VMULPSMasked512,
-		ssa.OpAMD64VMULPDMasked128,
-		ssa.OpAMD64VMULPDMasked256,
-		ssa.OpAMD64VMULPDMasked512,
 		ssa.OpAMD64VPORDMasked128,
 		ssa.OpAMD64VPORDMasked256,
 		ssa.OpAMD64VPORDMasked512,
@@ -1357,24 +1357,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPDPWSSDSMasked128,
 		ssa.OpAMD64VPDPWSSDSMasked256,
 		ssa.OpAMD64VPDPWSSDSMasked512,
-		ssa.OpAMD64VPADDSBMasked128,
-		ssa.OpAMD64VPADDSBMasked256,
-		ssa.OpAMD64VPADDSBMasked512,
-		ssa.OpAMD64VPADDSWMasked128,
-		ssa.OpAMD64VPADDSWMasked256,
-		ssa.OpAMD64VPADDSWMasked512,
-		ssa.OpAMD64VPSUBSBMasked128,
-		ssa.OpAMD64VPSUBSBMasked256,
-		ssa.OpAMD64VPSUBSBMasked512,
-		ssa.OpAMD64VPSUBSWMasked128,
-		ssa.OpAMD64VPSUBSWMasked256,
-		ssa.OpAMD64VPSUBSWMasked512,
 		ssa.OpAMD64VPMADDUBSWMasked128,
 		ssa.OpAMD64VPMADDUBSWMasked256,
 		ssa.OpAMD64VPMADDUBSWMasked512,
 		ssa.OpAMD64VPDPBUSDSMasked128,
 		ssa.OpAMD64VPDPBUSDSMasked256,
 		ssa.OpAMD64VPDPBUSDSMasked512,
+		ssa.OpAMD64VSCALEFPSMasked128,
+		ssa.OpAMD64VSCALEFPSMasked256,
+		ssa.OpAMD64VSCALEFPSMasked512,
+		ssa.OpAMD64VSCALEFPDMasked128,
+		ssa.OpAMD64VSCALEFPDMasked256,
+		ssa.OpAMD64VSCALEFPDMasked512,
 		ssa.OpAMD64VPSHLDWMasked128,
 		ssa.OpAMD64VPSHLDWMasked256,
 		ssa.OpAMD64VPSHLDWMasked512,
@@ -1489,6 +1483,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPSUBQMasked128,
 		ssa.OpAMD64VPSUBQMasked256,
 		ssa.OpAMD64VPSUBQMasked512,
+		ssa.OpAMD64VPSUBSBMasked128,
+		ssa.OpAMD64VPSUBSBMasked256,
+		ssa.OpAMD64VPSUBSBMasked512,
+		ssa.OpAMD64VPSUBSWMasked128,
+		ssa.OpAMD64VPSUBSWMasked256,
+		ssa.OpAMD64VPSUBSWMasked512,
 		ssa.OpAMD64VPDPBUSDMasked128,
 		ssa.OpAMD64VPDPBUSDMasked256,
 		ssa.OpAMD64VPDPBUSDMasked512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 1d54cfcdbdd..060f220c7de 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -90,6 +90,44 @@
 (AddMaskedUint64x2 x y mask) => (VPADDQMasked128 x y (VPMOVVec64x2ToM mask))
 (AddMaskedUint64x4 x y mask) => (VPADDQMasked256 x y (VPMOVVec64x4ToM mask))
 (AddMaskedUint64x8 x y mask) => (VPADDQMasked512 x y (VPMOVVec64x8ToM mask))
+(AddPairsFloat32x4 ...) => (VHADDPS128 ...)
+(AddPairsFloat32x8 ...) => (VHADDPS256 ...)
+(AddPairsFloat64x2 ...) => (VHADDPD128 ...)
+(AddPairsFloat64x4 ...) => (VHADDPD256 ...)
+(AddPairsInt16x8 ...) => (VPHADDW128 ...)
+(AddPairsInt16x16 ...) => (VPHADDW256 ...)
+(AddPairsInt32x4 ...) => (VPHADDD128 ...)
+(AddPairsInt32x8 ...) => (VPHADDD256 ...)
+(AddPairsUint16x8 ...) => (VPHADDW128 ...)
+(AddPairsUint16x16 ...) => (VPHADDW256 ...)
+(AddPairsUint32x4 ...) => (VPHADDD128 ...)
+(AddPairsUint32x8 ...) => (VPHADDD256 ...)
+(AddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...)
+(AddPairsSaturatedInt16x16 ...) => (VPHADDSW256 ...)
+(AddSaturatedInt8x16 ...) => (VPADDSB128 ...)
+(AddSaturatedInt8x32 ...) => (VPADDSB256 ...)
+(AddSaturatedInt8x64 ...) => (VPADDSB512 ...)
+(AddSaturatedInt16x8 ...) => (VPADDSW128 ...)
+(AddSaturatedInt16x16 ...) => (VPADDSW256 ...)
+(AddSaturatedInt16x32 ...) => (VPADDSW512 ...)
+(AddSaturatedUint8x16 ...) => (VPADDSB128 ...)
+(AddSaturatedUint8x32 ...) => (VPADDSB256 ...)
+(AddSaturatedUint8x64 ...) => (VPADDSB512 ...)
+(AddSaturatedUint16x8 ...) => (VPADDSW128 ...)
+(AddSaturatedUint16x16 ...) => (VPADDSW256 ...)
+(AddSaturatedUint16x32 ...) => (VPADDSW512 ...)
+(AddSaturatedMaskedInt8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask))
+(AddSaturatedMaskedInt8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask))
+(AddSaturatedMaskedInt8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask))
+(AddSaturatedMaskedInt16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask))
+(AddSaturatedMaskedInt16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask))
+(AddSaturatedMaskedInt16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask))
+(AddSaturatedMaskedUint8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask))
+(AddSaturatedMaskedUint8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask))
+(AddSaturatedMaskedUint8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask))
+(AddSaturatedMaskedUint16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask))
+(AddSaturatedMaskedUint16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask))
+(AddSaturatedMaskedUint16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask))
 (AddSubFloat32x4 ...) => (VADDSUBPS128 ...)
 (AddSubFloat32x8 ...) => (VADDSUBPS256 ...)
 (AddSubFloat64x2 ...) => (VADDSUBPD128 ...)
@@ -206,18 +244,30 @@
 (CeilFloat32x8 x) => (VROUNDPS256 [2] x)
 (CeilFloat64x2 x) => (VROUNDPD128 [2] x)
 (CeilFloat64x4 x) => (VROUNDPD256 [2] x)
-(CeilWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+2] x)
-(CeilWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+2] x)
-(CeilWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+2] x)
-(CeilWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+2] x)
-(CeilWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+2] x)
-(CeilWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+2] x)
-(CeilWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask))
-(CeilWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask))
-(CeilWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask))
-(CeilWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask))
-(CeilWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask))
-(CeilWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask))
+(CeilScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+2] x)
+(CeilScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+2] x)
+(CeilScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+2] x)
+(CeilScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+2] x)
+(CeilScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+2] x)
+(CeilScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+2] x)
+(CeilScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask))
+(CeilScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask))
+(CeilScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask))
+(CeilScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask))
+(CeilScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask))
+(CeilScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask))
+(CeilScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+2] x)
+(CeilScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+2] x)
+(CeilScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+2] x)
+(CeilScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+2] x)
+(CeilScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+2] x)
+(CeilScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+2] x)
+(CeilScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask))
+(CeilScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask))
+(CeilScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask))
+(CeilScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask))
+(CeilScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask))
+(CeilScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask))
 (CompressFloat32x4 x mask) => (VCOMPRESSPSMasked128 x (VPMOVVec32x4ToM mask))
 (CompressFloat32x8 x mask) => (VCOMPRESSPSMasked256 x (VPMOVVec32x8ToM mask))
 (CompressFloat32x16 x mask) => (VCOMPRESSPSMasked512 x (VPMOVVec32x16ToM mask))
@@ -260,54 +310,6 @@
 (ConvertToUint32MaskedFloat32x4 x mask) => (VCVTPS2UDQMasked128 x (VPMOVVec32x4ToM mask))
 (ConvertToUint32MaskedFloat32x8 x mask) => (VCVTPS2UDQMasked256 x (VPMOVVec32x8ToM mask))
 (ConvertToUint32MaskedFloat32x16 x mask) => (VCVTPS2UDQMasked512 x (VPMOVVec32x16ToM mask))
-(DiffWithCeilWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+2] x)
-(DiffWithCeilWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+2] x)
-(DiffWithCeilWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+2] x)
-(DiffWithCeilWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+2] x)
-(DiffWithCeilWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+2] x)
-(DiffWithCeilWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+2] x)
-(DiffWithCeilWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask))
-(DiffWithCeilWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask))
-(DiffWithCeilWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask))
-(DiffWithCeilWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask))
-(DiffWithCeilWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask))
-(DiffWithCeilWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask))
-(DiffWithFloorWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+1] x)
-(DiffWithFloorWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+1] x)
-(DiffWithFloorWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+1] x)
-(DiffWithFloorWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+1] x)
-(DiffWithFloorWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+1] x)
-(DiffWithFloorWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+1] x)
-(DiffWithFloorWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask))
-(DiffWithFloorWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask))
-(DiffWithFloorWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask))
-(DiffWithFloorWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask))
-(DiffWithFloorWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask))
-(DiffWithFloorWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask))
-(DiffWithRoundWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+0] x)
-(DiffWithRoundWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+0] x)
-(DiffWithRoundWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+0] x)
-(DiffWithRoundWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+0] x)
-(DiffWithRoundWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+0] x)
-(DiffWithRoundWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+0] x)
-(DiffWithRoundWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask))
-(DiffWithRoundWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask))
-(DiffWithRoundWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask))
-(DiffWithRoundWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask))
-(DiffWithRoundWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask))
-(DiffWithRoundWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask))
-(DiffWithTruncWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+3] x)
-(DiffWithTruncWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+3] x)
-(DiffWithTruncWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+3] x)
-(DiffWithTruncWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+3] x)
-(DiffWithTruncWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+3] x)
-(DiffWithTruncWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+3] x)
-(DiffWithTruncWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask))
-(DiffWithTruncWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask))
-(DiffWithTruncWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask))
-(DiffWithTruncWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask))
-(DiffWithTruncWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask))
-(DiffWithTruncWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask))
 (DivFloat32x4 ...) => (VDIVPS128 ...)
 (DivFloat32x8 ...) => (VDIVPS256 ...)
 (DivFloat32x16 ...) => (VDIVPS512 ...)
@@ -387,18 +389,30 @@
 (FloorFloat32x8 x) => (VROUNDPS256 [1] x)
 (FloorFloat64x2 x) => (VROUNDPD128 [1] x)
 (FloorFloat64x4 x) => (VROUNDPD256 [1] x)
-(FloorWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+1] x)
-(FloorWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+1] x)
-(FloorWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+1] x)
-(FloorWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+1] x)
-(FloorWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+1] x)
-(FloorWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+1] x)
-(FloorWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask))
-(FloorWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask))
-(FloorWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask))
-(FloorWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask))
-(FloorWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask))
-(FloorWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask))
+(FloorScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+1] x)
+(FloorScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+1] x)
+(FloorScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+1] x)
+(FloorScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+1] x)
+(FloorScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+1] x)
+(FloorScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+1] x)
+(FloorScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask))
+(FloorScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask))
+(FloorScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask))
+(FloorScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask))
+(FloorScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask))
+(FloorScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask))
+(FloorScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+1] x)
+(FloorScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+1] x)
+(FloorScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+1] x)
+(FloorScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+1] x)
+(FloorScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+1] x)
+(FloorScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+1] x)
+(FloorScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask))
+(FloorScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask))
+(FloorScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask))
+(FloorScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask))
+(FloorScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask))
+(FloorScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask))
 (FusedMultiplyAddFloat32x4 ...) => (VFMADD213PS128 ...)
 (FusedMultiplyAddFloat32x8 ...) => (VFMADD213PS256 ...)
 (FusedMultiplyAddFloat32x16 ...) => (VFMADD213PS512 ...)
@@ -849,18 +863,15 @@
 (MulFloat64x2 ...) => (VMULPD128 ...)
 (MulFloat64x4 ...) => (VMULPD256 ...)
 (MulFloat64x8 ...) => (VMULPD512 ...)
-(MulByPowOf2Float32x4 ...) => (VSCALEFPS128 ...)
-(MulByPowOf2Float32x8 ...) => (VSCALEFPS256 ...)
-(MulByPowOf2Float32x16 ...) => (VSCALEFPS512 ...)
-(MulByPowOf2Float64x2 ...) => (VSCALEFPD128 ...)
-(MulByPowOf2Float64x4 ...) => (VSCALEFPD256 ...)
-(MulByPowOf2Float64x8 ...) => (VSCALEFPD512 ...)
-(MulByPowOf2MaskedFloat32x4 x y mask) => (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM mask))
-(MulByPowOf2MaskedFloat32x8 x y mask) => (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM mask))
-(MulByPowOf2MaskedFloat32x16 x y mask) => (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM mask))
-(MulByPowOf2MaskedFloat64x2 x y mask) => (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM mask))
-(MulByPowOf2MaskedFloat64x4 x y mask) => (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM mask))
-(MulByPowOf2MaskedFloat64x8 x y mask) => (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM mask))
+(MulInt16x8 ...) => (VPMULLW128 ...)
+(MulInt16x16 ...) => (VPMULLW256 ...)
+(MulInt16x32 ...) => (VPMULLW512 ...)
+(MulInt32x4 ...) => (VPMULLD128 ...)
+(MulInt32x8 ...) => (VPMULLD256 ...)
+(MulInt32x16 ...) => (VPMULLD512 ...)
+(MulInt64x2 ...) => (VPMULLQ128 ...)
+(MulInt64x4 ...) => (VPMULLQ256 ...)
+(MulInt64x8 ...) => (VPMULLQ512 ...)
 (MulEvenWidenInt32x4 ...) => (VPMULDQ128 ...)
 (MulEvenWidenInt32x8 ...) => (VPMULDQ256 ...)
 (MulEvenWidenInt64x2 ...) => (VPMULDQ128 ...)
@@ -889,30 +900,21 @@
 (MulHighMaskedUint16x8 x y mask) => (VPMULHUWMasked128 x y (VPMOVVec16x8ToM mask))
 (MulHighMaskedUint16x16 x y mask) => (VPMULHUWMasked256 x y (VPMOVVec16x16ToM mask))
 (MulHighMaskedUint16x32 x y mask) => (VPMULHUWMasked512 x y (VPMOVVec16x32ToM mask))
-(MulLowInt16x8 ...) => (VPMULLW128 ...)
-(MulLowInt16x16 ...) => (VPMULLW256 ...)
-(MulLowInt16x32 ...) => (VPMULLW512 ...)
-(MulLowInt32x4 ...) => (VPMULLD128 ...)
-(MulLowInt32x8 ...) => (VPMULLD256 ...)
-(MulLowInt32x16 ...) => (VPMULLD512 ...)
-(MulLowInt64x2 ...) => (VPMULLQ128 ...)
-(MulLowInt64x4 ...) => (VPMULLQ256 ...)
-(MulLowInt64x8 ...) => (VPMULLQ512 ...)
-(MulLowMaskedInt16x8 x y mask) => (VPMULLWMasked128 x y (VPMOVVec16x8ToM mask))
-(MulLowMaskedInt16x16 x y mask) => (VPMULLWMasked256 x y (VPMOVVec16x16ToM mask))
-(MulLowMaskedInt16x32 x y mask) => (VPMULLWMasked512 x y (VPMOVVec16x32ToM mask))
-(MulLowMaskedInt32x4 x y mask) => (VPMULLDMasked128 x y (VPMOVVec32x4ToM mask))
-(MulLowMaskedInt32x8 x y mask) => (VPMULLDMasked256 x y (VPMOVVec32x8ToM mask))
-(MulLowMaskedInt32x16 x y mask) => (VPMULLDMasked512 x y (VPMOVVec32x16ToM mask))
-(MulLowMaskedInt64x2 x y mask) => (VPMULLQMasked128 x y (VPMOVVec64x2ToM mask))
-(MulLowMaskedInt64x4 x y mask) => (VPMULLQMasked256 x y (VPMOVVec64x4ToM mask))
-(MulLowMaskedInt64x8 x y mask) => (VPMULLQMasked512 x y (VPMOVVec64x8ToM mask))
 (MulMaskedFloat32x4 x y mask) => (VMULPSMasked128 x y (VPMOVVec32x4ToM mask))
 (MulMaskedFloat32x8 x y mask) => (VMULPSMasked256 x y (VPMOVVec32x8ToM mask))
 (MulMaskedFloat32x16 x y mask) => (VMULPSMasked512 x y (VPMOVVec32x16ToM mask))
 (MulMaskedFloat64x2 x y mask) => (VMULPDMasked128 x y (VPMOVVec64x2ToM mask))
 (MulMaskedFloat64x4 x y mask) => (VMULPDMasked256 x y (VPMOVVec64x4ToM mask))
 (MulMaskedFloat64x8 x y mask) => (VMULPDMasked512 x y (VPMOVVec64x8ToM mask))
+(MulMaskedInt16x8 x y mask) => (VPMULLWMasked128 x y (VPMOVVec16x8ToM mask))
+(MulMaskedInt16x16 x y mask) => (VPMULLWMasked256 x y (VPMOVVec16x16ToM mask))
+(MulMaskedInt16x32 x y mask) => (VPMULLWMasked512 x y (VPMOVVec16x32ToM mask))
+(MulMaskedInt32x4 x y mask) => (VPMULLDMasked128 x y (VPMOVVec32x4ToM mask))
+(MulMaskedInt32x8 x y mask) => (VPMULLDMasked256 x y (VPMOVVec32x8ToM mask))
+(MulMaskedInt32x16 x y mask) => (VPMULLDMasked512 x y (VPMOVVec32x16ToM mask))
+(MulMaskedInt64x2 x y mask) => (VPMULLQMasked128 x y (VPMOVVec64x2ToM mask))
+(MulMaskedInt64x4 x y mask) => (VPMULLQMasked256 x y (VPMOVVec64x4ToM mask))
+(MulMaskedInt64x8 x y mask) => (VPMULLQMasked512 x y (VPMOVVec64x8ToM mask))
 (NotEqualFloat32x4 x y) => (VCMPPS128 [4] x y)
 (NotEqualFloat32x8 x y) => (VCMPPS256 [4] x y)
 (NotEqualFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [4] x y))
@@ -1015,30 +1017,6 @@
 (PairDotProdMaskedInt16x8 x y mask) => (VPMADDWDMasked128 x y (VPMOVVec16x8ToM mask))
 (PairDotProdMaskedInt16x16 x y mask) => (VPMADDWDMasked256 x y (VPMOVVec16x16ToM mask))
 (PairDotProdMaskedInt16x32 x y mask) => (VPMADDWDMasked512 x y (VPMOVVec16x32ToM mask))
-(PairwiseAddFloat32x4 ...) => (VHADDPS128 ...)
-(PairwiseAddFloat32x8 ...) => (VHADDPS256 ...)
-(PairwiseAddFloat64x2 ...) => (VHADDPD128 ...)
-(PairwiseAddFloat64x4 ...) => (VHADDPD256 ...)
-(PairwiseAddInt16x8 ...) => (VPHADDW128 ...)
-(PairwiseAddInt16x16 ...) => (VPHADDW256 ...)
-(PairwiseAddInt32x4 ...) => (VPHADDD128 ...)
-(PairwiseAddInt32x8 ...) => (VPHADDD256 ...)
-(PairwiseAddUint16x8 ...) => (VPHADDW128 ...)
-(PairwiseAddUint16x16 ...) => (VPHADDW256 ...)
-(PairwiseAddUint32x4 ...) => (VPHADDD128 ...)
-(PairwiseAddUint32x8 ...) => (VPHADDD256 ...)
-(PairwiseSubFloat32x4 ...) => (VHSUBPS128 ...)
-(PairwiseSubFloat32x8 ...) => (VHSUBPS256 ...)
-(PairwiseSubFloat64x2 ...) => (VHSUBPD128 ...)
-(PairwiseSubFloat64x4 ...) => (VHSUBPD256 ...)
-(PairwiseSubInt16x8 ...) => (VPHSUBW128 ...)
-(PairwiseSubInt16x16 ...) => (VPHSUBW256 ...)
-(PairwiseSubInt32x4 ...) => (VPHSUBD128 ...)
-(PairwiseSubInt32x8 ...) => (VPHSUBD256 ...)
-(PairwiseSubUint16x8 ...) => (VPHSUBW128 ...)
-(PairwiseSubUint16x16 ...) => (VPHSUBW256 ...)
-(PairwiseSubUint32x4 ...) => (VPHSUBD128 ...)
-(PairwiseSubUint32x8 ...) => (VPHSUBD256 ...)
 (PermuteFloat32x8 ...) => (VPERMPS256 ...)
 (PermuteFloat32x16 ...) => (VPERMPS512 ...)
 (PermuteFloat64x4 ...) => (VPERMPD256 ...)
@@ -1295,76 +1273,36 @@
 (RoundFloat32x8 x) => (VROUNDPS256 [0] x)
 (RoundFloat64x2 x) => (VROUNDPD128 [0] x)
 (RoundFloat64x4 x) => (VROUNDPD256 [0] x)
-(RoundWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+0] x)
-(RoundWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+0] x)
-(RoundWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+0] x)
-(RoundWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+0] x)
-(RoundWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+0] x)
-(RoundWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+0] x)
-(RoundWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask))
-(RoundWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask))
-(RoundWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask))
-(RoundWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask))
-(RoundWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask))
-(RoundWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask))
-(SaturatedAddInt8x16 ...) => (VPADDSB128 ...)
-(SaturatedAddInt8x32 ...) => (VPADDSB256 ...)
-(SaturatedAddInt8x64 ...) => (VPADDSB512 ...)
-(SaturatedAddInt16x8 ...) => (VPADDSW128 ...)
-(SaturatedAddInt16x16 ...) => (VPADDSW256 ...)
-(SaturatedAddInt16x32 ...) => (VPADDSW512 ...)
-(SaturatedAddUint8x16 ...) => (VPADDSB128 ...)
-(SaturatedAddUint8x32 ...) => (VPADDSB256 ...)
-(SaturatedAddUint8x64 ...) => (VPADDSB512 ...)
-(SaturatedAddUint16x8 ...) => (VPADDSW128 ...)
-(SaturatedAddUint16x16 ...) => (VPADDSW256 ...)
-(SaturatedAddUint16x32 ...) => (VPADDSW512 ...)
+(RoundScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+0] x)
+(RoundScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+0] x)
+(RoundScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+0] x)
+(RoundScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+0] x)
+(RoundScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+0] x)
+(RoundScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+0] x)
+(RoundScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask))
+(RoundScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask))
+(RoundScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask))
+(RoundScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask))
+(RoundScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask))
+(RoundScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask))
+(RoundScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+0] x)
+(RoundScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+0] x)
+(RoundScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+0] x)
+(RoundScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+0] x)
+(RoundScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+0] x)
+(RoundScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+0] x)
+(RoundScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask))
+(RoundScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask))
+(RoundScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask))
+(RoundScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask))
+(RoundScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask))
+(RoundScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask))
 (SaturatedAddDotProdInt32x4 ...) => (VPDPWSSDS128 ...)
 (SaturatedAddDotProdInt32x8 ...) => (VPDPWSSDS256 ...)
 (SaturatedAddDotProdInt32x16 ...) => (VPDPWSSDS512 ...)
 (SaturatedAddDotProdMaskedInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM mask))
 (SaturatedAddDotProdMaskedInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM mask))
 (SaturatedAddDotProdMaskedInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM mask))
-(SaturatedAddMaskedInt8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask))
-(SaturatedAddMaskedInt8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask))
-(SaturatedAddMaskedInt8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask))
-(SaturatedAddMaskedInt16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask))
-(SaturatedAddMaskedInt16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask))
-(SaturatedAddMaskedInt16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask))
-(SaturatedAddMaskedUint8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask))
-(SaturatedAddMaskedUint8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask))
-(SaturatedAddMaskedUint8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask))
-(SaturatedAddMaskedUint16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask))
-(SaturatedAddMaskedUint16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask))
-(SaturatedAddMaskedUint16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask))
-(SaturatedPairwiseAddInt16x8 ...) => (VPHADDSW128 ...)
-(SaturatedPairwiseAddInt16x16 ...) => (VPHADDSW256 ...)
-(SaturatedPairwiseSubInt16x8 ...) => (VPHSUBSW128 ...)
-(SaturatedPairwiseSubInt16x16 ...) => (VPHSUBSW256 ...)
-(SaturatedSubInt8x16 ...) => (VPSUBSB128 ...)
-(SaturatedSubInt8x32 ...) => (VPSUBSB256 ...)
-(SaturatedSubInt8x64 ...) => (VPSUBSB512 ...)
-(SaturatedSubInt16x8 ...) => (VPSUBSW128 ...)
-(SaturatedSubInt16x16 ...) => (VPSUBSW256 ...)
-(SaturatedSubInt16x32 ...) => (VPSUBSW512 ...)
-(SaturatedSubUint8x16 ...) => (VPSUBSB128 ...)
-(SaturatedSubUint8x32 ...) => (VPSUBSB256 ...)
-(SaturatedSubUint8x64 ...) => (VPSUBSB512 ...)
-(SaturatedSubUint16x8 ...) => (VPSUBSW128 ...)
-(SaturatedSubUint16x16 ...) => (VPSUBSW256 ...)
-(SaturatedSubUint16x32 ...) => (VPSUBSW512 ...)
-(SaturatedSubMaskedInt8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask))
-(SaturatedSubMaskedInt8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask))
-(SaturatedSubMaskedInt8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask))
-(SaturatedSubMaskedInt16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask))
-(SaturatedSubMaskedInt16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask))
-(SaturatedSubMaskedInt16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask))
-(SaturatedSubMaskedUint8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask))
-(SaturatedSubMaskedUint8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask))
-(SaturatedSubMaskedUint8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask))
-(SaturatedSubMaskedUint16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask))
-(SaturatedSubMaskedUint16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask))
-(SaturatedSubMaskedUint16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask))
 (SaturatedUnsignedSignedPairDotProdUint8x16 ...) => (VPMADDUBSW128 ...)
 (SaturatedUnsignedSignedPairDotProdUint8x32 ...) => (VPMADDUBSW256 ...)
 (SaturatedUnsignedSignedPairDotProdUint8x64 ...) => (VPMADDUBSW512 ...)
@@ -1377,6 +1315,18 @@
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM mask))
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM mask))
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM mask))
+(ScaleFloat32x4 ...) => (VSCALEFPS128 ...)
+(ScaleFloat32x8 ...) => (VSCALEFPS256 ...)
+(ScaleFloat32x16 ...) => (VSCALEFPS512 ...)
+(ScaleFloat64x2 ...) => (VSCALEFPD128 ...)
+(ScaleFloat64x4 ...) => (VSCALEFPD256 ...)
+(ScaleFloat64x8 ...) => (VSCALEFPD512 ...)
+(ScaleMaskedFloat32x4 x y mask) => (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM mask))
+(ScaleMaskedFloat32x8 x y mask) => (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM mask))
+(ScaleMaskedFloat32x16 x y mask) => (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM mask))
+(ScaleMaskedFloat64x2 x y mask) => (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM mask))
+(ScaleMaskedFloat64x4 x y mask) => (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM mask))
+(ScaleMaskedFloat64x8 x y mask) => (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM mask))
 (Set128Float32x8 ...) => (VINSERTF128256 ...)
 (Set128Float64x4 ...) => (VINSERTF128256 ...)
 (Set128Int8x32 ...) => (VINSERTI128256 ...)
@@ -1761,22 +1711,72 @@
 (SubMaskedUint64x2 x y mask) => (VPSUBQMasked128 x y (VPMOVVec64x2ToM mask))
 (SubMaskedUint64x4 x y mask) => (VPSUBQMasked256 x y (VPMOVVec64x4ToM mask))
 (SubMaskedUint64x8 x y mask) => (VPSUBQMasked512 x y (VPMOVVec64x8ToM mask))
+(SubPairsFloat32x4 ...) => (VHSUBPS128 ...)
+(SubPairsFloat32x8 ...) => (VHSUBPS256 ...)
+(SubPairsFloat64x2 ...) => (VHSUBPD128 ...)
+(SubPairsFloat64x4 ...) => (VHSUBPD256 ...)
+(SubPairsInt16x8 ...) => (VPHSUBW128 ...)
+(SubPairsInt16x16 ...) => (VPHSUBW256 ...)
+(SubPairsInt32x4 ...) => (VPHSUBD128 ...)
+(SubPairsInt32x8 ...) => (VPHSUBD256 ...)
+(SubPairsUint16x8 ...) => (VPHSUBW128 ...)
+(SubPairsUint16x16 ...) => (VPHSUBW256 ...)
+(SubPairsUint32x4 ...) => (VPHSUBD128 ...)
+(SubPairsUint32x8 ...) => (VPHSUBD256 ...)
+(SubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...)
+(SubPairsSaturatedInt16x16 ...) => (VPHSUBSW256 ...)
+(SubSaturatedInt8x16 ...) => (VPSUBSB128 ...)
+(SubSaturatedInt8x32 ...) => (VPSUBSB256 ...)
+(SubSaturatedInt8x64 ...) => (VPSUBSB512 ...)
+(SubSaturatedInt16x8 ...) => (VPSUBSW128 ...)
+(SubSaturatedInt16x16 ...) => (VPSUBSW256 ...)
+(SubSaturatedInt16x32 ...) => (VPSUBSW512 ...)
+(SubSaturatedUint8x16 ...) => (VPSUBSB128 ...)
+(SubSaturatedUint8x32 ...) => (VPSUBSB256 ...)
+(SubSaturatedUint8x64 ...) => (VPSUBSB512 ...)
+(SubSaturatedUint16x8 ...) => (VPSUBSW128 ...)
+(SubSaturatedUint16x16 ...) => (VPSUBSW256 ...)
+(SubSaturatedUint16x32 ...) => (VPSUBSW512 ...)
+(SubSaturatedMaskedInt8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask))
+(SubSaturatedMaskedInt8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask))
+(SubSaturatedMaskedInt8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask))
+(SubSaturatedMaskedInt16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask))
+(SubSaturatedMaskedInt16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask))
+(SubSaturatedMaskedInt16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask))
+(SubSaturatedMaskedUint8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask))
+(SubSaturatedMaskedUint8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask))
+(SubSaturatedMaskedUint8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask))
+(SubSaturatedMaskedUint16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask))
+(SubSaturatedMaskedUint16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask))
+(SubSaturatedMaskedUint16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask))
 (TruncFloat32x4 x) => (VROUNDPS128 [3] x)
 (TruncFloat32x8 x) => (VROUNDPS256 [3] x)
 (TruncFloat64x2 x) => (VROUNDPD128 [3] x)
 (TruncFloat64x4 x) => (VROUNDPD256 [3] x)
-(TruncWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+3] x)
-(TruncWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+3] x)
-(TruncWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+3] x)
-(TruncWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+3] x)
-(TruncWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+3] x)
-(TruncWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+3] x)
-(TruncWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask))
-(TruncWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask))
-(TruncWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask))
-(TruncWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask))
-(TruncWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask))
-(TruncWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask))
+(TruncScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+3] x)
+(TruncScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+3] x)
+(TruncScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+3] x)
+(TruncScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+3] x)
+(TruncScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+3] x)
+(TruncScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+3] x)
+(TruncScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask))
+(TruncScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask))
+(TruncScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask))
+(TruncScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask))
+(TruncScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask))
+(TruncScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask))
+(TruncScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+3] x)
+(TruncScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+3] x)
+(TruncScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+3] x)
+(TruncScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+3] x)
+(TruncScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+3] x)
+(TruncScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+3] x)
+(TruncScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask))
+(TruncScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask))
+(TruncScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask))
+(TruncScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask))
+(TruncScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask))
+(TruncScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask))
 (UnsignedSignedQuadDotProdAccumulateInt32x4 ...) => (VPDPBUSD128 ...)
 (UnsignedSignedQuadDotProdAccumulateInt32x8 ...) => (VPDPBUSD256 ...)
 (UnsignedSignedQuadDotProdAccumulateInt32x16 ...) => (VPDPBUSD512 ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index 492a994e936..ea52254413f 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -81,6 +81,44 @@ func simdGenericOps() []opData {
 		{name: "AddMaskedUint64x2", argLength: 3, commutative: true},
 		{name: "AddMaskedUint64x4", argLength: 3, commutative: true},
 		{name: "AddMaskedUint64x8", argLength: 3, commutative: true},
+		{name: "AddPairsFloat32x4", argLength: 2, commutative: false},
+		{name: "AddPairsFloat32x8", argLength: 2, commutative: false},
+		{name: "AddPairsFloat64x2", argLength: 2, commutative: false},
+		{name: "AddPairsFloat64x4", argLength: 2, commutative: false},
+		{name: "AddPairsInt16x8", argLength: 2, commutative: false},
+		{name: "AddPairsInt16x16", argLength: 2, commutative: false},
+		{name: "AddPairsInt32x4", argLength: 2, commutative: false},
+		{name: "AddPairsInt32x8", argLength: 2, commutative: false},
+		{name: "AddPairsSaturatedInt16x8", argLength: 2, commutative: false},
+		{name: "AddPairsSaturatedInt16x16", argLength: 2, commutative: false},
+		{name: "AddPairsUint16x8", argLength: 2, commutative: false},
+		{name: "AddPairsUint16x16", argLength: 2, commutative: false},
+		{name: "AddPairsUint32x4", argLength: 2, commutative: false},
+		{name: "AddPairsUint32x8", argLength: 2, commutative: false},
+		{name: "AddSaturatedInt8x16", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt8x32", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt8x64", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt16x8", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt16x16", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt16x32", argLength: 2, commutative: true},
+		{name: "AddSaturatedMaskedInt8x16", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt8x32", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt8x64", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt16x8", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt16x16", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt16x32", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint8x16", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint8x32", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint8x64", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint16x8", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint16x16", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint16x32", argLength: 3, commutative: true},
+		{name: "AddSaturatedUint8x16", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint8x32", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint8x64", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint16x8", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint16x16", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint16x32", argLength: 2, commutative: true},
 		{name: "AddSubFloat32x4", argLength: 2, commutative: false},
 		{name: "AddSubFloat32x8", argLength: 2, commutative: false},
 		{name: "AddSubFloat64x2", argLength: 2, commutative: false},
@@ -744,18 +782,6 @@ func simdGenericOps() []opData {
 		{name: "MinUint64x2", argLength: 2, commutative: true},
 		{name: "MinUint64x4", argLength: 2, commutative: true},
 		{name: "MinUint64x8", argLength: 2, commutative: true},
-		{name: "MulByPowOf2Float32x4", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float32x8", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float32x16", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float64x2", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float64x4", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float64x8", argLength: 2, commutative: false},
-		{name: "MulByPowOf2MaskedFloat32x4", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat32x8", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat32x16", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat64x2", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat64x4", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat64x8", argLength: 3, commutative: false},
 		{name: "MulEvenWidenInt32x4", argLength: 2, commutative: true},
 		{name: "MulEvenWidenInt32x8", argLength: 2, commutative: true},
 		{name: "MulEvenWidenInt64x2", argLength: 2, commutative: true},
@@ -790,30 +816,30 @@ func simdGenericOps() []opData {
 		{name: "MulHighUint16x8", argLength: 2, commutative: true},
 		{name: "MulHighUint16x16", argLength: 2, commutative: true},
 		{name: "MulHighUint16x32", argLength: 2, commutative: true},
-		{name: "MulLowInt16x8", argLength: 2, commutative: true},
-		{name: "MulLowInt16x16", argLength: 2, commutative: true},
-		{name: "MulLowInt16x32", argLength: 2, commutative: true},
-		{name: "MulLowInt32x4", argLength: 2, commutative: true},
-		{name: "MulLowInt32x8", argLength: 2, commutative: true},
-		{name: "MulLowInt32x16", argLength: 2, commutative: true},
-		{name: "MulLowInt64x2", argLength: 2, commutative: true},
-		{name: "MulLowInt64x4", argLength: 2, commutative: true},
-		{name: "MulLowInt64x8", argLength: 2, commutative: true},
-		{name: "MulLowMaskedInt16x8", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt16x16", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt16x32", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt32x4", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt32x8", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt32x16", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt64x2", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt64x4", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt64x8", argLength: 3, commutative: true},
+		{name: "MulInt16x8", argLength: 2, commutative: true},
+		{name: "MulInt16x16", argLength: 2, commutative: true},
+		{name: "MulInt16x32", argLength: 2, commutative: true},
+		{name: "MulInt32x4", argLength: 2, commutative: true},
+		{name: "MulInt32x8", argLength: 2, commutative: true},
+		{name: "MulInt32x16", argLength: 2, commutative: true},
+		{name: "MulInt64x2", argLength: 2, commutative: true},
+		{name: "MulInt64x4", argLength: 2, commutative: true},
+		{name: "MulInt64x8", argLength: 2, commutative: true},
 		{name: "MulMaskedFloat32x4", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat32x8", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat32x16", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat64x2", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat64x4", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat64x8", argLength: 3, commutative: true},
+		{name: "MulMaskedInt16x8", argLength: 3, commutative: true},
+		{name: "MulMaskedInt16x16", argLength: 3, commutative: true},
+		{name: "MulMaskedInt16x32", argLength: 3, commutative: true},
+		{name: "MulMaskedInt32x4", argLength: 3, commutative: true},
+		{name: "MulMaskedInt32x8", argLength: 3, commutative: true},
+		{name: "MulMaskedInt32x16", argLength: 3, commutative: true},
+		{name: "MulMaskedInt64x2", argLength: 3, commutative: true},
+		{name: "MulMaskedInt64x4", argLength: 3, commutative: true},
+		{name: "MulMaskedInt64x8", argLength: 3, commutative: true},
 		{name: "NotEqualFloat32x4", argLength: 2, commutative: true},
 		{name: "NotEqualFloat32x8", argLength: 2, commutative: true},
 		{name: "NotEqualFloat32x16", argLength: 2, commutative: true},
@@ -916,30 +942,6 @@ func simdGenericOps() []opData {
 		{name: "PairDotProdMaskedInt16x8", argLength: 3, commutative: false},
 		{name: "PairDotProdMaskedInt16x16", argLength: 3, commutative: false},
 		{name: "PairDotProdMaskedInt16x32", argLength: 3, commutative: false},
-		{name: "PairwiseAddFloat32x4", argLength: 2, commutative: false},
-		{name: "PairwiseAddFloat32x8", argLength: 2, commutative: false},
-		{name: "PairwiseAddFloat64x2", argLength: 2, commutative: false},
-		{name: "PairwiseAddFloat64x4", argLength: 2, commutative: false},
-		{name: "PairwiseAddInt16x8", argLength: 2, commutative: false},
-		{name: "PairwiseAddInt16x16", argLength: 2, commutative: false},
-		{name: "PairwiseAddInt32x4", argLength: 2, commutative: false},
-		{name: "PairwiseAddInt32x8", argLength: 2, commutative: false},
-		{name: "PairwiseAddUint16x8", argLength: 2, commutative: false},
-		{name: "PairwiseAddUint16x16", argLength: 2, commutative: false},
-		{name: "PairwiseAddUint32x4", argLength: 2, commutative: false},
-		{name: "PairwiseAddUint32x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubFloat32x4", argLength: 2, commutative: false},
-		{name: "PairwiseSubFloat32x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubFloat64x2", argLength: 2, commutative: false},
-		{name: "PairwiseSubFloat64x4", argLength: 2, commutative: false},
-		{name: "PairwiseSubInt16x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubInt16x16", argLength: 2, commutative: false},
-		{name: "PairwiseSubInt32x4", argLength: 2, commutative: false},
-		{name: "PairwiseSubInt32x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubUint16x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubUint16x16", argLength: 2, commutative: false},
-		{name: "PairwiseSubUint32x4", argLength: 2, commutative: false},
-		{name: "PairwiseSubUint32x8", argLength: 2, commutative: false},
 		{name: "Permute2Float32x4", argLength: 3, commutative: false},
 		{name: "Permute2Float32x8", argLength: 3, commutative: false},
 		{name: "Permute2Float32x16", argLength: 3, commutative: false},
@@ -1154,58 +1156,6 @@ func simdGenericOps() []opData {
 		{name: "SaturatedAddDotProdMaskedInt32x4", argLength: 4, commutative: false},
 		{name: "SaturatedAddDotProdMaskedInt32x8", argLength: 4, commutative: false},
 		{name: "SaturatedAddDotProdMaskedInt32x16", argLength: 4, commutative: false},
-		{name: "SaturatedAddInt8x16", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt8x32", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt8x64", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt16x8", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt16x16", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt16x32", argLength: 2, commutative: true},
-		{name: "SaturatedAddMaskedInt8x16", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt8x32", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt8x64", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt16x8", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt16x16", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt16x32", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint8x16", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint8x32", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint8x64", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint16x8", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint16x16", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint16x32", argLength: 3, commutative: true},
-		{name: "SaturatedAddUint8x16", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint8x32", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint8x64", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint16x8", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint16x16", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint16x32", argLength: 2, commutative: true},
-		{name: "SaturatedPairwiseAddInt16x8", argLength: 2, commutative: false},
-		{name: "SaturatedPairwiseAddInt16x16", argLength: 2, commutative: false},
-		{name: "SaturatedPairwiseSubInt16x8", argLength: 2, commutative: false},
-		{name: "SaturatedPairwiseSubInt16x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt8x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt8x32", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt8x64", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt16x8", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt16x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt16x32", argLength: 2, commutative: false},
-		{name: "SaturatedSubMaskedInt8x16", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt8x32", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt8x64", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt16x8", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt16x16", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt16x32", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint8x16", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint8x32", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint8x64", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint16x8", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint16x16", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint16x32", argLength: 3, commutative: false},
-		{name: "SaturatedSubUint8x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint8x32", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint8x64", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint16x8", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint16x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint16x32", argLength: 2, commutative: false},
 		{name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x16", argLength: 3, commutative: false},
 		{name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x32", argLength: 3, commutative: false},
 		{name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x64", argLength: 3, commutative: false},
@@ -1218,6 +1168,18 @@ func simdGenericOps() []opData {
 		{name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4", argLength: 4, commutative: false},
 		{name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8", argLength: 4, commutative: false},
 		{name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16", argLength: 4, commutative: false},
+		{name: "ScaleFloat32x4", argLength: 2, commutative: false},
+		{name: "ScaleFloat32x8", argLength: 2, commutative: false},
+		{name: "ScaleFloat32x16", argLength: 2, commutative: false},
+		{name: "ScaleFloat64x2", argLength: 2, commutative: false},
+		{name: "ScaleFloat64x4", argLength: 2, commutative: false},
+		{name: "ScaleFloat64x8", argLength: 2, commutative: false},
+		{name: "ScaleMaskedFloat32x4", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat32x8", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat32x16", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat64x2", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat64x4", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat64x8", argLength: 3, commutative: false},
 		{name: "ShiftAllLeftInt16x8", argLength: 2, commutative: false},
 		{name: "ShiftAllLeftInt16x16", argLength: 2, commutative: false},
 		{name: "ShiftAllLeftInt16x32", argLength: 2, commutative: false},
@@ -1500,6 +1462,44 @@ func simdGenericOps() []opData {
 		{name: "SubMaskedUint64x2", argLength: 3, commutative: false},
 		{name: "SubMaskedUint64x4", argLength: 3, commutative: false},
 		{name: "SubMaskedUint64x8", argLength: 3, commutative: false},
+		{name: "SubPairsFloat32x4", argLength: 2, commutative: false},
+		{name: "SubPairsFloat32x8", argLength: 2, commutative: false},
+		{name: "SubPairsFloat64x2", argLength: 2, commutative: false},
+		{name: "SubPairsFloat64x4", argLength: 2, commutative: false},
+		{name: "SubPairsInt16x8", argLength: 2, commutative: false},
+		{name: "SubPairsInt16x16", argLength: 2, commutative: false},
+		{name: "SubPairsInt32x4", argLength: 2, commutative: false},
+		{name: "SubPairsInt32x8", argLength: 2, commutative: false},
+		{name: "SubPairsSaturatedInt16x8", argLength: 2, commutative: false},
+		{name: "SubPairsSaturatedInt16x16", argLength: 2, commutative: false},
+		{name: "SubPairsUint16x8", argLength: 2, commutative: false},
+		{name: "SubPairsUint16x16", argLength: 2, commutative: false},
+		{name: "SubPairsUint32x4", argLength: 2, commutative: false},
+		{name: "SubPairsUint32x8", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt8x16", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt8x32", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt8x64", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt16x8", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt16x16", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt16x32", argLength: 2, commutative: false},
+		{name: "SubSaturatedMaskedInt8x16", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt8x32", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt8x64", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt16x8", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt16x16", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt16x32", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint8x16", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint8x32", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint8x64", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint16x8", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint16x16", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint16x32", argLength: 3, commutative: false},
+		{name: "SubSaturatedUint8x16", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint8x32", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint8x64", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint16x8", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint16x16", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint16x32", argLength: 2, commutative: false},
 		{name: "SubUint8x16", argLength: 2, commutative: false},
 		{name: "SubUint8x32", argLength: 2, commutative: false},
 		{name: "SubUint8x64", argLength: 2, commutative: false},
@@ -1558,78 +1558,54 @@ func simdGenericOps() []opData {
 		{name: "XorUint64x2", argLength: 2, commutative: true},
 		{name: "XorUint64x4", argLength: 2, commutative: true},
 		{name: "XorUint64x8", argLength: 2, commutative: true},
-		{name: "CeilWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name:
"DiffWithFloorWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithFloorWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithFloorWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithFloorWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithFloorWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithFloorWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithFloorWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: 
"FloorWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: 
"Int8"}, + {name: "FloorScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, {name: "GaloisFieldAffineTransformInverseMaskedUint8x16", argLength: 3, commutative: false, aux: "Int8"}, {name: "GaloisFieldAffineTransformInverseMaskedUint8x32", argLength: 3, commutative: false, aux: "Int8"}, {name: "GaloisFieldAffineTransformInverseMaskedUint8x64", argLength: 3, commutative: false, aux: "Int8"}, @@ -1708,18 +1684,30 @@ func simdGenericOps() []opData { {name: "RotateAllRightUint64x2", argLength: 1, commutative: false, aux: "Int8"}, {name: "RotateAllRightUint64x4", argLength: 1, commutative: false, aux: "Int8"}, {name: "RotateAllRightUint64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: 
"Int8"}, + {name: "RoundScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, {name: "Set128Float32x8", argLength: 2, commutative: false, aux: "Int8"}, {name: "Set128Float64x4", argLength: 2, commutative: false, aux: "Int8"}, {name: "Set128Int8x32", argLength: 2, commutative: false, aux: "Int8"}, @@ -1810,17 +1798,29 @@ func simdGenericOps() []opData { {name: "ShiftAllRightConcatUint64x2", argLength: 2, commutative: false, aux: "Int8"}, {name: "ShiftAllRightConcatUint64x4", argLength: 2, commutative: false, aux: "Int8"}, {name: "ShiftAllRightConcatUint64x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledFloat64x4", argLength: 1, commutative: 
false, aux: "Int8"}, + {name: "TruncScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, } } diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index e8a5354c001..6dcbec2573b 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -4567,6 +4567,44 @@ const ( OpAddMaskedUint64x2 OpAddMaskedUint64x4 OpAddMaskedUint64x8 + OpAddPairsFloat32x4 + OpAddPairsFloat32x8 + OpAddPairsFloat64x2 + OpAddPairsFloat64x4 + OpAddPairsInt16x8 + OpAddPairsInt16x16 + OpAddPairsInt32x4 + OpAddPairsInt32x8 + OpAddPairsSaturatedInt16x8 + OpAddPairsSaturatedInt16x16 + OpAddPairsUint16x8 + OpAddPairsUint16x16 + OpAddPairsUint32x4 + OpAddPairsUint32x8 + OpAddSaturatedInt8x16 + OpAddSaturatedInt8x32 + OpAddSaturatedInt8x64 + OpAddSaturatedInt16x8 + OpAddSaturatedInt16x16 + OpAddSaturatedInt16x32 + OpAddSaturatedMaskedInt8x16 + OpAddSaturatedMaskedInt8x32 + OpAddSaturatedMaskedInt8x64 + OpAddSaturatedMaskedInt16x8 + OpAddSaturatedMaskedInt16x16 + OpAddSaturatedMaskedInt16x32 + OpAddSaturatedMaskedUint8x16 + OpAddSaturatedMaskedUint8x32 + OpAddSaturatedMaskedUint8x64 + OpAddSaturatedMaskedUint16x8 + OpAddSaturatedMaskedUint16x16 + OpAddSaturatedMaskedUint16x32 + OpAddSaturatedUint8x16 + OpAddSaturatedUint8x32 + OpAddSaturatedUint8x64 + OpAddSaturatedUint16x8 + OpAddSaturatedUint16x16 + OpAddSaturatedUint16x32 OpAddSubFloat32x4 OpAddSubFloat32x8 OpAddSubFloat64x2 @@ -5230,18 +5268,6 @@ const ( OpMinUint64x2 OpMinUint64x4 OpMinUint64x8 - OpMulByPowOf2Float32x4 - OpMulByPowOf2Float32x8 - OpMulByPowOf2Float32x16 - OpMulByPowOf2Float64x2 - OpMulByPowOf2Float64x4 - OpMulByPowOf2Float64x8 - OpMulByPowOf2MaskedFloat32x4 - OpMulByPowOf2MaskedFloat32x8 - OpMulByPowOf2MaskedFloat32x16 - OpMulByPowOf2MaskedFloat64x2 - OpMulByPowOf2MaskedFloat64x4 - OpMulByPowOf2MaskedFloat64x8 OpMulEvenWidenInt32x4 OpMulEvenWidenInt32x8 OpMulEvenWidenInt64x2 @@ -5276,30 
+5302,30 @@ const ( OpMulHighUint16x8 OpMulHighUint16x16 OpMulHighUint16x32 - OpMulLowInt16x8 - OpMulLowInt16x16 - OpMulLowInt16x32 - OpMulLowInt32x4 - OpMulLowInt32x8 - OpMulLowInt32x16 - OpMulLowInt64x2 - OpMulLowInt64x4 - OpMulLowInt64x8 - OpMulLowMaskedInt16x8 - OpMulLowMaskedInt16x16 - OpMulLowMaskedInt16x32 - OpMulLowMaskedInt32x4 - OpMulLowMaskedInt32x8 - OpMulLowMaskedInt32x16 - OpMulLowMaskedInt64x2 - OpMulLowMaskedInt64x4 - OpMulLowMaskedInt64x8 + OpMulInt16x8 + OpMulInt16x16 + OpMulInt16x32 + OpMulInt32x4 + OpMulInt32x8 + OpMulInt32x16 + OpMulInt64x2 + OpMulInt64x4 + OpMulInt64x8 OpMulMaskedFloat32x4 OpMulMaskedFloat32x8 OpMulMaskedFloat32x16 OpMulMaskedFloat64x2 OpMulMaskedFloat64x4 OpMulMaskedFloat64x8 + OpMulMaskedInt16x8 + OpMulMaskedInt16x16 + OpMulMaskedInt16x32 + OpMulMaskedInt32x4 + OpMulMaskedInt32x8 + OpMulMaskedInt32x16 + OpMulMaskedInt64x2 + OpMulMaskedInt64x4 + OpMulMaskedInt64x8 OpNotEqualFloat32x4 OpNotEqualFloat32x8 OpNotEqualFloat32x16 @@ -5402,30 +5428,6 @@ const ( OpPairDotProdMaskedInt16x8 OpPairDotProdMaskedInt16x16 OpPairDotProdMaskedInt16x32 - OpPairwiseAddFloat32x4 - OpPairwiseAddFloat32x8 - OpPairwiseAddFloat64x2 - OpPairwiseAddFloat64x4 - OpPairwiseAddInt16x8 - OpPairwiseAddInt16x16 - OpPairwiseAddInt32x4 - OpPairwiseAddInt32x8 - OpPairwiseAddUint16x8 - OpPairwiseAddUint16x16 - OpPairwiseAddUint32x4 - OpPairwiseAddUint32x8 - OpPairwiseSubFloat32x4 - OpPairwiseSubFloat32x8 - OpPairwiseSubFloat64x2 - OpPairwiseSubFloat64x4 - OpPairwiseSubInt16x8 - OpPairwiseSubInt16x16 - OpPairwiseSubInt32x4 - OpPairwiseSubInt32x8 - OpPairwiseSubUint16x8 - OpPairwiseSubUint16x16 - OpPairwiseSubUint32x4 - OpPairwiseSubUint32x8 OpPermute2Float32x4 OpPermute2Float32x8 OpPermute2Float32x16 @@ -5640,58 +5642,6 @@ const ( OpSaturatedAddDotProdMaskedInt32x4 OpSaturatedAddDotProdMaskedInt32x8 OpSaturatedAddDotProdMaskedInt32x16 - OpSaturatedAddInt8x16 - OpSaturatedAddInt8x32 - OpSaturatedAddInt8x64 - OpSaturatedAddInt16x8 - OpSaturatedAddInt16x16 - OpSaturatedAddInt16x32 - OpSaturatedAddMaskedInt8x16 - OpSaturatedAddMaskedInt8x32 - OpSaturatedAddMaskedInt8x64 - OpSaturatedAddMaskedInt16x8 - OpSaturatedAddMaskedInt16x16 - OpSaturatedAddMaskedInt16x32 - OpSaturatedAddMaskedUint8x16 - OpSaturatedAddMaskedUint8x32 - OpSaturatedAddMaskedUint8x64 - OpSaturatedAddMaskedUint16x8 - OpSaturatedAddMaskedUint16x16 - OpSaturatedAddMaskedUint16x32 - OpSaturatedAddUint8x16 - OpSaturatedAddUint8x32 - OpSaturatedAddUint8x64 - OpSaturatedAddUint16x8 - OpSaturatedAddUint16x16 - OpSaturatedAddUint16x32 - OpSaturatedPairwiseAddInt16x8 - OpSaturatedPairwiseAddInt16x16 - OpSaturatedPairwiseSubInt16x8 - OpSaturatedPairwiseSubInt16x16 - OpSaturatedSubInt8x16 - OpSaturatedSubInt8x32 - OpSaturatedSubInt8x64 - OpSaturatedSubInt16x8 - OpSaturatedSubInt16x16 - OpSaturatedSubInt16x32 - OpSaturatedSubMaskedInt8x16 - OpSaturatedSubMaskedInt8x32 - OpSaturatedSubMaskedInt8x64 - OpSaturatedSubMaskedInt16x8 - OpSaturatedSubMaskedInt16x16 - OpSaturatedSubMaskedInt16x32 - OpSaturatedSubMaskedUint8x16 - OpSaturatedSubMaskedUint8x32 - OpSaturatedSubMaskedUint8x64 - OpSaturatedSubMaskedUint16x8 - OpSaturatedSubMaskedUint16x16 - OpSaturatedSubMaskedUint16x32 - OpSaturatedSubUint8x16 - OpSaturatedSubUint8x32 - OpSaturatedSubUint8x64 - OpSaturatedSubUint16x8 - OpSaturatedSubUint16x16 - OpSaturatedSubUint16x32 OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16 OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32 OpSaturatedUnsignedSignedPairDotProdMaskedUint8x64 @@ -5704,6 +5654,18 @@ const ( 
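The renames in these hunks are mechanical, but the new names are easier to audit against the intended semantics. Below is a minimal scalar sketch of four of the op families touched here, illustrative only and not part of the patch; every helper name is hypothetical. AddSaturated clamps on overflow instead of wrapping, AddPairs is the horizontal add behind VHADDPS/VPHADD, Scale is VSCALEF's x * 2^floor(y), and the Scaled/ScaledResidue pairs model VRNDSCALE/VREDUCE rounding to the number of fractional bits given by the Int8 aux value.

```go
package main

import (
	"fmt"
	"math"
)

// addSaturatedInt8: AddSaturated* ops clamp to the element type's range on
// overflow instead of wrapping (VPADDSB and friends).
func addSaturatedInt8(a, b int8) int8 {
	s := int16(a) + int16(b)
	if s > math.MaxInt8 {
		return math.MaxInt8
	}
	if s < math.MinInt8 {
		return math.MinInt8
	}
	return int8(s)
}

// addPairsFloat32x4: AddPairs* ops sum horizontally adjacent lanes, with the
// first operand's pair sums landing in the low half (VHADDPS lane order).
func addPairsFloat32x4(a, b [4]float32) [4]float32 {
	return [4]float32{a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]}
}

// scale: Scale* ops compute x * 2**floor(y), per VSCALEFPS/VSCALEFPD.
func scale(x, y float64) float64 {
	return x * math.Exp2(math.Floor(y))
}

// ceilScaled rounds x up to m fractional bits (VRNDSCALE with the ceil
// rounding mode; m is the Int8 aux value carried by the *Scaled ops).
func ceilScaled(x float64, m uint8) float64 {
	p := math.Exp2(float64(m))
	return math.Ceil(x*p) / p
}

// ceilScaledResidue is what is left after that rounding (VREDUCE), i.e. what
// the *ScaledResidue ops (formerly DiffWith*WithPrecision) return.
func ceilScaledResidue(x float64, m uint8) float64 {
	return x - ceilScaled(x, m)
}

func main() {
	fmt.Println(addSaturatedInt8(100, 100)) // 127, not the wrapped -56
	fmt.Println(addPairsFloat32x4([4]float32{1, 2, 3, 4}, [4]float32{5, 6, 7, 8})) // [3 7 11 15]
	fmt.Println(scale(3, 2.5))              // 3 * 2^2 = 12
	fmt.Println(ceilScaled(1.3, 2))         // 1.5 (ceil of 1.3 to quarters)
	fmt.Println(ceilScaledResidue(1.3, 2))  // -0.2 (approximately)
}
```

The Masked* variant of each family takes the mask as one extra operand, which is why the masked entries in these tables carry argLen: 3 (argLength: 3 in the generator input) where the unmasked forms have 2.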
OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4 OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8 OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 + OpScaleFloat32x4 + OpScaleFloat32x8 + OpScaleFloat32x16 + OpScaleFloat64x2 + OpScaleFloat64x4 + OpScaleFloat64x8 + OpScaleMaskedFloat32x4 + OpScaleMaskedFloat32x8 + OpScaleMaskedFloat32x16 + OpScaleMaskedFloat64x2 + OpScaleMaskedFloat64x4 + OpScaleMaskedFloat64x8 OpShiftAllLeftInt16x8 OpShiftAllLeftInt16x16 OpShiftAllLeftInt16x32 @@ -5986,6 +5948,44 @@ const ( OpSubMaskedUint64x2 OpSubMaskedUint64x4 OpSubMaskedUint64x8 + OpSubPairsFloat32x4 + OpSubPairsFloat32x8 + OpSubPairsFloat64x2 + OpSubPairsFloat64x4 + OpSubPairsInt16x8 + OpSubPairsInt16x16 + OpSubPairsInt32x4 + OpSubPairsInt32x8 + OpSubPairsSaturatedInt16x8 + OpSubPairsSaturatedInt16x16 + OpSubPairsUint16x8 + OpSubPairsUint16x16 + OpSubPairsUint32x4 + OpSubPairsUint32x8 + OpSubSaturatedInt8x16 + OpSubSaturatedInt8x32 + OpSubSaturatedInt8x64 + OpSubSaturatedInt16x8 + OpSubSaturatedInt16x16 + OpSubSaturatedInt16x32 + OpSubSaturatedMaskedInt8x16 + OpSubSaturatedMaskedInt8x32 + OpSubSaturatedMaskedInt8x64 + OpSubSaturatedMaskedInt16x8 + OpSubSaturatedMaskedInt16x16 + OpSubSaturatedMaskedInt16x32 + OpSubSaturatedMaskedUint8x16 + OpSubSaturatedMaskedUint8x32 + OpSubSaturatedMaskedUint8x64 + OpSubSaturatedMaskedUint16x8 + OpSubSaturatedMaskedUint16x16 + OpSubSaturatedMaskedUint16x32 + OpSubSaturatedUint8x16 + OpSubSaturatedUint8x32 + OpSubSaturatedUint8x64 + OpSubSaturatedUint16x8 + OpSubSaturatedUint16x16 + OpSubSaturatedUint16x32 OpSubUint8x16 OpSubUint8x32 OpSubUint8x64 @@ -6044,78 +6044,54 @@ const ( OpXorUint64x2 OpXorUint64x4 OpXorUint64x8 - OpCeilWithPrecisionFloat32x4 - OpCeilWithPrecisionFloat32x8 - OpCeilWithPrecisionFloat32x16 - OpCeilWithPrecisionFloat64x2 - OpCeilWithPrecisionFloat64x4 - OpCeilWithPrecisionFloat64x8 - OpCeilWithPrecisionMaskedFloat32x4 - OpCeilWithPrecisionMaskedFloat32x8 - OpCeilWithPrecisionMaskedFloat32x16 - OpCeilWithPrecisionMaskedFloat64x2 - OpCeilWithPrecisionMaskedFloat64x4 - OpCeilWithPrecisionMaskedFloat64x8 - OpDiffWithCeilWithPrecisionFloat32x4 - OpDiffWithCeilWithPrecisionFloat32x8 - OpDiffWithCeilWithPrecisionFloat32x16 - OpDiffWithCeilWithPrecisionFloat64x2 - OpDiffWithCeilWithPrecisionFloat64x4 - OpDiffWithCeilWithPrecisionFloat64x8 - OpDiffWithCeilWithPrecisionMaskedFloat32x4 - OpDiffWithCeilWithPrecisionMaskedFloat32x8 - OpDiffWithCeilWithPrecisionMaskedFloat32x16 - OpDiffWithCeilWithPrecisionMaskedFloat64x2 - OpDiffWithCeilWithPrecisionMaskedFloat64x4 - OpDiffWithCeilWithPrecisionMaskedFloat64x8 - OpDiffWithFloorWithPrecisionFloat32x4 - OpDiffWithFloorWithPrecisionFloat32x8 - OpDiffWithFloorWithPrecisionFloat32x16 - OpDiffWithFloorWithPrecisionFloat64x2 - OpDiffWithFloorWithPrecisionFloat64x4 - OpDiffWithFloorWithPrecisionFloat64x8 - OpDiffWithFloorWithPrecisionMaskedFloat32x4 - OpDiffWithFloorWithPrecisionMaskedFloat32x8 - OpDiffWithFloorWithPrecisionMaskedFloat32x16 - OpDiffWithFloorWithPrecisionMaskedFloat64x2 - OpDiffWithFloorWithPrecisionMaskedFloat64x4 - OpDiffWithFloorWithPrecisionMaskedFloat64x8 - OpDiffWithRoundWithPrecisionFloat32x4 - OpDiffWithRoundWithPrecisionFloat32x8 - OpDiffWithRoundWithPrecisionFloat32x16 - OpDiffWithRoundWithPrecisionFloat64x2 - OpDiffWithRoundWithPrecisionFloat64x4 - OpDiffWithRoundWithPrecisionFloat64x8 - OpDiffWithRoundWithPrecisionMaskedFloat32x4 - OpDiffWithRoundWithPrecisionMaskedFloat32x8 - OpDiffWithRoundWithPrecisionMaskedFloat32x16 - 
OpDiffWithRoundWithPrecisionMaskedFloat64x2 - OpDiffWithRoundWithPrecisionMaskedFloat64x4 - OpDiffWithRoundWithPrecisionMaskedFloat64x8 - OpDiffWithTruncWithPrecisionFloat32x4 - OpDiffWithTruncWithPrecisionFloat32x8 - OpDiffWithTruncWithPrecisionFloat32x16 - OpDiffWithTruncWithPrecisionFloat64x2 - OpDiffWithTruncWithPrecisionFloat64x4 - OpDiffWithTruncWithPrecisionFloat64x8 - OpDiffWithTruncWithPrecisionMaskedFloat32x4 - OpDiffWithTruncWithPrecisionMaskedFloat32x8 - OpDiffWithTruncWithPrecisionMaskedFloat32x16 - OpDiffWithTruncWithPrecisionMaskedFloat64x2 - OpDiffWithTruncWithPrecisionMaskedFloat64x4 - OpDiffWithTruncWithPrecisionMaskedFloat64x8 - OpFloorWithPrecisionFloat32x4 - OpFloorWithPrecisionFloat32x8 - OpFloorWithPrecisionFloat32x16 - OpFloorWithPrecisionFloat64x2 - OpFloorWithPrecisionFloat64x4 - OpFloorWithPrecisionFloat64x8 - OpFloorWithPrecisionMaskedFloat32x4 - OpFloorWithPrecisionMaskedFloat32x8 - OpFloorWithPrecisionMaskedFloat32x16 - OpFloorWithPrecisionMaskedFloat64x2 - OpFloorWithPrecisionMaskedFloat64x4 - OpFloorWithPrecisionMaskedFloat64x8 + OpCeilScaledFloat32x4 + OpCeilScaledFloat32x8 + OpCeilScaledFloat32x16 + OpCeilScaledFloat64x2 + OpCeilScaledFloat64x4 + OpCeilScaledFloat64x8 + OpCeilScaledMaskedFloat32x4 + OpCeilScaledMaskedFloat32x8 + OpCeilScaledMaskedFloat32x16 + OpCeilScaledMaskedFloat64x2 + OpCeilScaledMaskedFloat64x4 + OpCeilScaledMaskedFloat64x8 + OpCeilScaledResidueFloat32x4 + OpCeilScaledResidueFloat32x8 + OpCeilScaledResidueFloat32x16 + OpCeilScaledResidueFloat64x2 + OpCeilScaledResidueFloat64x4 + OpCeilScaledResidueFloat64x8 + OpCeilScaledResidueMaskedFloat32x4 + OpCeilScaledResidueMaskedFloat32x8 + OpCeilScaledResidueMaskedFloat32x16 + OpCeilScaledResidueMaskedFloat64x2 + OpCeilScaledResidueMaskedFloat64x4 + OpCeilScaledResidueMaskedFloat64x8 + OpFloorScaledFloat32x4 + OpFloorScaledFloat32x8 + OpFloorScaledFloat32x16 + OpFloorScaledFloat64x2 + OpFloorScaledFloat64x4 + OpFloorScaledFloat64x8 + OpFloorScaledMaskedFloat32x4 + OpFloorScaledMaskedFloat32x8 + OpFloorScaledMaskedFloat32x16 + OpFloorScaledMaskedFloat64x2 + OpFloorScaledMaskedFloat64x4 + OpFloorScaledMaskedFloat64x8 + OpFloorScaledResidueFloat32x4 + OpFloorScaledResidueFloat32x8 + OpFloorScaledResidueFloat32x16 + OpFloorScaledResidueFloat64x2 + OpFloorScaledResidueFloat64x4 + OpFloorScaledResidueFloat64x8 + OpFloorScaledResidueMaskedFloat32x4 + OpFloorScaledResidueMaskedFloat32x8 + OpFloorScaledResidueMaskedFloat32x16 + OpFloorScaledResidueMaskedFloat64x2 + OpFloorScaledResidueMaskedFloat64x4 + OpFloorScaledResidueMaskedFloat64x8 OpGaloisFieldAffineTransformInverseMaskedUint8x16 OpGaloisFieldAffineTransformInverseMaskedUint8x32 OpGaloisFieldAffineTransformInverseMaskedUint8x64 @@ -6194,18 +6170,30 @@ const ( OpRotateAllRightUint64x2 OpRotateAllRightUint64x4 OpRotateAllRightUint64x8 - OpRoundWithPrecisionFloat32x4 - OpRoundWithPrecisionFloat32x8 - OpRoundWithPrecisionFloat32x16 - OpRoundWithPrecisionFloat64x2 - OpRoundWithPrecisionFloat64x4 - OpRoundWithPrecisionFloat64x8 - OpRoundWithPrecisionMaskedFloat32x4 - OpRoundWithPrecisionMaskedFloat32x8 - OpRoundWithPrecisionMaskedFloat32x16 - OpRoundWithPrecisionMaskedFloat64x2 - OpRoundWithPrecisionMaskedFloat64x4 - OpRoundWithPrecisionMaskedFloat64x8 + OpRoundScaledFloat32x4 + OpRoundScaledFloat32x8 + OpRoundScaledFloat32x16 + OpRoundScaledFloat64x2 + OpRoundScaledFloat64x4 + OpRoundScaledFloat64x8 + OpRoundScaledMaskedFloat32x4 + OpRoundScaledMaskedFloat32x8 + OpRoundScaledMaskedFloat32x16 + OpRoundScaledMaskedFloat64x2 + 
OpRoundScaledMaskedFloat64x4 + OpRoundScaledMaskedFloat64x8 + OpRoundScaledResidueFloat32x4 + OpRoundScaledResidueFloat32x8 + OpRoundScaledResidueFloat32x16 + OpRoundScaledResidueFloat64x2 + OpRoundScaledResidueFloat64x4 + OpRoundScaledResidueFloat64x8 + OpRoundScaledResidueMaskedFloat32x4 + OpRoundScaledResidueMaskedFloat32x8 + OpRoundScaledResidueMaskedFloat32x16 + OpRoundScaledResidueMaskedFloat64x2 + OpRoundScaledResidueMaskedFloat64x4 + OpRoundScaledResidueMaskedFloat64x8 OpSet128Float32x8 OpSet128Float64x4 OpSet128Int8x32 @@ -6296,18 +6284,30 @@ const ( OpShiftAllRightConcatUint64x2 OpShiftAllRightConcatUint64x4 OpShiftAllRightConcatUint64x8 - OpTruncWithPrecisionFloat32x4 - OpTruncWithPrecisionFloat32x8 - OpTruncWithPrecisionFloat32x16 - OpTruncWithPrecisionFloat64x2 - OpTruncWithPrecisionFloat64x4 - OpTruncWithPrecisionFloat64x8 - OpTruncWithPrecisionMaskedFloat32x4 - OpTruncWithPrecisionMaskedFloat32x8 - OpTruncWithPrecisionMaskedFloat32x16 - OpTruncWithPrecisionMaskedFloat64x2 - OpTruncWithPrecisionMaskedFloat64x4 - OpTruncWithPrecisionMaskedFloat64x8 + OpTruncScaledFloat32x4 + OpTruncScaledFloat32x8 + OpTruncScaledFloat32x16 + OpTruncScaledFloat64x2 + OpTruncScaledFloat64x4 + OpTruncScaledFloat64x8 + OpTruncScaledMaskedFloat32x4 + OpTruncScaledMaskedFloat32x8 + OpTruncScaledMaskedFloat32x16 + OpTruncScaledMaskedFloat64x2 + OpTruncScaledMaskedFloat64x4 + OpTruncScaledMaskedFloat64x8 + OpTruncScaledResidueFloat32x4 + OpTruncScaledResidueFloat32x8 + OpTruncScaledResidueFloat32x16 + OpTruncScaledResidueFloat64x2 + OpTruncScaledResidueFloat64x4 + OpTruncScaledResidueFloat64x8 + OpTruncScaledResidueMaskedFloat32x4 + OpTruncScaledResidueMaskedFloat32x8 + OpTruncScaledResidueMaskedFloat32x16 + OpTruncScaledResidueMaskedFloat64x2 + OpTruncScaledResidueMaskedFloat64x4 + OpTruncScaledResidueMaskedFloat64x8 ) var opcodeTable = [...]opInfo{ @@ -62123,6 +62123,220 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "AddPairsFloat32x4", + argLen: 2, + generic: true, + }, + { + name: "AddPairsFloat32x8", + argLen: 2, + generic: true, + }, + { + name: "AddPairsFloat64x2", + argLen: 2, + generic: true, + }, + { + name: "AddPairsFloat64x4", + argLen: 2, + generic: true, + }, + { + name: "AddPairsInt16x8", + argLen: 2, + generic: true, + }, + { + name: "AddPairsInt16x16", + argLen: 2, + generic: true, + }, + { + name: "AddPairsInt32x4", + argLen: 2, + generic: true, + }, + { + name: "AddPairsInt32x8", + argLen: 2, + generic: true, + }, + { + name: "AddPairsSaturatedInt16x8", + argLen: 2, + generic: true, + }, + { + name: "AddPairsSaturatedInt16x16", + argLen: 2, + generic: true, + }, + { + name: "AddPairsUint16x8", + argLen: 2, + generic: true, + }, + { + name: "AddPairsUint16x16", + argLen: 2, + generic: true, + }, + { + name: "AddPairsUint32x4", + argLen: 2, + generic: true, + }, + { + name: "AddPairsUint32x8", + argLen: 2, + generic: true, + }, + { + name: "AddSaturatedInt8x16", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedInt8x32", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedInt8x64", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedInt16x8", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedInt16x16", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedInt16x32", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedInt8x16", + argLen: 3, + commutative: true, + 
generic: true, + }, + { + name: "AddSaturatedMaskedInt8x32", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedInt8x64", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedInt16x8", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedInt16x16", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedInt16x32", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedUint8x16", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedUint8x32", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedUint8x64", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedUint16x8", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedUint16x16", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedUint16x32", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedUint8x16", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedUint8x32", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedUint8x64", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedUint16x8", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedUint16x16", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedUint16x32", + argLen: 2, + commutative: true, + generic: true, + }, { name: "AddSubFloat32x4", argLen: 2, @@ -65693,66 +65907,6 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, - { - name: "MulByPowOf2Float32x4", - argLen: 2, - generic: true, - }, - { - name: "MulByPowOf2Float32x8", - argLen: 2, - generic: true, - }, - { - name: "MulByPowOf2Float32x16", - argLen: 2, - generic: true, - }, - { - name: "MulByPowOf2Float64x2", - argLen: 2, - generic: true, - }, - { - name: "MulByPowOf2Float64x4", - argLen: 2, - generic: true, - }, - { - name: "MulByPowOf2Float64x8", - argLen: 2, - generic: true, - }, - { - name: "MulByPowOf2MaskedFloat32x4", - argLen: 3, - generic: true, - }, - { - name: "MulByPowOf2MaskedFloat32x8", - argLen: 3, - generic: true, - }, - { - name: "MulByPowOf2MaskedFloat32x16", - argLen: 3, - generic: true, - }, - { - name: "MulByPowOf2MaskedFloat64x2", - argLen: 3, - generic: true, - }, - { - name: "MulByPowOf2MaskedFloat64x4", - argLen: 3, - generic: true, - }, - { - name: "MulByPowOf2MaskedFloat64x8", - argLen: 3, - generic: true, - }, { name: "MulEvenWidenInt32x4", argLen: 2, @@ -65958,113 +66112,59 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "MulLowInt16x8", + name: "MulInt16x8", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt16x16", + name: "MulInt16x16", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt16x32", + name: "MulInt16x32", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt32x4", + name: "MulInt32x4", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt32x8", + name: "MulInt32x8", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt32x16", + name: "MulInt32x16", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt64x2", + name: "MulInt64x2", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt64x4", + name: "MulInt64x4", argLen: 2, commutative: true, 
generic: true, }, { - name: "MulLowInt64x8", + name: "MulInt64x8", argLen: 2, commutative: true, generic: true, }, - { - name: "MulLowMaskedInt16x8", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt16x16", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt16x32", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt32x4", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt32x8", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt32x16", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt64x2", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt64x4", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt64x8", - argLen: 3, - commutative: true, - generic: true, - }, { name: "MulMaskedFloat32x4", argLen: 3, @@ -66101,6 +66201,60 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "MulMaskedInt16x8", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt16x16", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt16x32", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt32x4", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt32x8", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt32x16", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt64x2", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt64x4", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt64x8", + argLen: 3, + commutative: true, + generic: true, + }, { name: "NotEqualFloat32x4", argLen: 2, @@ -66707,126 +66861,6 @@ var opcodeTable = [...]opInfo{ argLen: 3, generic: true, }, - { - name: "PairwiseAddFloat32x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddFloat32x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddFloat64x2", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddFloat64x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddInt16x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddInt16x16", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddInt32x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddInt32x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddUint16x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddUint16x16", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddUint32x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddUint32x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubFloat32x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubFloat32x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubFloat64x2", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubFloat64x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubInt16x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubInt16x16", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubInt32x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubInt32x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubUint16x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubUint16x16", - argLen: 2, - generic: true, - }, - { - name: 
"PairwiseSubUint32x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubUint32x8", - argLen: 2, - generic: true, - }, { name: "Permute2Float32x4", argLen: 3, @@ -67897,290 +67931,6 @@ var opcodeTable = [...]opInfo{ argLen: 4, generic: true, }, - { - name: "SaturatedAddInt8x16", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddInt8x32", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddInt8x64", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddInt16x8", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddInt16x16", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddInt16x32", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedInt8x16", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedInt8x32", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedInt8x64", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedInt16x8", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedInt16x16", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedInt16x32", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedUint8x16", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedUint8x32", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedUint8x64", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedUint16x8", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedUint16x16", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedUint16x32", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddUint8x16", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddUint8x32", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddUint8x64", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddUint16x8", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddUint16x16", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddUint16x32", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedPairwiseAddInt16x8", - argLen: 2, - generic: true, - }, - { - name: "SaturatedPairwiseAddInt16x16", - argLen: 2, - generic: true, - }, - { - name: "SaturatedPairwiseSubInt16x8", - argLen: 2, - generic: true, - }, - { - name: "SaturatedPairwiseSubInt16x16", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubInt8x16", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubInt8x32", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubInt8x64", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubInt16x8", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubInt16x16", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubInt16x32", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubMaskedInt8x16", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedInt8x32", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedInt8x64", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedInt16x8", - argLen: 3, - 
generic: true, - }, - { - name: "SaturatedSubMaskedInt16x16", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedInt16x32", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedUint8x16", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedUint8x32", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedUint8x64", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedUint16x8", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedUint16x16", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedUint16x32", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubUint8x16", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubUint8x32", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubUint8x64", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubUint16x8", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubUint16x16", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubUint16x32", - argLen: 2, - generic: true, - }, { name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x16", argLen: 3, @@ -68241,6 +67991,66 @@ var opcodeTable = [...]opInfo{ argLen: 4, generic: true, }, + { + name: "ScaleFloat32x4", + argLen: 2, + generic: true, + }, + { + name: "ScaleFloat32x8", + argLen: 2, + generic: true, + }, + { + name: "ScaleFloat32x16", + argLen: 2, + generic: true, + }, + { + name: "ScaleFloat64x2", + argLen: 2, + generic: true, + }, + { + name: "ScaleFloat64x4", + argLen: 2, + generic: true, + }, + { + name: "ScaleFloat64x8", + argLen: 2, + generic: true, + }, + { + name: "ScaleMaskedFloat32x4", + argLen: 3, + generic: true, + }, + { + name: "ScaleMaskedFloat32x8", + argLen: 3, + generic: true, + }, + { + name: "ScaleMaskedFloat32x16", + argLen: 3, + generic: true, + }, + { + name: "ScaleMaskedFloat64x2", + argLen: 3, + generic: true, + }, + { + name: "ScaleMaskedFloat64x4", + argLen: 3, + generic: true, + }, + { + name: "ScaleMaskedFloat64x8", + argLen: 3, + generic: true, + }, { name: "ShiftAllLeftInt16x8", argLen: 2, @@ -69651,6 +69461,196 @@ var opcodeTable = [...]opInfo{ argLen: 3, generic: true, }, + { + name: "SubPairsFloat32x4", + argLen: 2, + generic: true, + }, + { + name: "SubPairsFloat32x8", + argLen: 2, + generic: true, + }, + { + name: "SubPairsFloat64x2", + argLen: 2, + generic: true, + }, + { + name: "SubPairsFloat64x4", + argLen: 2, + generic: true, + }, + { + name: "SubPairsInt16x8", + argLen: 2, + generic: true, + }, + { + name: "SubPairsInt16x16", + argLen: 2, + generic: true, + }, + { + name: "SubPairsInt32x4", + argLen: 2, + generic: true, + }, + { + name: "SubPairsInt32x8", + argLen: 2, + generic: true, + }, + { + name: "SubPairsSaturatedInt16x8", + argLen: 2, + generic: true, + }, + { + name: "SubPairsSaturatedInt16x16", + argLen: 2, + generic: true, + }, + { + name: "SubPairsUint16x8", + argLen: 2, + generic: true, + }, + { + name: "SubPairsUint16x16", + argLen: 2, + generic: true, + }, + { + name: "SubPairsUint32x4", + argLen: 2, + generic: true, + }, + { + name: "SubPairsUint32x8", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedInt8x16", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedInt8x32", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedInt8x64", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedInt16x8", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedInt16x16", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedInt16x32", + argLen: 2, + generic: true, + }, + { + 
name: "SubSaturatedMaskedInt8x16", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedInt8x32", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedInt8x64", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedInt16x8", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedInt16x16", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedInt16x32", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedUint8x16", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedUint8x32", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedUint8x64", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedUint16x8", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedUint16x16", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedUint16x32", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedUint8x16", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedUint8x32", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedUint8x64", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedUint16x8", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedUint16x16", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedUint16x32", + argLen: 2, + generic: true, + }, { name: "SubUint8x16", argLen: 2, @@ -69978,433 +69978,289 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "CeilWithPrecisionFloat32x4", + name: "CeilScaledFloat32x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "CeilWithPrecisionFloat32x8", + name: "CeilScaledFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "CeilWithPrecisionFloat32x16", + name: "CeilScaledFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "CeilWithPrecisionFloat64x2", + name: "CeilScaledFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "CeilWithPrecisionFloat64x4", + name: "CeilScaledFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "CeilWithPrecisionFloat64x8", + name: "CeilScaledFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "CeilWithPrecisionMaskedFloat32x4", + name: "CeilScaledMaskedFloat32x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "CeilWithPrecisionMaskedFloat32x8", + name: "CeilScaledMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "CeilWithPrecisionMaskedFloat32x16", + name: "CeilScaledMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "CeilWithPrecisionMaskedFloat64x2", + name: "CeilScaledMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "CeilWithPrecisionMaskedFloat64x4", + name: "CeilScaledMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "CeilWithPrecisionMaskedFloat64x8", + name: "CeilScaledMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithCeilWithPrecisionFloat32x4", + name: "CeilScaledResidueFloat32x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithCeilWithPrecisionFloat32x8", + name: "CeilScaledResidueFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithCeilWithPrecisionFloat32x16", + name: "CeilScaledResidueFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithCeilWithPrecisionFloat64x2", + name: "CeilScaledResidueFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithCeilWithPrecisionFloat64x4", + name: 
"CeilScaledResidueFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithCeilWithPrecisionFloat64x8", + name: "CeilScaledResidueFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithCeilWithPrecisionMaskedFloat32x4", + name: "CeilScaledResidueMaskedFloat32x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithCeilWithPrecisionMaskedFloat32x8", + name: "CeilScaledResidueMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithCeilWithPrecisionMaskedFloat32x16", + name: "CeilScaledResidueMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithCeilWithPrecisionMaskedFloat64x2", + name: "CeilScaledResidueMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithCeilWithPrecisionMaskedFloat64x4", + name: "CeilScaledResidueMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithCeilWithPrecisionMaskedFloat64x8", + name: "CeilScaledResidueMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithFloorWithPrecisionFloat32x4", + name: "FloorScaledFloat32x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithFloorWithPrecisionFloat32x8", + name: "FloorScaledFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithFloorWithPrecisionFloat32x16", + name: "FloorScaledFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithFloorWithPrecisionFloat64x2", + name: "FloorScaledFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithFloorWithPrecisionFloat64x4", + name: "FloorScaledFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithFloorWithPrecisionFloat64x8", + name: "FloorScaledFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithFloorWithPrecisionMaskedFloat32x4", + name: "FloorScaledMaskedFloat32x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithFloorWithPrecisionMaskedFloat32x8", + name: "FloorScaledMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithFloorWithPrecisionMaskedFloat32x16", + name: "FloorScaledMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithFloorWithPrecisionMaskedFloat64x2", + name: "FloorScaledMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithFloorWithPrecisionMaskedFloat64x4", + name: "FloorScaledMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithFloorWithPrecisionMaskedFloat64x8", + name: "FloorScaledMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithRoundWithPrecisionFloat32x4", + name: "FloorScaledResidueFloat32x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithRoundWithPrecisionFloat32x8", + name: "FloorScaledResidueFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithRoundWithPrecisionFloat32x16", + name: "FloorScaledResidueFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithRoundWithPrecisionFloat64x2", + name: "FloorScaledResidueFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithRoundWithPrecisionFloat64x4", + name: "FloorScaledResidueFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithRoundWithPrecisionFloat64x8", + name: "FloorScaledResidueFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: 
"DiffWithRoundWithPrecisionMaskedFloat32x4", + name: "FloorScaledResidueMaskedFloat32x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithRoundWithPrecisionMaskedFloat32x8", + name: "FloorScaledResidueMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithRoundWithPrecisionMaskedFloat32x16", + name: "FloorScaledResidueMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithRoundWithPrecisionMaskedFloat64x2", + name: "FloorScaledResidueMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithRoundWithPrecisionMaskedFloat64x4", + name: "FloorScaledResidueMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithRoundWithPrecisionMaskedFloat64x8", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionFloat32x4", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionFloat32x8", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionFloat32x16", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionFloat64x2", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionFloat64x4", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionFloat64x8", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionMaskedFloat32x4", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionMaskedFloat32x8", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionMaskedFloat32x16", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionMaskedFloat64x2", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionMaskedFloat64x4", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionMaskedFloat64x8", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "FloorWithPrecisionFloat32x4", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "FloorWithPrecisionFloat32x8", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "FloorWithPrecisionFloat32x16", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "FloorWithPrecisionFloat64x2", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "FloorWithPrecisionFloat64x4", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "FloorWithPrecisionFloat64x8", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "FloorWithPrecisionMaskedFloat32x4", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "FloorWithPrecisionMaskedFloat32x8", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "FloorWithPrecisionMaskedFloat32x16", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "FloorWithPrecisionMaskedFloat64x2", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "FloorWithPrecisionMaskedFloat64x4", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "FloorWithPrecisionMaskedFloat64x8", + name: "FloorScaledResidueMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, @@ -70878,73 +70734,145 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "RoundWithPrecisionFloat32x4", + name: "RoundScaledFloat32x4", auxType: 
auxInt8, argLen: 1, generic: true, }, { - name: "RoundWithPrecisionFloat32x8", + name: "RoundScaledFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundWithPrecisionFloat32x16", + name: "RoundScaledFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundWithPrecisionFloat64x2", + name: "RoundScaledFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundWithPrecisionFloat64x4", + name: "RoundScaledFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundWithPrecisionFloat64x8", + name: "RoundScaledFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundWithPrecisionMaskedFloat32x4", + name: "RoundScaledMaskedFloat32x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundWithPrecisionMaskedFloat32x8", + name: "RoundScaledMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundWithPrecisionMaskedFloat32x16", + name: "RoundScaledMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundWithPrecisionMaskedFloat64x2", + name: "RoundScaledMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundWithPrecisionMaskedFloat64x4", + name: "RoundScaledMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundWithPrecisionMaskedFloat64x8", + name: "RoundScaledMaskedFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundScaledResidueFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundScaledResidueFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundScaledResidueFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundScaledResidueFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundScaledResidueFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundScaledResidueFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundScaledResidueMaskedFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundScaledResidueMaskedFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundScaledResidueMaskedFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundScaledResidueMaskedFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundScaledResidueMaskedFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundScaledResidueMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, @@ -71490,73 +71418,145 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "TruncWithPrecisionFloat32x4", + name: "TruncScaledFloat32x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "TruncWithPrecisionFloat32x8", + name: "TruncScaledFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "TruncWithPrecisionFloat32x16", + name: "TruncScaledFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "TruncWithPrecisionFloat64x2", + name: "TruncScaledFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "TruncWithPrecisionFloat64x4", + name: "TruncScaledFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "TruncWithPrecisionFloat64x8", + name: "TruncScaledFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "TruncWithPrecisionMaskedFloat32x4", + name: "TruncScaledMaskedFloat32x4", auxType: auxInt8, argLen: 
2, generic: true, }, { - name: "TruncWithPrecisionMaskedFloat32x8", + name: "TruncScaledMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "TruncWithPrecisionMaskedFloat32x16", + name: "TruncScaledMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "TruncWithPrecisionMaskedFloat64x2", + name: "TruncScaledMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "TruncWithPrecisionMaskedFloat64x4", + name: "TruncScaledMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "TruncWithPrecisionMaskedFloat64x8", + name: "TruncScaledMaskedFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "TruncScaledResidueFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncScaledResidueFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncScaledResidueFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncScaledResidueFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncScaledResidueFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncScaledResidueFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncScaledResidueMaskedFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "TruncScaledResidueMaskedFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "TruncScaledResidueMaskedFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "TruncScaledResidueMaskedFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "TruncScaledResidueMaskedFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "TruncScaledResidueMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 82f13b43c6e..a3a7ba7ed65 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -760,9 +760,111 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAddMaskedUint8x32(v) case OpAddMaskedUint8x64: return rewriteValueAMD64_OpAddMaskedUint8x64(v) + case OpAddPairsFloat32x4: + v.Op = OpAMD64VHADDPS128 + return true + case OpAddPairsFloat32x8: + v.Op = OpAMD64VHADDPS256 + return true + case OpAddPairsFloat64x2: + v.Op = OpAMD64VHADDPD128 + return true + case OpAddPairsFloat64x4: + v.Op = OpAMD64VHADDPD256 + return true + case OpAddPairsInt16x16: + v.Op = OpAMD64VPHADDW256 + return true + case OpAddPairsInt16x8: + v.Op = OpAMD64VPHADDW128 + return true + case OpAddPairsInt32x4: + v.Op = OpAMD64VPHADDD128 + return true + case OpAddPairsInt32x8: + v.Op = OpAMD64VPHADDD256 + return true + case OpAddPairsSaturatedInt16x16: + v.Op = OpAMD64VPHADDSW256 + return true + case OpAddPairsSaturatedInt16x8: + v.Op = OpAMD64VPHADDSW128 + return true + case OpAddPairsUint16x16: + v.Op = OpAMD64VPHADDW256 + return true + case OpAddPairsUint16x8: + v.Op = OpAMD64VPHADDW128 + return true + case OpAddPairsUint32x4: + v.Op = OpAMD64VPHADDD128 + return true + case OpAddPairsUint32x8: + v.Op = OpAMD64VPHADDD256 + return true case OpAddPtr: v.Op = OpAMD64ADDQ return true + case OpAddSaturatedInt16x16: + v.Op = OpAMD64VPADDSW256 + return true + case OpAddSaturatedInt16x32: + v.Op = OpAMD64VPADDSW512 + return true + case OpAddSaturatedInt16x8: + v.Op = OpAMD64VPADDSW128 + return 
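// --- Editorial sketch (Go, illustration only; not generated code): the
// AddPairs* cases above are the renamed PairwiseAdd* ops, lowered to
// VHADDPS/VHADDPD and VPHADDW/VPHADDD. For the 128-bit integer form the
// per-lane behavior is as follows; the 256-bit forms apply the same
// pattern independently within each 128-bit half.
func addPairsInt32x4(x, y [4]int32) [4]int32 {
	// VPHADDD xmm: low half from adjacent pairs of x, high half from y.
	return [4]int32{x[0] + x[1], x[2] + x[3], y[0] + y[1], y[2] + y[3]}
}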
true + case OpAddSaturatedInt8x16: + v.Op = OpAMD64VPADDSB128 + return true + case OpAddSaturatedInt8x32: + v.Op = OpAMD64VPADDSB256 + return true + case OpAddSaturatedInt8x64: + v.Op = OpAMD64VPADDSB512 + return true + case OpAddSaturatedMaskedInt16x16: + return rewriteValueAMD64_OpAddSaturatedMaskedInt16x16(v) + case OpAddSaturatedMaskedInt16x32: + return rewriteValueAMD64_OpAddSaturatedMaskedInt16x32(v) + case OpAddSaturatedMaskedInt16x8: + return rewriteValueAMD64_OpAddSaturatedMaskedInt16x8(v) + case OpAddSaturatedMaskedInt8x16: + return rewriteValueAMD64_OpAddSaturatedMaskedInt8x16(v) + case OpAddSaturatedMaskedInt8x32: + return rewriteValueAMD64_OpAddSaturatedMaskedInt8x32(v) + case OpAddSaturatedMaskedInt8x64: + return rewriteValueAMD64_OpAddSaturatedMaskedInt8x64(v) + case OpAddSaturatedMaskedUint16x16: + return rewriteValueAMD64_OpAddSaturatedMaskedUint16x16(v) + case OpAddSaturatedMaskedUint16x32: + return rewriteValueAMD64_OpAddSaturatedMaskedUint16x32(v) + case OpAddSaturatedMaskedUint16x8: + return rewriteValueAMD64_OpAddSaturatedMaskedUint16x8(v) + case OpAddSaturatedMaskedUint8x16: + return rewriteValueAMD64_OpAddSaturatedMaskedUint8x16(v) + case OpAddSaturatedMaskedUint8x32: + return rewriteValueAMD64_OpAddSaturatedMaskedUint8x32(v) + case OpAddSaturatedMaskedUint8x64: + return rewriteValueAMD64_OpAddSaturatedMaskedUint8x64(v) + case OpAddSaturatedUint16x16: + v.Op = OpAMD64VPADDSW256 + return true + case OpAddSaturatedUint16x32: + v.Op = OpAMD64VPADDSW512 + return true + case OpAddSaturatedUint16x8: + v.Op = OpAMD64VPADDSW128 + return true + case OpAddSaturatedUint8x16: + v.Op = OpAMD64VPADDSB128 + return true + case OpAddSaturatedUint8x32: + v.Op = OpAMD64VPADDSB256 + return true + case OpAddSaturatedUint8x64: + v.Op = OpAMD64VPADDSB512 + return true case OpAddSubFloat32x4: v.Op = OpAMD64VADDSUBPS128 return true @@ -1185,30 +1287,54 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpCeilFloat64x2(v) case OpCeilFloat64x4: return rewriteValueAMD64_OpCeilFloat64x4(v) - case OpCeilWithPrecisionFloat32x16: - return rewriteValueAMD64_OpCeilWithPrecisionFloat32x16(v) - case OpCeilWithPrecisionFloat32x4: - return rewriteValueAMD64_OpCeilWithPrecisionFloat32x4(v) - case OpCeilWithPrecisionFloat32x8: - return rewriteValueAMD64_OpCeilWithPrecisionFloat32x8(v) - case OpCeilWithPrecisionFloat64x2: - return rewriteValueAMD64_OpCeilWithPrecisionFloat64x2(v) - case OpCeilWithPrecisionFloat64x4: - return rewriteValueAMD64_OpCeilWithPrecisionFloat64x4(v) - case OpCeilWithPrecisionFloat64x8: - return rewriteValueAMD64_OpCeilWithPrecisionFloat64x8(v) - case OpCeilWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x16(v) - case OpCeilWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x4(v) - case OpCeilWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x8(v) - case OpCeilWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x2(v) - case OpCeilWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x4(v) - case OpCeilWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x8(v) + case OpCeilScaledFloat32x16: + return rewriteValueAMD64_OpCeilScaledFloat32x16(v) + case OpCeilScaledFloat32x4: + return rewriteValueAMD64_OpCeilScaledFloat32x4(v) + case OpCeilScaledFloat32x8: + return rewriteValueAMD64_OpCeilScaledFloat32x8(v) + case OpCeilScaledFloat64x2: + return 
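// --- Editorial sketch (Go, illustration only): the AddSaturated* ops
// (formerly SaturatedAdd*) lower to VPADDSB/VPADDSW, whose per-lane
// behavior is signed saturating addition: the wide sum is clamped to the
// element range instead of wrapping.
func addSaturatedInt16(x, y int16) int16 {
	s := int32(x) + int32(y) // widen first so the sum itself cannot wrap
	if s > 32767 {
		return 32767 // clamp to math.MaxInt16
	}
	if s < -32768 {
		return -32768 // clamp to math.MinInt16
	}
	return int16(s)
}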
rewriteValueAMD64_OpCeilScaledFloat64x2(v) + case OpCeilScaledFloat64x4: + return rewriteValueAMD64_OpCeilScaledFloat64x4(v) + case OpCeilScaledFloat64x8: + return rewriteValueAMD64_OpCeilScaledFloat64x8(v) + case OpCeilScaledMaskedFloat32x16: + return rewriteValueAMD64_OpCeilScaledMaskedFloat32x16(v) + case OpCeilScaledMaskedFloat32x4: + return rewriteValueAMD64_OpCeilScaledMaskedFloat32x4(v) + case OpCeilScaledMaskedFloat32x8: + return rewriteValueAMD64_OpCeilScaledMaskedFloat32x8(v) + case OpCeilScaledMaskedFloat64x2: + return rewriteValueAMD64_OpCeilScaledMaskedFloat64x2(v) + case OpCeilScaledMaskedFloat64x4: + return rewriteValueAMD64_OpCeilScaledMaskedFloat64x4(v) + case OpCeilScaledMaskedFloat64x8: + return rewriteValueAMD64_OpCeilScaledMaskedFloat64x8(v) + case OpCeilScaledResidueFloat32x16: + return rewriteValueAMD64_OpCeilScaledResidueFloat32x16(v) + case OpCeilScaledResidueFloat32x4: + return rewriteValueAMD64_OpCeilScaledResidueFloat32x4(v) + case OpCeilScaledResidueFloat32x8: + return rewriteValueAMD64_OpCeilScaledResidueFloat32x8(v) + case OpCeilScaledResidueFloat64x2: + return rewriteValueAMD64_OpCeilScaledResidueFloat64x2(v) + case OpCeilScaledResidueFloat64x4: + return rewriteValueAMD64_OpCeilScaledResidueFloat64x4(v) + case OpCeilScaledResidueFloat64x8: + return rewriteValueAMD64_OpCeilScaledResidueFloat64x8(v) + case OpCeilScaledResidueMaskedFloat32x16: + return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x16(v) + case OpCeilScaledResidueMaskedFloat32x4: + return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x4(v) + case OpCeilScaledResidueMaskedFloat32x8: + return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x8(v) + case OpCeilScaledResidueMaskedFloat64x2: + return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x2(v) + case OpCeilScaledResidueMaskedFloat64x4: + return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x4(v) + case OpCeilScaledResidueMaskedFloat64x8: + return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x8(v) case OpClosureCall: v.Op = OpAMD64CALLclosure return true @@ -1409,102 +1535,6 @@ func rewriteValueAMD64(v *Value) bool { case OpCvtBoolToUint8: v.Op = OpCopy return true - case OpDiffWithCeilWithPrecisionFloat32x16: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x16(v) - case OpDiffWithCeilWithPrecisionFloat32x4: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x4(v) - case OpDiffWithCeilWithPrecisionFloat32x8: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x8(v) - case OpDiffWithCeilWithPrecisionFloat64x2: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x2(v) - case OpDiffWithCeilWithPrecisionFloat64x4: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x4(v) - case OpDiffWithCeilWithPrecisionFloat64x8: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x8(v) - case OpDiffWithCeilWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x16(v) - case OpDiffWithCeilWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x4(v) - case OpDiffWithCeilWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x8(v) - case OpDiffWithCeilWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x2(v) - case OpDiffWithCeilWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x4(v) - case OpDiffWithCeilWithPrecisionMaskedFloat64x8: - return 
rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x8(v) - case OpDiffWithFloorWithPrecisionFloat32x16: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x16(v) - case OpDiffWithFloorWithPrecisionFloat32x4: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x4(v) - case OpDiffWithFloorWithPrecisionFloat32x8: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x8(v) - case OpDiffWithFloorWithPrecisionFloat64x2: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x2(v) - case OpDiffWithFloorWithPrecisionFloat64x4: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x4(v) - case OpDiffWithFloorWithPrecisionFloat64x8: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x8(v) - case OpDiffWithFloorWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x16(v) - case OpDiffWithFloorWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x4(v) - case OpDiffWithFloorWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x8(v) - case OpDiffWithFloorWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x2(v) - case OpDiffWithFloorWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x4(v) - case OpDiffWithFloorWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x8(v) - case OpDiffWithRoundWithPrecisionFloat32x16: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x16(v) - case OpDiffWithRoundWithPrecisionFloat32x4: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x4(v) - case OpDiffWithRoundWithPrecisionFloat32x8: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x8(v) - case OpDiffWithRoundWithPrecisionFloat64x2: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x2(v) - case OpDiffWithRoundWithPrecisionFloat64x4: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x4(v) - case OpDiffWithRoundWithPrecisionFloat64x8: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x8(v) - case OpDiffWithRoundWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x16(v) - case OpDiffWithRoundWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x4(v) - case OpDiffWithRoundWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x8(v) - case OpDiffWithRoundWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x2(v) - case OpDiffWithRoundWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x4(v) - case OpDiffWithRoundWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x8(v) - case OpDiffWithTruncWithPrecisionFloat32x16: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x16(v) - case OpDiffWithTruncWithPrecisionFloat32x4: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x4(v) - case OpDiffWithTruncWithPrecisionFloat32x8: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x8(v) - case OpDiffWithTruncWithPrecisionFloat64x2: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x2(v) - case OpDiffWithTruncWithPrecisionFloat64x4: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x4(v) - case 
OpDiffWithTruncWithPrecisionFloat64x8: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x8(v) - case OpDiffWithTruncWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x16(v) - case OpDiffWithTruncWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x4(v) - case OpDiffWithTruncWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x8(v) - case OpDiffWithTruncWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x2(v) - case OpDiffWithTruncWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x4(v) - case OpDiffWithTruncWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x8(v) case OpDiv128u: v.Op = OpAMD64DIVQU2 return true @@ -1730,30 +1760,54 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpFloorFloat64x2(v) case OpFloorFloat64x4: return rewriteValueAMD64_OpFloorFloat64x4(v) - case OpFloorWithPrecisionFloat32x16: - return rewriteValueAMD64_OpFloorWithPrecisionFloat32x16(v) - case OpFloorWithPrecisionFloat32x4: - return rewriteValueAMD64_OpFloorWithPrecisionFloat32x4(v) - case OpFloorWithPrecisionFloat32x8: - return rewriteValueAMD64_OpFloorWithPrecisionFloat32x8(v) - case OpFloorWithPrecisionFloat64x2: - return rewriteValueAMD64_OpFloorWithPrecisionFloat64x2(v) - case OpFloorWithPrecisionFloat64x4: - return rewriteValueAMD64_OpFloorWithPrecisionFloat64x4(v) - case OpFloorWithPrecisionFloat64x8: - return rewriteValueAMD64_OpFloorWithPrecisionFloat64x8(v) - case OpFloorWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x16(v) - case OpFloorWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x4(v) - case OpFloorWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x8(v) - case OpFloorWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x2(v) - case OpFloorWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x4(v) - case OpFloorWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x8(v) + case OpFloorScaledFloat32x16: + return rewriteValueAMD64_OpFloorScaledFloat32x16(v) + case OpFloorScaledFloat32x4: + return rewriteValueAMD64_OpFloorScaledFloat32x4(v) + case OpFloorScaledFloat32x8: + return rewriteValueAMD64_OpFloorScaledFloat32x8(v) + case OpFloorScaledFloat64x2: + return rewriteValueAMD64_OpFloorScaledFloat64x2(v) + case OpFloorScaledFloat64x4: + return rewriteValueAMD64_OpFloorScaledFloat64x4(v) + case OpFloorScaledFloat64x8: + return rewriteValueAMD64_OpFloorScaledFloat64x8(v) + case OpFloorScaledMaskedFloat32x16: + return rewriteValueAMD64_OpFloorScaledMaskedFloat32x16(v) + case OpFloorScaledMaskedFloat32x4: + return rewriteValueAMD64_OpFloorScaledMaskedFloat32x4(v) + case OpFloorScaledMaskedFloat32x8: + return rewriteValueAMD64_OpFloorScaledMaskedFloat32x8(v) + case OpFloorScaledMaskedFloat64x2: + return rewriteValueAMD64_OpFloorScaledMaskedFloat64x2(v) + case OpFloorScaledMaskedFloat64x4: + return rewriteValueAMD64_OpFloorScaledMaskedFloat64x4(v) + case OpFloorScaledMaskedFloat64x8: + return rewriteValueAMD64_OpFloorScaledMaskedFloat64x8(v) + case OpFloorScaledResidueFloat32x16: + return rewriteValueAMD64_OpFloorScaledResidueFloat32x16(v) + case 
OpFloorScaledResidueFloat32x4: + return rewriteValueAMD64_OpFloorScaledResidueFloat32x4(v) + case OpFloorScaledResidueFloat32x8: + return rewriteValueAMD64_OpFloorScaledResidueFloat32x8(v) + case OpFloorScaledResidueFloat64x2: + return rewriteValueAMD64_OpFloorScaledResidueFloat64x2(v) + case OpFloorScaledResidueFloat64x4: + return rewriteValueAMD64_OpFloorScaledResidueFloat64x4(v) + case OpFloorScaledResidueFloat64x8: + return rewriteValueAMD64_OpFloorScaledResidueFloat64x8(v) + case OpFloorScaledResidueMaskedFloat32x16: + return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x16(v) + case OpFloorScaledResidueMaskedFloat32x4: + return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x4(v) + case OpFloorScaledResidueMaskedFloat32x8: + return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x8(v) + case OpFloorScaledResidueMaskedFloat64x2: + return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x2(v) + case OpFloorScaledResidueMaskedFloat64x4: + return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x4(v) + case OpFloorScaledResidueMaskedFloat64x8: + return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x8(v) case OpFusedMultiplyAddFloat32x16: v.Op = OpAMD64VFMADD213PS512 return true @@ -2944,36 +2998,6 @@ func rewriteValueAMD64(v *Value) bool { case OpMul8: v.Op = OpAMD64MULL return true - case OpMulByPowOf2Float32x16: - v.Op = OpAMD64VSCALEFPS512 - return true - case OpMulByPowOf2Float32x4: - v.Op = OpAMD64VSCALEFPS128 - return true - case OpMulByPowOf2Float32x8: - v.Op = OpAMD64VSCALEFPS256 - return true - case OpMulByPowOf2Float64x2: - v.Op = OpAMD64VSCALEFPD128 - return true - case OpMulByPowOf2Float64x4: - v.Op = OpAMD64VSCALEFPD256 - return true - case OpMulByPowOf2Float64x8: - v.Op = OpAMD64VSCALEFPD512 - return true - case OpMulByPowOf2MaskedFloat32x16: - return rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x16(v) - case OpMulByPowOf2MaskedFloat32x4: - return rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x4(v) - case OpMulByPowOf2MaskedFloat32x8: - return rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x8(v) - case OpMulByPowOf2MaskedFloat64x2: - return rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x2(v) - case OpMulByPowOf2MaskedFloat64x4: - return rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x4(v) - case OpMulByPowOf2MaskedFloat64x8: - return rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x8(v) case OpMulEvenWidenInt32x4: v.Op = OpAMD64VPMULDQ128 return true @@ -3064,51 +3088,33 @@ func rewriteValueAMD64(v *Value) bool { case OpMulHighUint16x8: v.Op = OpAMD64VPMULHUW128 return true - case OpMulLowInt16x16: + case OpMulInt16x16: v.Op = OpAMD64VPMULLW256 return true - case OpMulLowInt16x32: + case OpMulInt16x32: v.Op = OpAMD64VPMULLW512 return true - case OpMulLowInt16x8: + case OpMulInt16x8: v.Op = OpAMD64VPMULLW128 return true - case OpMulLowInt32x16: + case OpMulInt32x16: v.Op = OpAMD64VPMULLD512 return true - case OpMulLowInt32x4: + case OpMulInt32x4: v.Op = OpAMD64VPMULLD128 return true - case OpMulLowInt32x8: + case OpMulInt32x8: v.Op = OpAMD64VPMULLD256 return true - case OpMulLowInt64x2: + case OpMulInt64x2: v.Op = OpAMD64VPMULLQ128 return true - case OpMulLowInt64x4: + case OpMulInt64x4: v.Op = OpAMD64VPMULLQ256 return true - case OpMulLowInt64x8: + case OpMulInt64x8: v.Op = OpAMD64VPMULLQ512 return true - case OpMulLowMaskedInt16x16: - return rewriteValueAMD64_OpMulLowMaskedInt16x16(v) - case OpMulLowMaskedInt16x32: - return rewriteValueAMD64_OpMulLowMaskedInt16x32(v) - case OpMulLowMaskedInt16x8: - return rewriteValueAMD64_OpMulLowMaskedInt16x8(v) - case 
OpMulLowMaskedInt32x16: - return rewriteValueAMD64_OpMulLowMaskedInt32x16(v) - case OpMulLowMaskedInt32x4: - return rewriteValueAMD64_OpMulLowMaskedInt32x4(v) - case OpMulLowMaskedInt32x8: - return rewriteValueAMD64_OpMulLowMaskedInt32x8(v) - case OpMulLowMaskedInt64x2: - return rewriteValueAMD64_OpMulLowMaskedInt64x2(v) - case OpMulLowMaskedInt64x4: - return rewriteValueAMD64_OpMulLowMaskedInt64x4(v) - case OpMulLowMaskedInt64x8: - return rewriteValueAMD64_OpMulLowMaskedInt64x8(v) case OpMulMaskedFloat32x16: return rewriteValueAMD64_OpMulMaskedFloat32x16(v) case OpMulMaskedFloat32x4: @@ -3121,6 +3127,24 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpMulMaskedFloat64x4(v) case OpMulMaskedFloat64x8: return rewriteValueAMD64_OpMulMaskedFloat64x8(v) + case OpMulMaskedInt16x16: + return rewriteValueAMD64_OpMulMaskedInt16x16(v) + case OpMulMaskedInt16x32: + return rewriteValueAMD64_OpMulMaskedInt16x32(v) + case OpMulMaskedInt16x8: + return rewriteValueAMD64_OpMulMaskedInt16x8(v) + case OpMulMaskedInt32x16: + return rewriteValueAMD64_OpMulMaskedInt32x16(v) + case OpMulMaskedInt32x4: + return rewriteValueAMD64_OpMulMaskedInt32x4(v) + case OpMulMaskedInt32x8: + return rewriteValueAMD64_OpMulMaskedInt32x8(v) + case OpMulMaskedInt64x2: + return rewriteValueAMD64_OpMulMaskedInt64x2(v) + case OpMulMaskedInt64x4: + return rewriteValueAMD64_OpMulMaskedInt64x4(v) + case OpMulMaskedInt64x8: + return rewriteValueAMD64_OpMulMaskedInt64x8(v) case OpNeg16: v.Op = OpAMD64NEGL return true @@ -3406,78 +3430,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpPairDotProdMaskedInt16x32(v) case OpPairDotProdMaskedInt16x8: return rewriteValueAMD64_OpPairDotProdMaskedInt16x8(v) - case OpPairwiseAddFloat32x4: - v.Op = OpAMD64VHADDPS128 - return true - case OpPairwiseAddFloat32x8: - v.Op = OpAMD64VHADDPS256 - return true - case OpPairwiseAddFloat64x2: - v.Op = OpAMD64VHADDPD128 - return true - case OpPairwiseAddFloat64x4: - v.Op = OpAMD64VHADDPD256 - return true - case OpPairwiseAddInt16x16: - v.Op = OpAMD64VPHADDW256 - return true - case OpPairwiseAddInt16x8: - v.Op = OpAMD64VPHADDW128 - return true - case OpPairwiseAddInt32x4: - v.Op = OpAMD64VPHADDD128 - return true - case OpPairwiseAddInt32x8: - v.Op = OpAMD64VPHADDD256 - return true - case OpPairwiseAddUint16x16: - v.Op = OpAMD64VPHADDW256 - return true - case OpPairwiseAddUint16x8: - v.Op = OpAMD64VPHADDW128 - return true - case OpPairwiseAddUint32x4: - v.Op = OpAMD64VPHADDD128 - return true - case OpPairwiseAddUint32x8: - v.Op = OpAMD64VPHADDD256 - return true - case OpPairwiseSubFloat32x4: - v.Op = OpAMD64VHSUBPS128 - return true - case OpPairwiseSubFloat32x8: - v.Op = OpAMD64VHSUBPS256 - return true - case OpPairwiseSubFloat64x2: - v.Op = OpAMD64VHSUBPD128 - return true - case OpPairwiseSubFloat64x4: - v.Op = OpAMD64VHSUBPD256 - return true - case OpPairwiseSubInt16x16: - v.Op = OpAMD64VPHSUBW256 - return true - case OpPairwiseSubInt16x8: - v.Op = OpAMD64VPHSUBW128 - return true - case OpPairwiseSubInt32x4: - v.Op = OpAMD64VPHSUBD128 - return true - case OpPairwiseSubInt32x8: - v.Op = OpAMD64VPHSUBD256 - return true - case OpPairwiseSubUint16x16: - v.Op = OpAMD64VPHSUBW256 - return true - case OpPairwiseSubUint16x8: - v.Op = OpAMD64VPHSUBW128 - return true - case OpPairwiseSubUint32x4: - v.Op = OpAMD64VPHSUBD128 - return true - case OpPairwiseSubUint32x8: - v.Op = OpAMD64VPHSUBD256 - return true case OpPanicBounds: return rewriteValueAMD64_OpPanicBounds(v) case OpPermute2Float32x16: @@ -4152,32 +4104,56 @@ func 
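// --- Editorial sketch (Go, illustration only): the MulLow* ops are renamed
// to plain Mul* here; the VPMULL{W,D,Q} family keeps only the low half of
// the double-width product, which is ordinary wrap-around multiplication at
// the element width.
func mulInt16(x, y int16) int16 {
	return int16(int32(x) * int32(y)) // truncate the 32-bit product to 16 bits
}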
rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpRoundFloat64x2(v) case OpRoundFloat64x4: return rewriteValueAMD64_OpRoundFloat64x4(v) + case OpRoundScaledFloat32x16: + return rewriteValueAMD64_OpRoundScaledFloat32x16(v) + case OpRoundScaledFloat32x4: + return rewriteValueAMD64_OpRoundScaledFloat32x4(v) + case OpRoundScaledFloat32x8: + return rewriteValueAMD64_OpRoundScaledFloat32x8(v) + case OpRoundScaledFloat64x2: + return rewriteValueAMD64_OpRoundScaledFloat64x2(v) + case OpRoundScaledFloat64x4: + return rewriteValueAMD64_OpRoundScaledFloat64x4(v) + case OpRoundScaledFloat64x8: + return rewriteValueAMD64_OpRoundScaledFloat64x8(v) + case OpRoundScaledMaskedFloat32x16: + return rewriteValueAMD64_OpRoundScaledMaskedFloat32x16(v) + case OpRoundScaledMaskedFloat32x4: + return rewriteValueAMD64_OpRoundScaledMaskedFloat32x4(v) + case OpRoundScaledMaskedFloat32x8: + return rewriteValueAMD64_OpRoundScaledMaskedFloat32x8(v) + case OpRoundScaledMaskedFloat64x2: + return rewriteValueAMD64_OpRoundScaledMaskedFloat64x2(v) + case OpRoundScaledMaskedFloat64x4: + return rewriteValueAMD64_OpRoundScaledMaskedFloat64x4(v) + case OpRoundScaledMaskedFloat64x8: + return rewriteValueAMD64_OpRoundScaledMaskedFloat64x8(v) + case OpRoundScaledResidueFloat32x16: + return rewriteValueAMD64_OpRoundScaledResidueFloat32x16(v) + case OpRoundScaledResidueFloat32x4: + return rewriteValueAMD64_OpRoundScaledResidueFloat32x4(v) + case OpRoundScaledResidueFloat32x8: + return rewriteValueAMD64_OpRoundScaledResidueFloat32x8(v) + case OpRoundScaledResidueFloat64x2: + return rewriteValueAMD64_OpRoundScaledResidueFloat64x2(v) + case OpRoundScaledResidueFloat64x4: + return rewriteValueAMD64_OpRoundScaledResidueFloat64x4(v) + case OpRoundScaledResidueFloat64x8: + return rewriteValueAMD64_OpRoundScaledResidueFloat64x8(v) + case OpRoundScaledResidueMaskedFloat32x16: + return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x16(v) + case OpRoundScaledResidueMaskedFloat32x4: + return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x4(v) + case OpRoundScaledResidueMaskedFloat32x8: + return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x8(v) + case OpRoundScaledResidueMaskedFloat64x2: + return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x2(v) + case OpRoundScaledResidueMaskedFloat64x4: + return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x4(v) + case OpRoundScaledResidueMaskedFloat64x8: + return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x8(v) case OpRoundToEven: return rewriteValueAMD64_OpRoundToEven(v) - case OpRoundWithPrecisionFloat32x16: - return rewriteValueAMD64_OpRoundWithPrecisionFloat32x16(v) - case OpRoundWithPrecisionFloat32x4: - return rewriteValueAMD64_OpRoundWithPrecisionFloat32x4(v) - case OpRoundWithPrecisionFloat32x8: - return rewriteValueAMD64_OpRoundWithPrecisionFloat32x8(v) - case OpRoundWithPrecisionFloat64x2: - return rewriteValueAMD64_OpRoundWithPrecisionFloat64x2(v) - case OpRoundWithPrecisionFloat64x4: - return rewriteValueAMD64_OpRoundWithPrecisionFloat64x4(v) - case OpRoundWithPrecisionFloat64x8: - return rewriteValueAMD64_OpRoundWithPrecisionFloat64x8(v) - case OpRoundWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x16(v) - case OpRoundWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x4(v) - case OpRoundWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x8(v) - case OpRoundWithPrecisionMaskedFloat64x2: - return 
rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x2(v) - case OpRoundWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x4(v) - case OpRoundWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x8(v) case OpRsh16Ux16: return rewriteValueAMD64_OpRsh16Ux16(v) case OpRsh16Ux32: @@ -4257,138 +4233,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x4(v) case OpSaturatedAddDotProdMaskedInt32x8: return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v) - case OpSaturatedAddInt16x16: - v.Op = OpAMD64VPADDSW256 - return true - case OpSaturatedAddInt16x32: - v.Op = OpAMD64VPADDSW512 - return true - case OpSaturatedAddInt16x8: - v.Op = OpAMD64VPADDSW128 - return true - case OpSaturatedAddInt8x16: - v.Op = OpAMD64VPADDSB128 - return true - case OpSaturatedAddInt8x32: - v.Op = OpAMD64VPADDSB256 - return true - case OpSaturatedAddInt8x64: - v.Op = OpAMD64VPADDSB512 - return true - case OpSaturatedAddMaskedInt16x16: - return rewriteValueAMD64_OpSaturatedAddMaskedInt16x16(v) - case OpSaturatedAddMaskedInt16x32: - return rewriteValueAMD64_OpSaturatedAddMaskedInt16x32(v) - case OpSaturatedAddMaskedInt16x8: - return rewriteValueAMD64_OpSaturatedAddMaskedInt16x8(v) - case OpSaturatedAddMaskedInt8x16: - return rewriteValueAMD64_OpSaturatedAddMaskedInt8x16(v) - case OpSaturatedAddMaskedInt8x32: - return rewriteValueAMD64_OpSaturatedAddMaskedInt8x32(v) - case OpSaturatedAddMaskedInt8x64: - return rewriteValueAMD64_OpSaturatedAddMaskedInt8x64(v) - case OpSaturatedAddMaskedUint16x16: - return rewriteValueAMD64_OpSaturatedAddMaskedUint16x16(v) - case OpSaturatedAddMaskedUint16x32: - return rewriteValueAMD64_OpSaturatedAddMaskedUint16x32(v) - case OpSaturatedAddMaskedUint16x8: - return rewriteValueAMD64_OpSaturatedAddMaskedUint16x8(v) - case OpSaturatedAddMaskedUint8x16: - return rewriteValueAMD64_OpSaturatedAddMaskedUint8x16(v) - case OpSaturatedAddMaskedUint8x32: - return rewriteValueAMD64_OpSaturatedAddMaskedUint8x32(v) - case OpSaturatedAddMaskedUint8x64: - return rewriteValueAMD64_OpSaturatedAddMaskedUint8x64(v) - case OpSaturatedAddUint16x16: - v.Op = OpAMD64VPADDSW256 - return true - case OpSaturatedAddUint16x32: - v.Op = OpAMD64VPADDSW512 - return true - case OpSaturatedAddUint16x8: - v.Op = OpAMD64VPADDSW128 - return true - case OpSaturatedAddUint8x16: - v.Op = OpAMD64VPADDSB128 - return true - case OpSaturatedAddUint8x32: - v.Op = OpAMD64VPADDSB256 - return true - case OpSaturatedAddUint8x64: - v.Op = OpAMD64VPADDSB512 - return true - case OpSaturatedPairwiseAddInt16x16: - v.Op = OpAMD64VPHADDSW256 - return true - case OpSaturatedPairwiseAddInt16x8: - v.Op = OpAMD64VPHADDSW128 - return true - case OpSaturatedPairwiseSubInt16x16: - v.Op = OpAMD64VPHSUBSW256 - return true - case OpSaturatedPairwiseSubInt16x8: - v.Op = OpAMD64VPHSUBSW128 - return true - case OpSaturatedSubInt16x16: - v.Op = OpAMD64VPSUBSW256 - return true - case OpSaturatedSubInt16x32: - v.Op = OpAMD64VPSUBSW512 - return true - case OpSaturatedSubInt16x8: - v.Op = OpAMD64VPSUBSW128 - return true - case OpSaturatedSubInt8x16: - v.Op = OpAMD64VPSUBSB128 - return true - case OpSaturatedSubInt8x32: - v.Op = OpAMD64VPSUBSB256 - return true - case OpSaturatedSubInt8x64: - v.Op = OpAMD64VPSUBSB512 - return true - case OpSaturatedSubMaskedInt16x16: - return rewriteValueAMD64_OpSaturatedSubMaskedInt16x16(v) - case OpSaturatedSubMaskedInt16x32: - return rewriteValueAMD64_OpSaturatedSubMaskedInt16x32(v) - case 
OpSaturatedSubMaskedInt16x8: - return rewriteValueAMD64_OpSaturatedSubMaskedInt16x8(v) - case OpSaturatedSubMaskedInt8x16: - return rewriteValueAMD64_OpSaturatedSubMaskedInt8x16(v) - case OpSaturatedSubMaskedInt8x32: - return rewriteValueAMD64_OpSaturatedSubMaskedInt8x32(v) - case OpSaturatedSubMaskedInt8x64: - return rewriteValueAMD64_OpSaturatedSubMaskedInt8x64(v) - case OpSaturatedSubMaskedUint16x16: - return rewriteValueAMD64_OpSaturatedSubMaskedUint16x16(v) - case OpSaturatedSubMaskedUint16x32: - return rewriteValueAMD64_OpSaturatedSubMaskedUint16x32(v) - case OpSaturatedSubMaskedUint16x8: - return rewriteValueAMD64_OpSaturatedSubMaskedUint16x8(v) - case OpSaturatedSubMaskedUint8x16: - return rewriteValueAMD64_OpSaturatedSubMaskedUint8x16(v) - case OpSaturatedSubMaskedUint8x32: - return rewriteValueAMD64_OpSaturatedSubMaskedUint8x32(v) - case OpSaturatedSubMaskedUint8x64: - return rewriteValueAMD64_OpSaturatedSubMaskedUint8x64(v) - case OpSaturatedSubUint16x16: - v.Op = OpAMD64VPSUBSW256 - return true - case OpSaturatedSubUint16x32: - v.Op = OpAMD64VPSUBSW512 - return true - case OpSaturatedSubUint16x8: - v.Op = OpAMD64VPSUBSW128 - return true - case OpSaturatedSubUint8x16: - v.Op = OpAMD64VPSUBSB128 - return true - case OpSaturatedSubUint8x32: - v.Op = OpAMD64VPSUBSB256 - return true - case OpSaturatedSubUint8x64: - v.Op = OpAMD64VPSUBSB512 - return true case OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16: return rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16(v) case OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32: @@ -4419,6 +4263,36 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4(v) case OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8: return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8(v) + case OpScaleFloat32x16: + v.Op = OpAMD64VSCALEFPS512 + return true + case OpScaleFloat32x4: + v.Op = OpAMD64VSCALEFPS128 + return true + case OpScaleFloat32x8: + v.Op = OpAMD64VSCALEFPS256 + return true + case OpScaleFloat64x2: + v.Op = OpAMD64VSCALEFPD128 + return true + case OpScaleFloat64x4: + v.Op = OpAMD64VSCALEFPD256 + return true + case OpScaleFloat64x8: + v.Op = OpAMD64VSCALEFPD512 + return true + case OpScaleMaskedFloat32x16: + return rewriteValueAMD64_OpScaleMaskedFloat32x16(v) + case OpScaleMaskedFloat32x4: + return rewriteValueAMD64_OpScaleMaskedFloat32x4(v) + case OpScaleMaskedFloat32x8: + return rewriteValueAMD64_OpScaleMaskedFloat32x8(v) + case OpScaleMaskedFloat64x2: + return rewriteValueAMD64_OpScaleMaskedFloat64x2(v) + case OpScaleMaskedFloat64x4: + return rewriteValueAMD64_OpScaleMaskedFloat64x4(v) + case OpScaleMaskedFloat64x8: + return rewriteValueAMD64_OpScaleMaskedFloat64x8(v) case OpSelect0: return rewriteValueAMD64_OpSelect0(v) case OpSelect1: @@ -5446,9 +5320,111 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpSubMaskedUint8x32(v) case OpSubMaskedUint8x64: return rewriteValueAMD64_OpSubMaskedUint8x64(v) + case OpSubPairsFloat32x4: + v.Op = OpAMD64VHSUBPS128 + return true + case OpSubPairsFloat32x8: + v.Op = OpAMD64VHSUBPS256 + return true + case OpSubPairsFloat64x2: + v.Op = OpAMD64VHSUBPD128 + return true + case OpSubPairsFloat64x4: + v.Op = OpAMD64VHSUBPD256 + return true + case OpSubPairsInt16x16: + v.Op = OpAMD64VPHSUBW256 + return true + case OpSubPairsInt16x8: + v.Op = OpAMD64VPHSUBW128 + return true + case OpSubPairsInt32x4: + v.Op = OpAMD64VPHSUBD128 + return true + case 
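// --- Editorial sketch (Go, illustration only): MulByPowOf2* becomes Scale*
// above, matching VSCALEFPS/VSCALEFPD, which per Intel's SCALEF definition
// computes x * 2**floor(y) for finite inputs (the NaN/Inf/denormal cases
// are defined separately by the ISA and are not modeled here).
//
//	import "math"
//
func scale(x, y float64) float64 {
	return math.Ldexp(x, int(math.Floor(y))) // x * 2**floor(y)
}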
OpSubPairsInt32x8: + v.Op = OpAMD64VPHSUBD256 + return true + case OpSubPairsSaturatedInt16x16: + v.Op = OpAMD64VPHSUBSW256 + return true + case OpSubPairsSaturatedInt16x8: + v.Op = OpAMD64VPHSUBSW128 + return true + case OpSubPairsUint16x16: + v.Op = OpAMD64VPHSUBW256 + return true + case OpSubPairsUint16x8: + v.Op = OpAMD64VPHSUBW128 + return true + case OpSubPairsUint32x4: + v.Op = OpAMD64VPHSUBD128 + return true + case OpSubPairsUint32x8: + v.Op = OpAMD64VPHSUBD256 + return true case OpSubPtr: v.Op = OpAMD64SUBQ return true + case OpSubSaturatedInt16x16: + v.Op = OpAMD64VPSUBSW256 + return true + case OpSubSaturatedInt16x32: + v.Op = OpAMD64VPSUBSW512 + return true + case OpSubSaturatedInt16x8: + v.Op = OpAMD64VPSUBSW128 + return true + case OpSubSaturatedInt8x16: + v.Op = OpAMD64VPSUBSB128 + return true + case OpSubSaturatedInt8x32: + v.Op = OpAMD64VPSUBSB256 + return true + case OpSubSaturatedInt8x64: + v.Op = OpAMD64VPSUBSB512 + return true + case OpSubSaturatedMaskedInt16x16: + return rewriteValueAMD64_OpSubSaturatedMaskedInt16x16(v) + case OpSubSaturatedMaskedInt16x32: + return rewriteValueAMD64_OpSubSaturatedMaskedInt16x32(v) + case OpSubSaturatedMaskedInt16x8: + return rewriteValueAMD64_OpSubSaturatedMaskedInt16x8(v) + case OpSubSaturatedMaskedInt8x16: + return rewriteValueAMD64_OpSubSaturatedMaskedInt8x16(v) + case OpSubSaturatedMaskedInt8x32: + return rewriteValueAMD64_OpSubSaturatedMaskedInt8x32(v) + case OpSubSaturatedMaskedInt8x64: + return rewriteValueAMD64_OpSubSaturatedMaskedInt8x64(v) + case OpSubSaturatedMaskedUint16x16: + return rewriteValueAMD64_OpSubSaturatedMaskedUint16x16(v) + case OpSubSaturatedMaskedUint16x32: + return rewriteValueAMD64_OpSubSaturatedMaskedUint16x32(v) + case OpSubSaturatedMaskedUint16x8: + return rewriteValueAMD64_OpSubSaturatedMaskedUint16x8(v) + case OpSubSaturatedMaskedUint8x16: + return rewriteValueAMD64_OpSubSaturatedMaskedUint8x16(v) + case OpSubSaturatedMaskedUint8x32: + return rewriteValueAMD64_OpSubSaturatedMaskedUint8x32(v) + case OpSubSaturatedMaskedUint8x64: + return rewriteValueAMD64_OpSubSaturatedMaskedUint8x64(v) + case OpSubSaturatedUint16x16: + v.Op = OpAMD64VPSUBSW256 + return true + case OpSubSaturatedUint16x32: + v.Op = OpAMD64VPSUBSW512 + return true + case OpSubSaturatedUint16x8: + v.Op = OpAMD64VPSUBSW128 + return true + case OpSubSaturatedUint8x16: + v.Op = OpAMD64VPSUBSB128 + return true + case OpSubSaturatedUint8x32: + v.Op = OpAMD64VPSUBSB256 + return true + case OpSubSaturatedUint8x64: + v.Op = OpAMD64VPSUBSB512 + return true case OpSubUint16x16: v.Op = OpAMD64VPSUBW256 return true @@ -5516,30 +5492,54 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpTruncFloat64x2(v) case OpTruncFloat64x4: return rewriteValueAMD64_OpTruncFloat64x4(v) - case OpTruncWithPrecisionFloat32x16: - return rewriteValueAMD64_OpTruncWithPrecisionFloat32x16(v) - case OpTruncWithPrecisionFloat32x4: - return rewriteValueAMD64_OpTruncWithPrecisionFloat32x4(v) - case OpTruncWithPrecisionFloat32x8: - return rewriteValueAMD64_OpTruncWithPrecisionFloat32x8(v) - case OpTruncWithPrecisionFloat64x2: - return rewriteValueAMD64_OpTruncWithPrecisionFloat64x2(v) - case OpTruncWithPrecisionFloat64x4: - return rewriteValueAMD64_OpTruncWithPrecisionFloat64x4(v) - case OpTruncWithPrecisionFloat64x8: - return rewriteValueAMD64_OpTruncWithPrecisionFloat64x8(v) - case OpTruncWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x16(v) - case OpTruncWithPrecisionMaskedFloat32x4: - return 
rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x4(v) - case OpTruncWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x8(v) - case OpTruncWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x2(v) - case OpTruncWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x4(v) - case OpTruncWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x8(v) + case OpTruncScaledFloat32x16: + return rewriteValueAMD64_OpTruncScaledFloat32x16(v) + case OpTruncScaledFloat32x4: + return rewriteValueAMD64_OpTruncScaledFloat32x4(v) + case OpTruncScaledFloat32x8: + return rewriteValueAMD64_OpTruncScaledFloat32x8(v) + case OpTruncScaledFloat64x2: + return rewriteValueAMD64_OpTruncScaledFloat64x2(v) + case OpTruncScaledFloat64x4: + return rewriteValueAMD64_OpTruncScaledFloat64x4(v) + case OpTruncScaledFloat64x8: + return rewriteValueAMD64_OpTruncScaledFloat64x8(v) + case OpTruncScaledMaskedFloat32x16: + return rewriteValueAMD64_OpTruncScaledMaskedFloat32x16(v) + case OpTruncScaledMaskedFloat32x4: + return rewriteValueAMD64_OpTruncScaledMaskedFloat32x4(v) + case OpTruncScaledMaskedFloat32x8: + return rewriteValueAMD64_OpTruncScaledMaskedFloat32x8(v) + case OpTruncScaledMaskedFloat64x2: + return rewriteValueAMD64_OpTruncScaledMaskedFloat64x2(v) + case OpTruncScaledMaskedFloat64x4: + return rewriteValueAMD64_OpTruncScaledMaskedFloat64x4(v) + case OpTruncScaledMaskedFloat64x8: + return rewriteValueAMD64_OpTruncScaledMaskedFloat64x8(v) + case OpTruncScaledResidueFloat32x16: + return rewriteValueAMD64_OpTruncScaledResidueFloat32x16(v) + case OpTruncScaledResidueFloat32x4: + return rewriteValueAMD64_OpTruncScaledResidueFloat32x4(v) + case OpTruncScaledResidueFloat32x8: + return rewriteValueAMD64_OpTruncScaledResidueFloat32x8(v) + case OpTruncScaledResidueFloat64x2: + return rewriteValueAMD64_OpTruncScaledResidueFloat64x2(v) + case OpTruncScaledResidueFloat64x4: + return rewriteValueAMD64_OpTruncScaledResidueFloat64x4(v) + case OpTruncScaledResidueFloat64x8: + return rewriteValueAMD64_OpTruncScaledResidueFloat64x8(v) + case OpTruncScaledResidueMaskedFloat32x16: + return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x16(v) + case OpTruncScaledResidueMaskedFloat32x4: + return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x4(v) + case OpTruncScaledResidueMaskedFloat32x8: + return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x8(v) + case OpTruncScaledResidueMaskedFloat64x2: + return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x2(v) + case OpTruncScaledResidueMaskedFloat64x4: + return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x4(v) + case OpTruncScaledResidueMaskedFloat64x8: + return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x8(v) case OpUnsignedSignedQuadDotProdAccumulateInt32x16: v.Op = OpAMD64VPDPBUSD512 return true @@ -29162,6 +29162,222 @@ func rewriteValueAMD64_OpAddMaskedUint8x64(v *Value) bool { return true } } +func rewriteValueAMD64_OpAddSaturatedMaskedInt16x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedInt16x16 x y mask) + // result: (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedInt16x32(v *Value) bool { 
+ v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedInt16x32 x y mask) + // result: (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedInt16x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedInt16x8 x y mask) + // result: (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedInt8x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedInt8x16 x y mask) + // result: (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedInt8x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedInt8x32 x y mask) + // result: (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedInt8x64(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedInt8x64 x y mask) + // result: (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedUint16x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedUint16x16 x y mask) + // result: (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedUint16x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedUint16x32 x y mask) + // result: (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedUint16x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedUint16x8 x y mask) + // result: (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + 
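// --- Editorial sketch (Go, illustration only): each *Masked rewrite above
// follows one template: reset v to the masked AVX-512 op and convert the
// generic vector mask into a K-register operand via a fresh
// VPMOVVec<W>x<N>ToM value built with b.NewValue0. Per lane this is
// equivalent to the model below, assuming the zeroing form of AVX-512
// masking (a merge-masking form would keep the old destination lane).
func addSaturatedMaskedInt16x8(x, y [8]int16, mask [8]bool) [8]int16 {
	var dst [8]int16
	for i := range dst {
		if mask[i] {
			dst[i] = addSaturatedInt16(x[i], y[i]) // saturating add, as sketched earlier
		} // inactive lanes are left zero
	}
	return dst
}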
v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedUint8x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedUint8x16 x y mask) + // result: (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedUint8x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedUint8x32 x y mask) + // result: (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedUint8x64(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedUint8x64 x y mask) + // result: (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} func rewriteValueAMD64_OpAddr(v *Value) bool { v_0 := v.Args[0] // match: (Addr {sym} base) @@ -30521,9 +30737,9 @@ func rewriteValueAMD64_OpCeilFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledFloat32x16(v *Value) bool { v_0 := v.Args[0] - // match: (CeilWithPrecisionFloat32x16 [a] x) + // match: (CeilScaledFloat32x16 [a] x) // result: (VRNDSCALEPS512 [a+2] x) for { a := auxIntToInt8(v.AuxInt) @@ -30534,9 +30750,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledFloat32x4(v *Value) bool { v_0 := v.Args[0] - // match: (CeilWithPrecisionFloat32x4 [a] x) + // match: (CeilScaledFloat32x4 [a] x) // result: (VRNDSCALEPS128 [a+2] x) for { a := auxIntToInt8(v.AuxInt) @@ -30547,9 +30763,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledFloat32x8(v *Value) bool { v_0 := v.Args[0] - // match: (CeilWithPrecisionFloat32x8 [a] x) + // match: (CeilScaledFloat32x8 [a] x) // result: (VRNDSCALEPS256 [a+2] x) for { a := auxIntToInt8(v.AuxInt) @@ -30560,9 +30776,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledFloat64x2(v *Value) bool { v_0 := v.Args[0] - // match: (CeilWithPrecisionFloat64x2 [a] x) + // match: (CeilScaledFloat64x2 [a] x) // result: (VRNDSCALEPD128 [a+2] x) for { a := auxIntToInt8(v.AuxInt) @@ -30573,9 +30789,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledFloat64x4(v *Value) bool { v_0 := v.Args[0] - // match: (CeilWithPrecisionFloat64x4 [a] x) + // match: (CeilScaledFloat64x4 [a] x) // result: (VRNDSCALEPD256 [a+2] x) for { a 
:= auxIntToInt8(v.AuxInt) @@ -30586,9 +30802,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledFloat64x8(v *Value) bool { v_0 := v.Args[0] - // match: (CeilWithPrecisionFloat64x8 [a] x) + // match: (CeilScaledFloat64x8 [a] x) // result: (VRNDSCALEPD512 [a+2] x) for { a := auxIntToInt8(v.AuxInt) @@ -30599,11 +30815,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledMaskedFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (CeilWithPrecisionMaskedFloat32x16 [a] x mask) + // match: (CeilScaledMaskedFloat32x16 [a] x mask) // result: (VRNDSCALEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -30617,11 +30833,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledMaskedFloat32x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (CeilWithPrecisionMaskedFloat32x4 [a] x mask) + // match: (CeilScaledMaskedFloat32x4 [a] x mask) // result: (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -30635,11 +30851,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledMaskedFloat32x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (CeilWithPrecisionMaskedFloat32x8 [a] x mask) + // match: (CeilScaledMaskedFloat32x8 [a] x mask) // result: (VRNDSCALEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -30653,11 +30869,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledMaskedFloat64x2(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (CeilWithPrecisionMaskedFloat64x2 [a] x mask) + // match: (CeilScaledMaskedFloat64x2 [a] x mask) // result: (VRNDSCALEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -30671,11 +30887,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledMaskedFloat64x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (CeilWithPrecisionMaskedFloat64x4 [a] x mask) + // match: (CeilScaledMaskedFloat64x4 [a] x mask) // result: (VRNDSCALEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -30689,11 +30905,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledMaskedFloat64x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (CeilWithPrecisionMaskedFloat64x8 [a] x mask) + // match: (CeilScaledMaskedFloat64x8 [a] x mask) // result: (VRNDSCALEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ 
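// --- Editorial sketch (Go, illustration only): VRNDSCALE's imm8 packs the
// scale M (bits 7:4) with a rounding mode in the low bits (0 nearest-even,
// 1 down, 2 up, 3 toward zero); the instruction returns 2**-M * round(2**M * x).
// The "+2" in the CeilScaled rules above selects round-up, which presumes
// the generic op's aux value already carries M in its upper bits (an
// assumption here; the front end that builds these aux values is outside
// this hunk).
//
//	import "math"
//
func roundScaled(x float64, m uint, round func(float64) float64) float64 {
	s := math.Ldexp(1, int(m)) // 2**M
	return round(x*s) / s      // e.g. roundScaled(1.23, 2, math.Ceil) == 1.25
}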
-30707,6 +30923,192 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x8(v *Value) bool { return true } } +func rewriteValueAMD64_OpCeilScaledResidueFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilScaledResidueFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilScaledResidueFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilScaledResidueFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilScaledResidueFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilScaledResidueFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilScaledResidueFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (CeilScaledResidueMaskedFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (CeilScaledResidueMaskedFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (CeilScaledResidueMaskedFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + 
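
For reference on the VREDUCE rules being added here: VREDUCEPS/VREDUCEPD compute x minus x rounded to 2^-M precision, which is the "residue" the new op names refer to. In the instruction's imm8, bits 7:4 carry M and bits 1:0 the rounding control (0 nearest-even, 1 down, 2 up, 3 truncate), which is why these generated rules add 2 for Ceil*, 1 for Floor*, 0 for Round*, and 3 for the Trunc* variants to the op's AuxInt, whose scale component is presumably already shifted into the high bits by the intrinsic front end. A scalar Go model of one lane, as a sketch to pin down the semantics (not compiler code; it ignores NaN/overflow corner cases):

package main

import (
	"fmt"
	"math"
)

const (
	rcNearest = 0 // RoundScaledResidue*
	rcFloor   = 1 // FloorScaledResidue*
	rcCeil    = 2 // CeilScaledResidue*
	rcTrunc   = 3 // TruncScaledResidue* (assumed; that variant is not in this hunk)
)

// reduce models one float64 lane of VREDUCEPD: x minus x rounded
// to a multiple of 2^-m under rounding mode rc.
func reduce(x float64, m int, rc int) float64 {
	scale := math.Ldexp(1, m) // 2^m
	var r float64
	switch rc {
	case rcFloor:
		r = math.Floor(x*scale) / scale
	case rcCeil:
		r = math.Ceil(x*scale) / scale
	case rcTrunc:
		r = math.Trunc(x*scale) / scale
	default:
		r = math.RoundToEven(x*scale) / scale
	}
	return x - r
}

func main() {
	fmt.Println(reduce(2.71828, 2, rcFloor)) // 0.21828…, the residue below 1/4 granularity
}
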
return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (CeilScaledResidueMaskedFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (CeilScaledResidueMaskedFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (CeilScaledResidueMaskedFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} func rewriteValueAMD64_OpCompressFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -32596,750 +32998,6 @@ func rewriteValueAMD64_OpCvt8toMask64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x16(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithCeilWithPrecisionFloat32x16 [a] x) - // result: (VREDUCEPS512 [a+2] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS512) - v.AuxInt = int8ToAuxInt(a + 2) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithCeilWithPrecisionFloat32x4 [a] x) - // result: (VREDUCEPS128 [a+2] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS128) - v.AuxInt = int8ToAuxInt(a + 2) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithCeilWithPrecisionFloat32x8 [a] x) - // result: (VREDUCEPS256 [a+2] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS256) - v.AuxInt = int8ToAuxInt(a + 2) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x2(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithCeilWithPrecisionFloat64x2 [a] x) - // result: (VREDUCEPD128 [a+2] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD128) - v.AuxInt = int8ToAuxInt(a + 2) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithCeilWithPrecisionFloat64x4 [a] x) - // result: (VREDUCEPD256 [a+2] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD256) - v.AuxInt = int8ToAuxInt(a + 2) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x8(v *Value) bool { - v_0 := v.Args[0] - // match: 
(DiffWithCeilWithPrecisionFloat64x8 [a] x) - // result: (VREDUCEPD512 [a+2] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD512) - v.AuxInt = int8ToAuxInt(a + 2) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithCeilWithPrecisionMaskedFloat32x16 [a] x mask) - // result: (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked512) - v.AuxInt = int8ToAuxInt(a + 2) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithCeilWithPrecisionMaskedFloat32x4 [a] x mask) - // result: (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked128) - v.AuxInt = int8ToAuxInt(a + 2) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithCeilWithPrecisionMaskedFloat32x8 [a] x mask) - // result: (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked256) - v.AuxInt = int8ToAuxInt(a + 2) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x2(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithCeilWithPrecisionMaskedFloat64x2 [a] x mask) - // result: (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked128) - v.AuxInt = int8ToAuxInt(a + 2) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithCeilWithPrecisionMaskedFloat64x4 [a] x mask) - // result: (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked256) - v.AuxInt = int8ToAuxInt(a + 2) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithCeilWithPrecisionMaskedFloat64x8 [a] x mask) - // result: (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked512) - v.AuxInt = int8ToAuxInt(a + 2) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x16(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithFloorWithPrecisionFloat32x16 [a] x) - // 
result: (VREDUCEPS512 [a+1] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS512) - v.AuxInt = int8ToAuxInt(a + 1) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithFloorWithPrecisionFloat32x4 [a] x) - // result: (VREDUCEPS128 [a+1] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS128) - v.AuxInt = int8ToAuxInt(a + 1) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithFloorWithPrecisionFloat32x8 [a] x) - // result: (VREDUCEPS256 [a+1] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS256) - v.AuxInt = int8ToAuxInt(a + 1) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x2(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithFloorWithPrecisionFloat64x2 [a] x) - // result: (VREDUCEPD128 [a+1] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD128) - v.AuxInt = int8ToAuxInt(a + 1) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithFloorWithPrecisionFloat64x4 [a] x) - // result: (VREDUCEPD256 [a+1] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD256) - v.AuxInt = int8ToAuxInt(a + 1) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithFloorWithPrecisionFloat64x8 [a] x) - // result: (VREDUCEPD512 [a+1] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD512) - v.AuxInt = int8ToAuxInt(a + 1) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithFloorWithPrecisionMaskedFloat32x16 [a] x mask) - // result: (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked512) - v.AuxInt = int8ToAuxInt(a + 1) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithFloorWithPrecisionMaskedFloat32x4 [a] x mask) - // result: (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked128) - v.AuxInt = int8ToAuxInt(a + 1) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithFloorWithPrecisionMaskedFloat32x8 [a] x mask) - // result: (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked256) - v.AuxInt = int8ToAuxInt(a + 1) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x2(v *Value) bool { - v_1 := 
v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithFloorWithPrecisionMaskedFloat64x2 [a] x mask) - // result: (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked128) - v.AuxInt = int8ToAuxInt(a + 1) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithFloorWithPrecisionMaskedFloat64x4 [a] x mask) - // result: (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked256) - v.AuxInt = int8ToAuxInt(a + 1) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithFloorWithPrecisionMaskedFloat64x8 [a] x mask) - // result: (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked512) - v.AuxInt = int8ToAuxInt(a + 1) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x16(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithRoundWithPrecisionFloat32x16 [a] x) - // result: (VREDUCEPS512 [a+0] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS512) - v.AuxInt = int8ToAuxInt(a + 0) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithRoundWithPrecisionFloat32x4 [a] x) - // result: (VREDUCEPS128 [a+0] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS128) - v.AuxInt = int8ToAuxInt(a + 0) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithRoundWithPrecisionFloat32x8 [a] x) - // result: (VREDUCEPS256 [a+0] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS256) - v.AuxInt = int8ToAuxInt(a + 0) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x2(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithRoundWithPrecisionFloat64x2 [a] x) - // result: (VREDUCEPD128 [a+0] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD128) - v.AuxInt = int8ToAuxInt(a + 0) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithRoundWithPrecisionFloat64x4 [a] x) - // result: (VREDUCEPD256 [a+0] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD256) - v.AuxInt = int8ToAuxInt(a + 0) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithRoundWithPrecisionFloat64x8 [a] x) - // result: (VREDUCEPD512 [a+0] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD512) - v.AuxInt = int8ToAuxInt(a + 0) - v.AddArg(x) - return true - } -} -func 
rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithRoundWithPrecisionMaskedFloat32x16 [a] x mask) - // result: (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked512) - v.AuxInt = int8ToAuxInt(a + 0) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithRoundWithPrecisionMaskedFloat32x4 [a] x mask) - // result: (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked128) - v.AuxInt = int8ToAuxInt(a + 0) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithRoundWithPrecisionMaskedFloat32x8 [a] x mask) - // result: (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked256) - v.AuxInt = int8ToAuxInt(a + 0) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x2(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithRoundWithPrecisionMaskedFloat64x2 [a] x mask) - // result: (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked128) - v.AuxInt = int8ToAuxInt(a + 0) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithRoundWithPrecisionMaskedFloat64x4 [a] x mask) - // result: (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked256) - v.AuxInt = int8ToAuxInt(a + 0) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithRoundWithPrecisionMaskedFloat64x8 [a] x mask) - // result: (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked512) - v.AuxInt = int8ToAuxInt(a + 0) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x16(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithTruncWithPrecisionFloat32x16 [a] x) - // result: (VREDUCEPS512 [a+3] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS512) - v.AuxInt = int8ToAuxInt(a + 3) - v.AddArg(x) - return true - } -} -func 
rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithTruncWithPrecisionFloat32x4 [a] x) - // result: (VREDUCEPS128 [a+3] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS128) - v.AuxInt = int8ToAuxInt(a + 3) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithTruncWithPrecisionFloat32x8 [a] x) - // result: (VREDUCEPS256 [a+3] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS256) - v.AuxInt = int8ToAuxInt(a + 3) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x2(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithTruncWithPrecisionFloat64x2 [a] x) - // result: (VREDUCEPD128 [a+3] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD128) - v.AuxInt = int8ToAuxInt(a + 3) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithTruncWithPrecisionFloat64x4 [a] x) - // result: (VREDUCEPD256 [a+3] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD256) - v.AuxInt = int8ToAuxInt(a + 3) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithTruncWithPrecisionFloat64x8 [a] x) - // result: (VREDUCEPD512 [a+3] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD512) - v.AuxInt = int8ToAuxInt(a + 3) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithTruncWithPrecisionMaskedFloat32x16 [a] x mask) - // result: (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked512) - v.AuxInt = int8ToAuxInt(a + 3) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithTruncWithPrecisionMaskedFloat32x4 [a] x mask) - // result: (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked128) - v.AuxInt = int8ToAuxInt(a + 3) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithTruncWithPrecisionMaskedFloat32x8 [a] x mask) - // result: (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked256) - v.AuxInt = int8ToAuxInt(a + 3) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x2(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithTruncWithPrecisionMaskedFloat64x2 [a] x mask) - // result: (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask)) - for { - a 
:= auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked128) - v.AuxInt = int8ToAuxInt(a + 3) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithTruncWithPrecisionMaskedFloat64x4 [a] x mask) - // result: (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked256) - v.AuxInt = int8ToAuxInt(a + 3) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithTruncWithPrecisionMaskedFloat64x8 [a] x mask) - // result: (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked512) - v.AuxInt = int8ToAuxInt(a + 3) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} func rewriteValueAMD64_OpDiv16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -34731,9 +34389,9 @@ func rewriteValueAMD64_OpFloorFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledFloat32x16(v *Value) bool { v_0 := v.Args[0] - // match: (FloorWithPrecisionFloat32x16 [a] x) + // match: (FloorScaledFloat32x16 [a] x) // result: (VRNDSCALEPS512 [a+1] x) for { a := auxIntToInt8(v.AuxInt) @@ -34744,9 +34402,9 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledFloat32x4(v *Value) bool { v_0 := v.Args[0] - // match: (FloorWithPrecisionFloat32x4 [a] x) + // match: (FloorScaledFloat32x4 [a] x) // result: (VRNDSCALEPS128 [a+1] x) for { a := auxIntToInt8(v.AuxInt) @@ -34757,9 +34415,9 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledFloat32x8(v *Value) bool { v_0 := v.Args[0] - // match: (FloorWithPrecisionFloat32x8 [a] x) + // match: (FloorScaledFloat32x8 [a] x) // result: (VRNDSCALEPS256 [a+1] x) for { a := auxIntToInt8(v.AuxInt) @@ -34770,9 +34428,9 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledFloat64x2(v *Value) bool { v_0 := v.Args[0] - // match: (FloorWithPrecisionFloat64x2 [a] x) + // match: (FloorScaledFloat64x2 [a] x) // result: (VRNDSCALEPD128 [a+1] x) for { a := auxIntToInt8(v.AuxInt) @@ -34783,9 +34441,9 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledFloat64x4(v *Value) bool { v_0 := v.Args[0] - // match: (FloorWithPrecisionFloat64x4 [a] x) + // match: (FloorScaledFloat64x4 [a] x) // result: (VRNDSCALEPD256 [a+1] x) for { a := auxIntToInt8(v.AuxInt) @@ -34796,9 +34454,9 @@ func 
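
Taken together, the deletions in the hunks above are the other half of a mechanical rename: every DiffWith{Ceil,Floor,Round,Trunc}WithPrecision rule removed here reappears as the corresponding {Ceil,Floor,Round,Trunc}ScaledResidue rule (the Ceil and Floor variants are visible nearby in this diff, the Round variant further down; the Trunc variant presumably sits elsewhere in the patch), with the identical VREDUCE lowering and the same +2/+1/+0/+3 immediate. No generated rule changes behavior, only its name.
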
rewriteValueAMD64_OpFloorWithPrecisionFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledFloat64x8(v *Value) bool { v_0 := v.Args[0] - // match: (FloorWithPrecisionFloat64x8 [a] x) + // match: (FloorScaledFloat64x8 [a] x) // result: (VRNDSCALEPD512 [a+1] x) for { a := auxIntToInt8(v.AuxInt) @@ -34809,11 +34467,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledMaskedFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (FloorWithPrecisionMaskedFloat32x16 [a] x mask) + // match: (FloorScaledMaskedFloat32x16 [a] x mask) // result: (VRNDSCALEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -34827,11 +34485,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledMaskedFloat32x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (FloorWithPrecisionMaskedFloat32x4 [a] x mask) + // match: (FloorScaledMaskedFloat32x4 [a] x mask) // result: (VRNDSCALEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -34845,11 +34503,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledMaskedFloat32x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (FloorWithPrecisionMaskedFloat32x8 [a] x mask) + // match: (FloorScaledMaskedFloat32x8 [a] x mask) // result: (VRNDSCALEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -34863,11 +34521,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledMaskedFloat64x2(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (FloorWithPrecisionMaskedFloat64x2 [a] x mask) + // match: (FloorScaledMaskedFloat64x2 [a] x mask) // result: (VRNDSCALEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -34881,11 +34539,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledMaskedFloat64x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (FloorWithPrecisionMaskedFloat64x4 [a] x mask) + // match: (FloorScaledMaskedFloat64x4 [a] x mask) // result: (VRNDSCALEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -34899,11 +34557,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledMaskedFloat64x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (FloorWithPrecisionMaskedFloat64x8 [a] x mask) + // match: (FloorScaledMaskedFloat64x8 [a] x mask) // result: (VRNDSCALEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -34917,6 +34575,192 @@ func 
rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x8(v *Value) bool { return true } } +func rewriteValueAMD64_OpFloorScaledResidueFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorScaledResidueFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorScaledResidueFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorScaledResidueFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorScaledResidueFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorScaledResidueFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorScaledResidueFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (FloorScaledResidueMaskedFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (FloorScaledResidueMaskedFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (FloorScaledResidueMaskedFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true 
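
A note on a pattern repeated throughout this file: every masked rule wraps its mask operand in a VPMOVVec*ToM pseudo-op before handing it to the masked instruction. The generic ops carry masks as ordinary vector values, and these conversions materialize them as AVX-512 k-register operands. A toy scalar model of what that conversion means, with hypothetical names rather than the compiler's representation:

package main

import "fmt"

// vecToMask models VPMOVVec8x16ToM for a few lanes: a lane-wise
// vector mask (all-ones or all-zeros per lane in hardware) becomes
// a k-register-style bitmask with one bit per lane.
func vecToMask(lanes []int8) uint16 {
	var k uint16
	for i, l := range lanes {
		if l != 0 { // lane is "true"
			k |= 1 << uint(i)
		}
	}
	return k
}

func main() {
	fmt.Printf("%04b\n", vecToMask([]int8{-1, 0, -1, -1})) // 1101
}
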
+ } +} +func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (FloorScaledResidueMaskedFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (FloorScaledResidueMaskedFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (FloorScaledResidueMaskedFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} func rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat32x16(v *Value) bool { v_3 := v.Args[3] v_2 := v.Args[2] @@ -43583,114 +43427,6 @@ func rewriteValueAMD64_OpMove(v *Value) bool { } return false } -func rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulByPowOf2MaskedFloat32x16 x y mask) - // result: (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VSCALEFPSMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x4(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulByPowOf2MaskedFloat32x4 x y mask) - // result: (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VSCALEFPSMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulByPowOf2MaskedFloat32x8 x y mask) - // result: (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VSCALEFPSMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x2(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulByPowOf2MaskedFloat64x2 x y mask) - // result: (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VSCALEFPDMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - 
v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x4(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulByPowOf2MaskedFloat64x4 x y mask) - // result: (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VSCALEFPDMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulByPowOf2MaskedFloat64x8 x y mask) - // result: (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VSCALEFPDMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} func rewriteValueAMD64_OpMulEvenWidenMaskedInt64x2(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -43907,168 +43643,6 @@ func rewriteValueAMD64_OpMulHighMaskedUint16x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpMulLowMaskedInt16x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt16x16 x y mask) - // result: (VPMULLWMasked256 x y (VPMOVVec16x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt16x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt16x32 x y mask) - // result: (VPMULLWMasked512 x y (VPMOVVec16x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt16x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt16x8 x y mask) - // result: (VPMULLWMasked128 x y (VPMOVVec16x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt32x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt32x16 x y mask) - // result: (VPMULLDMasked512 x y (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLDMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt32x4(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt32x4 x y mask) - // result: (VPMULLDMasked128 x y (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLDMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt32x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: 
(MulLowMaskedInt32x8 x y mask) - // result: (VPMULLDMasked256 x y (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLDMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt64x2(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt64x2 x y mask) - // result: (VPMULLQMasked128 x y (VPMOVVec64x2ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLQMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt64x4(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt64x4 x y mask) - // result: (VPMULLQMasked256 x y (VPMOVVec64x4ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLQMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt64x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt64x8 x y mask) - // result: (VPMULLQMasked512 x y (VPMOVVec64x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLQMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} func rewriteValueAMD64_OpMulMaskedFloat32x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -44177,6 +43751,168 @@ func rewriteValueAMD64_OpMulMaskedFloat64x8(v *Value) bool { return true } } +func rewriteValueAMD64_OpMulMaskedInt16x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt16x16 x y mask) + // result: (VPMULLWMasked256 x y (VPMOVVec16x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt16x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt16x32 x y mask) + // result: (VPMULLWMasked512 x y (VPMOVVec16x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt16x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt16x8 x y mask) + // result: (VPMULLWMasked128 x y (VPMOVVec16x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt32x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt32x16 x y mask) + // result: (VPMULLDMasked512 x y (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + 
v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt32x4(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt32x4 x y mask) + // result: (VPMULLDMasked128 x y (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt32x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt32x8 x y mask) + // result: (VPMULLDMasked256 x y (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt64x2(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt64x2 x y mask) + // result: (VPMULLQMasked128 x y (VPMOVVec64x2ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLQMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt64x4(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt64x4 x y mask) + // result: (VPMULLQMasked256 x y (VPMOVVec64x4ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLQMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt64x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt64x8 x y mask) + // result: (VPMULLQMasked512 x y (VPMOVVec64x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLQMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} func rewriteValueAMD64_OpNeg32F(v *Value) bool { v_0 := v.Args[0] b := v.Block @@ -48243,21 +47979,9 @@ func rewriteValueAMD64_OpRoundFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundToEven(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledFloat32x16(v *Value) bool { v_0 := v.Args[0] - // match: (RoundToEven x) - // result: (ROUNDSD [0] x) - for { - x := v_0 - v.reset(OpAMD64ROUNDSD) - v.AuxInt = int8ToAuxInt(0) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpRoundWithPrecisionFloat32x16(v *Value) bool { - v_0 := v.Args[0] - // match: (RoundWithPrecisionFloat32x16 [a] x) + // match: (RoundScaledFloat32x16 [a] x) // result: (VRNDSCALEPS512 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -48268,9 +47992,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledFloat32x4(v *Value) bool { v_0 := v.Args[0] - // match: (RoundWithPrecisionFloat32x4 [a] x) + // match: (RoundScaledFloat32x4 [a] x) // result: (VRNDSCALEPS128 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -48281,9 +48005,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionFloat32x8(v 
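
The MulLowMasked*→MulMasked* rename in the hunks above loses no information: VPMULL{W,D,Q} keeps the low half of each widening product, and for same-width operands that is exactly the ordinary wrapping multiply, so plain Mul is arguably the more accurate generic name. A short check of that identity, offered as an illustration rather than anything from the patch:

package main

import "fmt"

func main() {
	// VPMULLW keeps the low 16 bits of each 16x16→32 product —
	// the same result as Go's wrapping int16 multiply.
	a, b := int16(300), int16(500)
	wide := int32(a) * int32(b)
	fmt.Println(a*b == int16(wide)) // true
}
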
*Value) bool { +func rewriteValueAMD64_OpRoundScaledFloat32x8(v *Value) bool { v_0 := v.Args[0] - // match: (RoundWithPrecisionFloat32x8 [a] x) + // match: (RoundScaledFloat32x8 [a] x) // result: (VRNDSCALEPS256 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -48294,9 +48018,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledFloat64x2(v *Value) bool { v_0 := v.Args[0] - // match: (RoundWithPrecisionFloat64x2 [a] x) + // match: (RoundScaledFloat64x2 [a] x) // result: (VRNDSCALEPD128 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -48307,9 +48031,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledFloat64x4(v *Value) bool { v_0 := v.Args[0] - // match: (RoundWithPrecisionFloat64x4 [a] x) + // match: (RoundScaledFloat64x4 [a] x) // result: (VRNDSCALEPD256 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -48320,9 +48044,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledFloat64x8(v *Value) bool { v_0 := v.Args[0] - // match: (RoundWithPrecisionFloat64x8 [a] x) + // match: (RoundScaledFloat64x8 [a] x) // result: (VRNDSCALEPD512 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -48333,11 +48057,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledMaskedFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundWithPrecisionMaskedFloat32x16 [a] x mask) + // match: (RoundScaledMaskedFloat32x16 [a] x mask) // result: (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -48351,11 +48075,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledMaskedFloat32x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundWithPrecisionMaskedFloat32x4 [a] x mask) + // match: (RoundScaledMaskedFloat32x4 [a] x mask) // result: (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -48369,11 +48093,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledMaskedFloat32x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundWithPrecisionMaskedFloat32x8 [a] x mask) + // match: (RoundScaledMaskedFloat32x8 [a] x mask) // result: (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -48387,11 +48111,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledMaskedFloat64x2(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundWithPrecisionMaskedFloat64x2 [a] x mask) + // match: (RoundScaledMaskedFloat64x2 [a] x mask) // result: (VRNDSCALEPDMasked128 [a+0] x 
(VPMOVVec64x2ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -48405,11 +48129,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledMaskedFloat64x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundWithPrecisionMaskedFloat64x4 [a] x mask) + // match: (RoundScaledMaskedFloat64x4 [a] x mask) // result: (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -48423,11 +48147,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledMaskedFloat64x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundWithPrecisionMaskedFloat64x8 [a] x mask) + // match: (RoundScaledMaskedFloat64x8 [a] x mask) // result: (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -48441,6 +48165,204 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x8(v *Value) bool { return true } } +func rewriteValueAMD64_OpRoundScaledResidueFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundScaledResidueFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundScaledResidueFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundScaledResidueFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundScaledResidueFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundScaledResidueFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundScaledResidueFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (RoundScaledResidueMaskedFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := 
b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (RoundScaledResidueMaskedFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (RoundScaledResidueMaskedFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (RoundScaledResidueMaskedFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (RoundScaledResidueMaskedFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (RoundScaledResidueMaskedFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpRoundToEven(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundToEven x) + // result: (ROUNDSD [0] x) + for { + x := v_0 + v.reset(OpAMD64ROUNDSD) + v.AuxInt = int8ToAuxInt(0) + v.AddArg(x) + return true + } +} func rewriteValueAMD64_OpRsh16Ux16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -49829,438 +49751,6 @@ func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpSaturatedAddMaskedInt16x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedInt16x16 x y mask) - // result: (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, 
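
OpRoundToEven itself is untouched by this patch: the rule deleted a few hunks up is re-added verbatim here, after the new RoundScaledResidue block, so only its position in the generated file changes. Its lowering, ROUNDSD with immediate 0, selects round-to-nearest-ties-to-even, the same contract math.RoundToEven documents:

package main

import (
	"fmt"
	"math"
)

func main() {
	// ROUNDSD [0]: round to nearest, ties to even.
	fmt.Println(math.RoundToEven(2.5), math.RoundToEven(3.5)) // 2 4
}
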
y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedInt16x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedInt16x32 x y mask) - // result: (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedInt16x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedInt16x8 x y mask) - // result: (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedInt8x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedInt8x16 x y mask) - // result: (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSBMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedInt8x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedInt8x32 x y mask) - // result: (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSBMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedInt8x64(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedInt8x64 x y mask) - // result: (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSBMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedUint16x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedUint16x16 x y mask) - // result: (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedUint16x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedUint16x32 x y mask) - // result: (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedUint16x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedUint16x8 x y mask) - // result: (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - 
v.reset(OpAMD64VPADDSWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedUint8x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedUint8x16 x y mask) - // result: (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSBMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedUint8x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedUint8x32 x y mask) - // result: (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSBMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedUint8x64(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedUint8x64 x y mask) - // result: (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSBMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedInt16x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedInt16x16 x y mask) - // result: (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedInt16x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedInt16x32 x y mask) - // result: (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedInt16x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedInt16x8 x y mask) - // result: (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedInt8x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedInt8x16 x y mask) - // result: (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSBMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedInt8x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedInt8x32 x y 
mask) - // result: (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSBMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedInt8x64(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedInt8x64 x y mask) - // result: (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSBMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedUint16x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedUint16x16 x y mask) - // result: (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedUint16x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedUint16x32 x y mask) - // result: (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedUint16x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedUint16x8 x y mask) - // result: (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedUint8x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedUint8x16 x y mask) - // result: (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSBMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedUint8x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedUint8x32 x y mask) - // result: (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSBMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedUint8x64(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedUint8x64 x y mask) - // result: (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSBMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} func 
rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -50375,6 +49865,114 @@ func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32 return true } } +func rewriteValueAMD64_OpScaleMaskedFloat32x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ScaleMaskedFloat32x16 x y mask) + // result: (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VSCALEFPSMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpScaleMaskedFloat32x4(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ScaleMaskedFloat32x4 x y mask) + // result: (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VSCALEFPSMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpScaleMaskedFloat32x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ScaleMaskedFloat32x8 x y mask) + // result: (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VSCALEFPSMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpScaleMaskedFloat64x2(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ScaleMaskedFloat64x2 x y mask) + // result: (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VSCALEFPDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpScaleMaskedFloat64x4(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ScaleMaskedFloat64x4 x y mask) + // result: (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VSCALEFPDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpScaleMaskedFloat64x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ScaleMaskedFloat64x8 x y mask) + // result: (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VSCALEFPDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} func rewriteValueAMD64_OpSelect0(v *Value) bool { v_0 := v.Args[0] b := v.Block @@ -54763,6 +54361,222 @@ func rewriteValueAMD64_OpSubMaskedUint8x64(v *Value) bool { return true } } +func rewriteValueAMD64_OpSubSaturatedMaskedInt16x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedInt16x16 x y mask) + // result: (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + 
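
The new ScaleMasked rules map to VSCALEFPS/VSCALEFPD. Per the SDM, VSCALEF multiplies each lane of the first operand by two raised to the floor of the corresponding lane of the second operand; a one-lane sketch (illustrative only):

	import "math"

	// scale models one lane of VSCALEFPD: x * 2^floor(y).
	func scale(x, y float64) float64 {
		return x * math.Exp2(math.Floor(y))
	}
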
v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedInt16x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedInt16x32 x y mask) + // result: (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedInt16x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedInt16x8 x y mask) + // result: (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedInt8x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedInt8x16 x y mask) + // result: (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedInt8x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedInt8x32 x y mask) + // result: (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedInt8x64(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedInt8x64 x y mask) + // result: (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedUint16x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedUint16x16 x y mask) + // result: (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedUint16x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedUint16x32 x y mask) + // result: (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedUint16x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedUint16x8 x y mask) + // result: (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + 
v.reset(OpAMD64VPSUBSWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedUint8x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedUint8x16 x y mask) + // result: (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedUint8x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedUint8x32 x y mask) + // result: (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedUint8x64(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedUint8x64 x y mask) + // result: (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} func rewriteValueAMD64_OpTrunc(v *Value) bool { v_0 := v.Args[0] // match: (Trunc x) @@ -54823,9 +54637,9 @@ func rewriteValueAMD64_OpTruncFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledFloat32x16(v *Value) bool { v_0 := v.Args[0] - // match: (TruncWithPrecisionFloat32x16 [a] x) + // match: (TruncScaledFloat32x16 [a] x) // result: (VRNDSCALEPS512 [a+3] x) for { a := auxIntToInt8(v.AuxInt) @@ -54836,9 +54650,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledFloat32x4(v *Value) bool { v_0 := v.Args[0] - // match: (TruncWithPrecisionFloat32x4 [a] x) + // match: (TruncScaledFloat32x4 [a] x) // result: (VRNDSCALEPS128 [a+3] x) for { a := auxIntToInt8(v.AuxInt) @@ -54849,9 +54663,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledFloat32x8(v *Value) bool { v_0 := v.Args[0] - // match: (TruncWithPrecisionFloat32x8 [a] x) + // match: (TruncScaledFloat32x8 [a] x) // result: (VRNDSCALEPS256 [a+3] x) for { a := auxIntToInt8(v.AuxInt) @@ -54862,9 +54676,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledFloat64x2(v *Value) bool { v_0 := v.Args[0] - // match: (TruncWithPrecisionFloat64x2 [a] x) + // match: (TruncScaledFloat64x2 [a] x) // result: (VRNDSCALEPD128 [a+3] x) for { a := auxIntToInt8(v.AuxInt) @@ -54875,9 +54689,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledFloat64x4(v *Value) bool { v_0 := 
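
The SaturatedAddMasked*/SaturatedSubMasked* rules removed above reappear as AddSaturatedMasked*/SubSaturatedMasked*; the lowering to the masked VPADDS*/VPSUBS* instructions is unchanged. These clamp on overflow instead of wrapping. A scalar sketch of one signed-byte lane (SubSaturated is the same with a-b):

	// addSat8 models one lane of VPADDSB: saturating signed addition.
	func addSat8(a, b int8) int8 {
		s := int16(a) + int16(b) // widen so the true sum is representable
		if s > 127 {
			return 127 // clamp at MaxInt8 instead of wrapping
		}
		if s < -128 {
			return -128 // clamp at MinInt8
		}
		return int8(s)
	}
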
v.Args[0] - // match: (TruncWithPrecisionFloat64x4 [a] x) + // match: (TruncScaledFloat64x4 [a] x) // result: (VRNDSCALEPD256 [a+3] x) for { a := auxIntToInt8(v.AuxInt) @@ -54888,9 +54702,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledFloat64x8(v *Value) bool { v_0 := v.Args[0] - // match: (TruncWithPrecisionFloat64x8 [a] x) + // match: (TruncScaledFloat64x8 [a] x) // result: (VRNDSCALEPD512 [a+3] x) for { a := auxIntToInt8(v.AuxInt) @@ -54901,11 +54715,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledMaskedFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (TruncWithPrecisionMaskedFloat32x16 [a] x mask) + // match: (TruncScaledMaskedFloat32x16 [a] x mask) // result: (VRNDSCALEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -54919,11 +54733,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledMaskedFloat32x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (TruncWithPrecisionMaskedFloat32x4 [a] x mask) + // match: (TruncScaledMaskedFloat32x4 [a] x mask) // result: (VRNDSCALEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -54937,11 +54751,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledMaskedFloat32x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (TruncWithPrecisionMaskedFloat32x8 [a] x mask) + // match: (TruncScaledMaskedFloat32x8 [a] x mask) // result: (VRNDSCALEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -54955,11 +54769,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledMaskedFloat64x2(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (TruncWithPrecisionMaskedFloat64x2 [a] x mask) + // match: (TruncScaledMaskedFloat64x2 [a] x mask) // result: (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -54973,11 +54787,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledMaskedFloat64x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (TruncWithPrecisionMaskedFloat64x4 [a] x mask) + // match: (TruncScaledMaskedFloat64x4 [a] x mask) // result: (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -54991,11 +54805,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledMaskedFloat64x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: 
(TruncWithPrecisionMaskedFloat64x8 [a] x mask) + // match: (TruncScaledMaskedFloat64x8 [a] x mask) // result: (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -55009,6 +54823,192 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x8(v *Value) bool { return true } } +func rewriteValueAMD64_OpTruncScaledResidueFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (TruncScaledResidueFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (TruncScaledResidueFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (TruncScaledResidueFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (TruncScaledResidueFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (TruncScaledResidueFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (TruncScaledResidueFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (TruncScaledResidueMaskedFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (TruncScaledResidueMaskedFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (TruncScaledResidueMaskedFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask)) + for { + a := 
auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (TruncScaledResidueMaskedFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (TruncScaledResidueMaskedFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (TruncScaledResidueMaskedFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} func rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x16(v *Value) bool { v_3 := v.Args[3] v_2 := v.Args[2] diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 7a7367ee1e7..511974ffa1b 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -101,6 +101,44 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
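
The constant folded into AuxInt in these rules composes the full VRNDSCALE/VREDUCE immediate: the SDM places the scale (number of fraction bits to keep) in imm8[7:4] and the rounding mode in imm8[1:0]. RoundScaled* adds 0 (nearest-even) and TruncScaled* adds 3 (toward zero) above; presumably the Floor and Ceil variants, outside this excerpt, add 1 and 2 to match the mode encodings. A sketch of the composition, assuming the intrinsic builder has already shifted the user's scale into the high nibble:

	const (
		rcNearest = 0 // RoundScaled*:  [a+0]
		rcDown    = 1 // FloorScaled*:  [a+1] (assumed; not shown in this hunk)
		rcUp      = 2 // CeilScaled*:   [a+2] (assumed; not shown in this hunk)
		rcTrunc   = 3 // TruncScaled*:  [a+3]
	)

	// rndscaleImm composes imm8 = scale<<4 | mode; scale must fit in 4 bits.
	// e.g. TruncScaled with 2 fraction bits => 0x23.
	func rndscaleImm(scale, mode uint8) int8 {
		return int8(scale<<4 | mode)
	}
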
addF(simdPackage, "Uint64x2.AddMasked", opLen3(ssa.OpAddMaskedUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.AddMasked", opLen3(ssa.OpAddMaskedUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.AddMasked", opLen3(ssa.OpAddMaskedUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.AddPairs", opLen2(ssa.OpAddPairsFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.AddPairs", opLen2(ssa.OpAddPairsFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.AddPairs", opLen2(ssa.OpAddPairsInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.AddPairs", opLen2(ssa.OpAddPairsInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x4.AddPairs", opLen2(ssa.OpAddPairsInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.AddPairs", opLen2(ssa.OpAddPairsInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x8.AddPairs", opLen2(ssa.OpAddPairsUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.AddPairs", opLen2(ssa.OpAddPairsUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x4.AddPairs", opLen2(ssa.OpAddPairsUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.AddPairs", opLen2(ssa.OpAddPairsUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.AddSaturated", opLen2(ssa.OpAddSaturatedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.AddSaturated", opLen2(ssa.OpAddSaturatedUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.AddSaturated", opLen2(ssa.OpAddSaturatedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.AddSaturated", opLen2(ssa.OpAddSaturatedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.AddSaturated", opLen2(ssa.OpAddSaturatedUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.AddSaturated", opLen2(ssa.OpAddSaturatedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.AddSaturated", opLen2(ssa.OpAddSaturatedUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt16x8, 
types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.AddSub", opLen2(ssa.OpAddSubFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.AddSub", opLen2(ssa.OpAddSubFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.AddSub", opLen2(ssa.OpAddSubFloat64x2, types.TypeVec128), sys.AMD64) @@ -217,18 +255,30 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float32x8.Ceil", opLen1(ssa.OpCeilFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.Ceil", opLen1(ssa.OpCeilFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Ceil", opLen1(ssa.OpCeilFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x4.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat32x8, types.TypeVec256, 4), sys.AMD64) + 
addF(simdPackage, "Float32x16.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) addF(simdPackage, "Float32x4.Compress", opLen2(ssa.OpCompressFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Compress", opLen2(ssa.OpCompressFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Compress", opLen2(ssa.OpCompressFloat32x16, types.TypeVec512), sys.AMD64) @@ -271,54 +321,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Float32x4.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), 
sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, 
"Float32x16.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) addF(simdPackage, "Float32x4.Div", opLen2(ssa.OpDivFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Div", opLen2(ssa.OpDivFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Div", opLen2(ssa.OpDivFloat32x16, types.TypeVec512), sys.AMD64) @@ -398,18 +400,30 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float32x8.Floor", opLen1(ssa.OpFloorFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.Floor", opLen1(ssa.OpFloorFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Floor", opLen1(ssa.OpFloorFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x4.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, 
"Float64x8.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) addF(simdPackage, "Float32x4.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x16, types.TypeVec512), sys.AMD64) @@ -860,18 +874,15 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Float64x2.Mul", opLen2(ssa.OpMulFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Mul", opLen2(ssa.OpMulFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.Mul", opLen2(ssa.OpMulFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.Mul", opLen2(ssa.OpMulInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.Mul", opLen2(ssa.OpMulInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.Mul", opLen2(ssa.OpMulInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.Mul", opLen2(ssa.OpMulInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.Mul", opLen2(ssa.OpMulInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.Mul", opLen2(ssa.OpMulInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x2.Mul", opLen2(ssa.OpMulInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x4.Mul", opLen2(ssa.OpMulInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.Mul", opLen2(ssa.OpMulInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int64x2.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt64x2, types.TypeVec128), sys.AMD64) @@ -900,30 +911,21 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint16x8.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x16.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x32.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.MulLow", opLen2(ssa.OpMulLowInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.MulLow", opLen2(ssa.OpMulLowInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.MulLow", opLen2(ssa.OpMulLowInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.MulLow", opLen2(ssa.OpMulLowInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.MulLow", opLen2(ssa.OpMulLowInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.MulLow", opLen2(ssa.OpMulLowInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x2.MulLow", opLen2(ssa.OpMulLowInt64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x4.MulLow", opLen2(ssa.OpMulLowInt64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x8.MulLow", opLen2(ssa.OpMulLowInt64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x2.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x4.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x8.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.MulMasked", opLen3(ssa.OpMulMaskedFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.MulMasked", opLen3(ssa.OpMulMaskedFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.MulMasked", opLen3(ssa.OpMulMaskedFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x2.MulMasked", opLen3(ssa.OpMulMaskedFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.MulMasked", opLen3(ssa.OpMulMaskedFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.MulMasked", opLen3(ssa.OpMulMaskedFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.MulMasked", opLen3(ssa.OpMulMaskedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.MulMasked", opLen3(ssa.OpMulMaskedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.MulMasked", opLen3(ssa.OpMulMaskedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.MulMasked", opLen3(ssa.OpMulMaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.MulMasked", opLen3(ssa.OpMulMaskedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.MulMasked", opLen3(ssa.OpMulMaskedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x2.MulMasked", opLen3(ssa.OpMulMaskedInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, 
"Int64x4.MulMasked", opLen3(ssa.OpMulMaskedInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.MulMasked", opLen3(ssa.OpMulMaskedInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.NotEqual", opLen2(ssa.OpNotEqualFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.NotEqual", opLen2(ssa.OpNotEqualFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.NotEqual", opLen2(ssa.OpNotEqualFloat32x16, types.TypeVec512), sys.AMD64) @@ -1026,30 +1028,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Int16x8.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x32.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x2.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x2.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x8.PairwiseSub", opLen2(ssa.OpPairwiseSubInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.PairwiseSub", opLen2(ssa.OpPairwiseSubInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x8.PairwiseSub", opLen2(ssa.OpPairwiseSubUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.PairwiseSub", opLen2(ssa.OpPairwiseSubUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x8, types.TypeVec256), sys.AMD64) 
addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64) @@ -1306,76 +1284,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float32x8.Round", opLen1(ssa.OpRoundFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.Round", opLen1(ssa.OpRoundFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Round", opLen1(ssa.OpRoundFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x4.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Int8x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x16, types.TypeVec256), sys.AMD64) - 
addF(simdPackage, "Uint16x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) addF(simdPackage, "Int32x4.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x16.SaturatedAddDotProd", 
opLen3(ssa.OpSaturatedAddDotProdInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x16.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedPairwiseSub", opLen2(ssa.OpSaturatedPairwiseSubInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedPairwiseSub", opLen2(ssa.OpSaturatedPairwiseSubInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x16.SaturatedSub", opLen2(ssa.OpSaturatedSubInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.SaturatedSub", opLen2(ssa.OpSaturatedSubInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.SaturatedSub", opLen2(ssa.OpSaturatedSubInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedSub", opLen2(ssa.OpSaturatedSubInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedSub", opLen2(ssa.OpSaturatedSubInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.SaturatedSub", opLen2(ssa.OpSaturatedSubInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.SaturatedSub", opLen2(ssa.OpSaturatedSubUint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x32.SaturatedSub", opLen2(ssa.OpSaturatedSubUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.SaturatedSub", opLen2(ssa.OpSaturatedSubUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.SaturatedSub", opLen2(ssa.OpSaturatedSubUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, 
"Uint16x16.SaturatedSub", opLen2(ssa.OpSaturatedSubUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x32.SaturatedSub", opLen2(ssa.OpSaturatedSubUint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x16.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x64.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x64, types.TypeVec512), sys.AMD64) @@ -1388,6 +1326,18 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Int8x16.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.Scale", opLen2(ssa.OpScaleFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.Scale", opLen2(ssa.OpScaleFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.Scale", opLen2(ssa.OpScaleFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.Scale", opLen2(ssa.OpScaleFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.Scale", opLen2(ssa.OpScaleFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.Scale", opLen2(ssa.OpScaleFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x8.Set128", opLen2Imm8(ssa.OpSet128Float32x8, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Float64x4.Set128", opLen2Imm8(ssa.OpSet128Float64x4, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Int8x32.Set128", opLen2Imm8(ssa.OpSet128Int8x32, types.TypeVec256, 0), sys.AMD64) @@ -1772,22 +1722,72 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint64x2.SubMasked", opLen3(ssa.OpSubMaskedUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.SubMasked", opLen3(ssa.OpSubMaskedUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.SubMasked", opLen3(ssa.OpSubMaskedUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.SubPairs", opLen2(ssa.OpSubPairsFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.SubPairs", opLen2(ssa.OpSubPairsFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x2.SubPairs", opLen2(ssa.OpSubPairsFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.SubPairs", opLen2(ssa.OpSubPairsFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.SubPairs", opLen2(ssa.OpSubPairsInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.SubPairs", opLen2(ssa.OpSubPairsInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x4.SubPairs", opLen2(ssa.OpSubPairsInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.SubPairs", opLen2(ssa.OpSubPairsInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x8.SubPairs", opLen2(ssa.OpSubPairsUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.SubPairs", opLen2(ssa.OpSubPairsUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x4.SubPairs", opLen2(ssa.OpSubPairsUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.SubPairs", opLen2(ssa.OpSubPairsUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.SubSaturated", opLen2(ssa.OpSubSaturatedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.SubSaturated", opLen2(ssa.OpSubSaturatedUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.SubSaturated", opLen2(ssa.OpSubSaturatedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.SubSaturated", opLen2(ssa.OpSubSaturatedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt16x8, 
types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Trunc", opLen1(ssa.OpTruncFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Trunc", opLen1(ssa.OpTruncFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.Trunc", opLen1(ssa.OpTruncFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Trunc", opLen1(ssa.OpTruncFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x4.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, 
"Float64x4.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) addF(simdPackage, "Int8x16.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) diff --git a/src/simd/binary_test.go b/src/simd/binary_test.go index b7daf736f4e..c82bc070e12 100644 --- a/src/simd/binary_test.go +++ b/src/simd/binary_test.go @@ -309,42 +309,42 @@ func TestMul(t *testing.T) { testFloat64x2Binary(t, simd.Float64x2.Mul, mulSlice[float64]) testFloat64x4Binary(t, simd.Float64x4.Mul, mulSlice[float64]) - testInt16x16Binary(t, simd.Int16x16.MulLow, mulSlice[int16]) - testInt16x8Binary(t, simd.Int16x8.MulLow, 
mulSlice[int16]) - testInt32x4Binary(t, simd.Int32x4.MulLow, mulSlice[int32]) - testInt32x8Binary(t, simd.Int32x8.MulLow, mulSlice[int32]) + testInt16x16Binary(t, simd.Int16x16.Mul, mulSlice[int16]) + testInt16x8Binary(t, simd.Int16x8.Mul, mulSlice[int16]) + testInt32x4Binary(t, simd.Int32x4.Mul, mulSlice[int32]) + testInt32x8Binary(t, simd.Int32x8.Mul, mulSlice[int32]) - // testInt8x16Binary(t, simd.Int8x16.MulLow, mulSlice[int8]) // nope - // testInt8x32Binary(t, simd.Int8x32.MulLow, mulSlice[int8]) + // testInt8x16Binary(t, simd.Int8x16.Mul, mulSlice[int8]) // nope + // testInt8x32Binary(t, simd.Int8x32.Mul, mulSlice[int8]) - // TODO we should be able to do these, there's no difference between signed/unsigned mulLow - // testUint16x16Binary(t, simd.Uint16x16.MulLow, mulSlice[uint16]) - // testUint16x8Binary(t, simd.Uint16x8.MulLow, mulSlice[uint16]) - // testUint32x4Binary(t, simd.Uint32x4.MulLow, mulSlice[uint32]) - // testUint32x8Binary(t, simd.Uint32x8.MulLow, mulSlice[uint32]) - // testUint64x2Binary(t, simd.Uint64x2.MulLow, mulSlice[uint64]) - // testUint64x4Binary(t, simd.Uint64x4.MulLow, mulSlice[uint64]) + // TODO we should be able to do these, there's no difference between signed/unsigned Mul + // testUint16x16Binary(t, simd.Uint16x16.Mul, mulSlice[uint16]) + // testUint16x8Binary(t, simd.Uint16x8.Mul, mulSlice[uint16]) + // testUint32x4Binary(t, simd.Uint32x4.Mul, mulSlice[uint32]) + // testUint32x8Binary(t, simd.Uint32x8.Mul, mulSlice[uint32]) + // testUint64x2Binary(t, simd.Uint64x2.Mul, mulSlice[uint64]) + // testUint64x4Binary(t, simd.Uint64x4.Mul, mulSlice[uint64]) - // testUint8x16Binary(t, simd.Uint8x16.MulLow, mulSlice[uint8]) // nope - // testUint8x32Binary(t, simd.Uint8x32.MulLow, mulSlice[uint8]) + // testUint8x16Binary(t, simd.Uint8x16.Mul, mulSlice[uint8]) // nope + // testUint8x32Binary(t, simd.Uint8x32.Mul, mulSlice[uint8]) if simd.HasAVX512() { - testInt64x2Binary(t, simd.Int64x2.MulLow, mulSlice[int64]) // avx512 only - testInt64x4Binary(t, simd.Int64x4.MulLow, mulSlice[int64]) + testInt64x2Binary(t, simd.Int64x2.Mul, mulSlice[int64]) // avx512 only + testInt64x4Binary(t, simd.Int64x4.Mul, mulSlice[int64]) testFloat32x16Binary(t, simd.Float32x16.Mul, mulSlice[float32]) testFloat64x8Binary(t, simd.Float64x8.Mul, mulSlice[float64]) - // testInt8x64Binary(t, simd.Int8x64.MulLow, mulSlice[int8]) // nope - testInt16x32Binary(t, simd.Int16x32.MulLow, mulSlice[int16]) - testInt32x16Binary(t, simd.Int32x16.MulLow, mulSlice[int32]) - testInt64x8Binary(t, simd.Int64x8.MulLow, mulSlice[int64]) - // testUint8x64Binary(t, simd.Uint8x64.MulLow, mulSlice[uint8]) // nope + // testInt8x64Binary(t, simd.Int8x64.Mul, mulSlice[int8]) // nope + testInt16x32Binary(t, simd.Int16x32.Mul, mulSlice[int16]) + testInt32x16Binary(t, simd.Int32x16.Mul, mulSlice[int32]) + testInt64x8Binary(t, simd.Int64x8.Mul, mulSlice[int64]) + // testUint8x64Binary(t, simd.Uint8x64.Mul, mulSlice[uint8]) // nope // TODO signed should do the job - // testUint16x32Binary(t, simd.Uint16x32.MulLow, mulSlice[uint16]) - // testUint32x16Binary(t, simd.Uint32x16.MulLow, mulSlice[uint32]) - // testUint64x8Binary(t, simd.Uint64x8.MulLow, mulSlice[uint64]) + // testUint16x32Binary(t, simd.Uint16x32.Mul, mulSlice[uint16]) + // testUint32x16Binary(t, simd.Uint32x16.Mul, mulSlice[uint32]) + // testUint64x8Binary(t, simd.Uint64x8.Mul, mulSlice[uint64]) } } diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index 5776350fe9f..dc42e73a53a 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -556,6 
+556,242 @@ func (x Uint64x4) AddMasked(y Uint64x4, mask Mask64x4) Uint64x4 // Asm: VPADDQ, CPU Feature: AVX512F func (x Uint64x8) AddMasked(y Uint64x8, mask Mask64x8) Uint64x8 +/* AddPairs */ + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VHADDPS, CPU Feature: AVX +func (x Float32x4) AddPairs(y Float32x4) Float32x4 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VHADDPS, CPU Feature: AVX +func (x Float32x8) AddPairs(y Float32x8) Float32x8 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VHADDPD, CPU Feature: AVX +func (x Float64x2) AddPairs(y Float64x2) Float64x2 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VHADDPD, CPU Feature: AVX +func (x Float64x4) AddPairs(y Float64x4) Float64x4 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDW, CPU Feature: AVX +func (x Int16x8) AddPairs(y Int16x8) Int16x8 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDW, CPU Feature: AVX2 +func (x Int16x16) AddPairs(y Int16x16) Int16x16 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDD, CPU Feature: AVX +func (x Int32x4) AddPairs(y Int32x4) Int32x4 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDD, CPU Feature: AVX2 +func (x Int32x8) AddPairs(y Int32x8) Int32x8 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDW, CPU Feature: AVX +func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDW, CPU Feature: AVX2 +func (x Uint16x16) AddPairs(y Uint16x16) Uint16x16 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDD, CPU Feature: AVX +func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDD, CPU Feature: AVX2 +func (x Uint32x8) AddPairs(y Uint32x8) Uint32x8 + +/* AddPairsSaturated */ + +// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation. 
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDSW, CPU Feature: AVX +func (x Int16x8) AddPairsSaturated(y Int16x8) Int16x8 + +// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDSW, CPU Feature: AVX2 +func (x Int16x16) AddPairsSaturated(y Int16x16) Int16x16 + +/* AddSaturated */ + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSB, CPU Feature: AVX +func (x Int8x16) AddSaturated(y Int8x16) Int8x16 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSB, CPU Feature: AVX2 +func (x Int8x32) AddSaturated(y Int8x32) Int8x32 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Int8x64) AddSaturated(y Int8x64) Int8x64 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSW, CPU Feature: AVX +func (x Int16x8) AddSaturated(y Int16x8) Int16x8 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSW, CPU Feature: AVX2 +func (x Int16x16) AddSaturated(y Int16x16) Int16x16 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Int16x32) AddSaturated(y Int16x32) Int16x32 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSB, CPU Feature: AVX +func (x Uint8x16) AddSaturated(y Uint8x16) Uint8x16 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSB, CPU Feature: AVX2 +func (x Uint8x32) AddSaturated(y Uint8x32) Uint8x32 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Uint8x64) AddSaturated(y Uint8x64) Uint8x64 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSW, CPU Feature: AVX +func (x Uint16x8) AddSaturated(y Uint16x8) Uint16x8 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSW, CPU Feature: AVX2 +func (x Uint16x16) AddSaturated(y Uint16x16) Uint16x16 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Uint16x32) AddSaturated(y Uint16x32) Uint16x32 + +/* AddSaturatedMasked */ + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Int8x16) AddSaturatedMasked(y Int8x16, mask Mask8x16) Int8x16 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Int8x32) AddSaturatedMasked(y Int8x32, mask Mask8x32) Int8x32 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Int8x64) AddSaturatedMasked(y Int8x64, mask Mask8x64) Int8x64 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. 
+// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Int16x8) AddSaturatedMasked(y Int16x8, mask Mask16x8) Int16x8 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Int16x16) AddSaturatedMasked(y Int16x16, mask Mask16x16) Int16x16 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Int16x32) AddSaturatedMasked(y Int16x32, mask Mask16x32) Int16x32 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Uint8x16) AddSaturatedMasked(y Uint8x16, mask Mask8x16) Uint8x16 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Uint8x32) AddSaturatedMasked(y Uint8x32, mask Mask8x32) Uint8x32 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Uint8x64) AddSaturatedMasked(y Uint8x64, mask Mask8x64) Uint8x64 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Uint16x8) AddSaturatedMasked(y Uint16x8, mask Mask16x8) Uint16x8 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Uint16x16) AddSaturatedMasked(y Uint16x16, mask Mask16x16) Uint16x16 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Uint16x32) AddSaturatedMasked(y Uint16x32, mask Mask16x32) Uint16x32 + /* AddSub */ // AddSub subtracts even elements and adds odd elements of two vectors. @@ -1244,105 +1480,205 @@ func (x Float64x2) Ceil() Float64x2 // Asm: VROUNDPD, CPU Feature: AVX func (x Float64x4) Ceil() Float64x4 -/* CeilWithPrecision */ +/* CeilScaled */ -// CeilWithPrecision rounds elements up with specified precision. +// CeilScaled rounds elements up with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) CeilWithPrecision(prec uint8) Float32x4 +func (x Float32x4) CeilScaled(prec uint8) Float32x4 -// CeilWithPrecision rounds elements up with specified precision. +// CeilScaled rounds elements up with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) CeilWithPrecision(prec uint8) Float32x8 +func (x Float32x8) CeilScaled(prec uint8) Float32x8 -// CeilWithPrecision rounds elements up with specified precision. +// CeilScaled rounds elements up with specified precision. 
// // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) CeilWithPrecision(prec uint8) Float32x16 +func (x Float32x16) CeilScaled(prec uint8) Float32x16 -// CeilWithPrecision rounds elements up with specified precision. +// CeilScaled rounds elements up with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) CeilWithPrecision(prec uint8) Float64x2 +func (x Float64x2) CeilScaled(prec uint8) Float64x2 -// CeilWithPrecision rounds elements up with specified precision. +// CeilScaled rounds elements up with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) CeilWithPrecision(prec uint8) Float64x4 +func (x Float64x4) CeilScaled(prec uint8) Float64x4 -// CeilWithPrecision rounds elements up with specified precision. +// CeilScaled rounds elements up with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) CeilWithPrecision(prec uint8) Float64x8 +func (x Float64x8) CeilScaled(prec uint8) Float64x8 -/* CeilWithPrecisionMasked */ +/* CeilScaledMasked */ -// CeilWithPrecisionMasked rounds elements up with specified precision. +// CeilScaledMasked rounds elements up with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) CeilWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 +func (x Float32x4) CeilScaledMasked(prec uint8, mask Mask32x4) Float32x4 -// CeilWithPrecisionMasked rounds elements up with specified precision. +// CeilScaledMasked rounds elements up with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) CeilWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 +func (x Float32x8) CeilScaledMasked(prec uint8, mask Mask32x8) Float32x8 -// CeilWithPrecisionMasked rounds elements up with specified precision. +// CeilScaledMasked rounds elements up with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) CeilWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 +func (x Float32x16) CeilScaledMasked(prec uint8, mask Mask32x16) Float32x16 -// CeilWithPrecisionMasked rounds elements up with specified precision. +// CeilScaledMasked rounds elements up with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) CeilWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 +func (x Float64x2) CeilScaledMasked(prec uint8, mask Mask64x2) Float64x2 -// CeilWithPrecisionMasked rounds elements up with specified precision. +// CeilScaledMasked rounds elements up with specified precision. 
// // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) CeilWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 +func (x Float64x4) CeilScaledMasked(prec uint8, mask Mask64x4) Float64x4 -// CeilWithPrecisionMasked rounds elements up with specified precision. +// CeilScaledMasked rounds elements up with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) CeilWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 +func (x Float64x8) CeilScaledMasked(prec uint8, mask Mask64x8) Float64x8 + +/* CeilScaledResidue */ + +// CeilScaledResidue computes the difference after ceiling with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) CeilScaledResidue(prec uint8) Float32x4 + +// CeilScaledResidue computes the difference after ceiling with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) CeilScaledResidue(prec uint8) Float32x8 + +// CeilScaledResidue computes the difference after ceiling with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) CeilScaledResidue(prec uint8) Float32x16 + +// CeilScaledResidue computes the difference after ceiling with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) CeilScaledResidue(prec uint8) Float64x2 + +// CeilScaledResidue computes the difference after ceiling with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) CeilScaledResidue(prec uint8) Float64x4 + +// CeilScaledResidue computes the difference after ceiling with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) CeilScaledResidue(prec uint8) Float64x8 + +/* CeilScaledResidueMasked */ + +// CeilScaledResidueMasked computes the difference after ceiling with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) CeilScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4 + +// CeilScaledResidueMasked computes the difference after ceiling with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) CeilScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8 + +// CeilScaledResidueMasked computes the difference after ceiling with specified precision. +// +// This operation is applied selectively under a write mask. 
+// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) CeilScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16 + +// CeilScaledResidueMasked computes the difference after ceiling with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) CeilScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2 + +// CeilScaledResidueMasked computes the difference after ceiling with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) CeilScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4 + +// CeilScaledResidueMasked computes the difference after ceiling with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) CeilScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8 /* Compress */ @@ -1606,406 +1942,6 @@ func (x Float32x8) ConvertToUint32Masked(mask Mask32x8) Uint32x8 // Asm: VCVTPS2UDQ, CPU Feature: AVX512F func (x Float32x16) ConvertToUint32Masked(mask Mask32x16) Uint32x16 -/* DiffWithCeilWithPrecision */ - -// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithCeilWithPrecision(prec uint8) Float32x4 - -// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithCeilWithPrecision(prec uint8) Float32x8 - -// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithCeilWithPrecision(prec uint8) Float32x16 - -// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithCeilWithPrecision(prec uint8) Float64x2 - -// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithCeilWithPrecision(prec uint8) Float64x4 - -// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
-// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithCeilWithPrecision(prec uint8) Float64x8 - -/* DiffWithCeilWithPrecisionMasked */ - -// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 - -// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 - -// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 - -// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 - -// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 - -// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 - -/* DiffWithFloorWithPrecision */ - -// DiffWithFloorWithPrecision computes the difference after flooring with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithFloorWithPrecision(prec uint8) Float32x4 - -// DiffWithFloorWithPrecision computes the difference after flooring with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithFloorWithPrecision(prec uint8) Float32x8 - -// DiffWithFloorWithPrecision computes the difference after flooring with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
-// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithFloorWithPrecision(prec uint8) Float32x16 - -// DiffWithFloorWithPrecision computes the difference after flooring with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithFloorWithPrecision(prec uint8) Float64x2 - -// DiffWithFloorWithPrecision computes the difference after flooring with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithFloorWithPrecision(prec uint8) Float64x4 - -// DiffWithFloorWithPrecision computes the difference after flooring with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithFloorWithPrecision(prec uint8) Float64x8 - -/* DiffWithFloorWithPrecisionMasked */ - -// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 - -// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 - -// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 - -// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 - -// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 - -// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
-// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 - -/* DiffWithRoundWithPrecision */ - -// DiffWithRoundWithPrecision computes the difference after rounding with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithRoundWithPrecision(prec uint8) Float32x4 - -// DiffWithRoundWithPrecision computes the difference after rounding with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithRoundWithPrecision(prec uint8) Float32x8 - -// DiffWithRoundWithPrecision computes the difference after rounding with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithRoundWithPrecision(prec uint8) Float32x16 - -// DiffWithRoundWithPrecision computes the difference after rounding with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithRoundWithPrecision(prec uint8) Float64x2 - -// DiffWithRoundWithPrecision computes the difference after rounding with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithRoundWithPrecision(prec uint8) Float64x4 - -// DiffWithRoundWithPrecision computes the difference after rounding with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithRoundWithPrecision(prec uint8) Float64x8 - -/* DiffWithRoundWithPrecisionMasked */ - -// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 - -// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 - -// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 - -// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
-// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 - -// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 - -// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 - -/* DiffWithTruncWithPrecision */ - -// DiffWithTruncWithPrecision computes the difference after truncating with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithTruncWithPrecision(prec uint8) Float32x4 - -// DiffWithTruncWithPrecision computes the difference after truncating with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithTruncWithPrecision(prec uint8) Float32x8 - -// DiffWithTruncWithPrecision computes the difference after truncating with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithTruncWithPrecision(prec uint8) Float32x16 - -// DiffWithTruncWithPrecision computes the difference after truncating with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithTruncWithPrecision(prec uint8) Float64x2 - -// DiffWithTruncWithPrecision computes the difference after truncating with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithTruncWithPrecision(prec uint8) Float64x4 - -// DiffWithTruncWithPrecision computes the difference after truncating with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithTruncWithPrecision(prec uint8) Float64x8 - -/* DiffWithTruncWithPrecisionMasked */ - -// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 - -// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
-// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 - -// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 - -// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 - -// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 - -// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 - /* Div */ // Div divides elements of two vectors. @@ -2485,105 +2421,205 @@ func (x Float64x2) Floor() Float64x2 // Asm: VROUNDPD, CPU Feature: AVX func (x Float64x4) Floor() Float64x4 -/* FloorWithPrecision */ +/* FloorScaled */ -// FloorWithPrecision rounds elements down with specified precision. +// FloorScaled rounds elements down with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) FloorWithPrecision(prec uint8) Float32x4 +func (x Float32x4) FloorScaled(prec uint8) Float32x4 -// FloorWithPrecision rounds elements down with specified precision. +// FloorScaled rounds elements down with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) FloorWithPrecision(prec uint8) Float32x8 +func (x Float32x8) FloorScaled(prec uint8) Float32x8 -// FloorWithPrecision rounds elements down with specified precision. +// FloorScaled rounds elements down with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) FloorWithPrecision(prec uint8) Float32x16 +func (x Float32x16) FloorScaled(prec uint8) Float32x16 -// FloorWithPrecision rounds elements down with specified precision. +// FloorScaled rounds elements down with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. 
// // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) FloorWithPrecision(prec uint8) Float64x2 +func (x Float64x2) FloorScaled(prec uint8) Float64x2 -// FloorWithPrecision rounds elements down with specified precision. +// FloorScaled rounds elements down with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) FloorWithPrecision(prec uint8) Float64x4 +func (x Float64x4) FloorScaled(prec uint8) Float64x4 -// FloorWithPrecision rounds elements down with specified precision. +// FloorScaled rounds elements down with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) FloorWithPrecision(prec uint8) Float64x8 +func (x Float64x8) FloorScaled(prec uint8) Float64x8 -/* FloorWithPrecisionMasked */ +/* FloorScaledMasked */ -// FloorWithPrecisionMasked rounds elements down with specified precision. +// FloorScaledMasked rounds elements down with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) FloorWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 +func (x Float32x4) FloorScaledMasked(prec uint8, mask Mask32x4) Float32x4 -// FloorWithPrecisionMasked rounds elements down with specified precision. +// FloorScaledMasked rounds elements down with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) FloorWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 +func (x Float32x8) FloorScaledMasked(prec uint8, mask Mask32x8) Float32x8 -// FloorWithPrecisionMasked rounds elements down with specified precision. +// FloorScaledMasked rounds elements down with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) FloorWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 +func (x Float32x16) FloorScaledMasked(prec uint8, mask Mask32x16) Float32x16 -// FloorWithPrecisionMasked rounds elements down with specified precision. +// FloorScaledMasked rounds elements down with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) FloorWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 +func (x Float64x2) FloorScaledMasked(prec uint8, mask Mask64x2) Float64x2 -// FloorWithPrecisionMasked rounds elements down with specified precision. +// FloorScaledMasked rounds elements down with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. 
// // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) FloorWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 +func (x Float64x4) FloorScaledMasked(prec uint8, mask Mask64x4) Float64x4 -// FloorWithPrecisionMasked rounds elements down with specified precision. +// FloorScaledMasked rounds elements down with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) FloorWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 +func (x Float64x8) FloorScaledMasked(prec uint8, mask Mask64x8) Float64x8 + +/* FloorScaledResidue */ + +// FloorScaledResidue computes the difference after flooring with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) FloorScaledResidue(prec uint8) Float32x4 + +// FloorScaledResidue computes the difference after flooring with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) FloorScaledResidue(prec uint8) Float32x8 + +// FloorScaledResidue computes the difference after flooring with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) FloorScaledResidue(prec uint8) Float32x16 + +// FloorScaledResidue computes the difference after flooring with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) FloorScaledResidue(prec uint8) Float64x2 + +// FloorScaledResidue computes the difference after flooring with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) FloorScaledResidue(prec uint8) Float64x4 + +// FloorScaledResidue computes the difference after flooring with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) FloorScaledResidue(prec uint8) Float64x8 + +/* FloorScaledResidueMasked */ + +// FloorScaledResidueMasked computes the difference after flooring with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) FloorScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4 + +// FloorScaledResidueMasked computes the difference after flooring with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) FloorScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8 + +// FloorScaledResidueMasked computes the difference after flooring with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
+// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) FloorScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16 + +// FloorScaledResidueMasked computes the difference after flooring with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) FloorScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2 + +// FloorScaledResidueMasked computes the difference after flooring with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) FloorScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4 + +// FloorScaledResidueMasked computes the difference after flooring with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) FloorScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8 /* FusedMultiplyAdd */ @@ -5427,81 +5463,50 @@ func (x Float64x4) Mul(y Float64x4) Float64x4 // Asm: VMULPD, CPU Feature: AVX512F func (x Float64x8) Mul(y Float64x8) Float64x8 -/* MulByPowOf2 */ +// Mul multiplies corresponding elements of two vectors. +// +// Asm: VPMULLW, CPU Feature: AVX +func (x Int16x8) Mul(y Int16x8) Int16x8 -// MulByPowOf2 multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VSCALEFPS, CPU Feature: AVX512F -func (x Float32x4) MulByPowOf2(y Float32x4) Float32x4 +// Asm: VPMULLW, CPU Feature: AVX2 +func (x Int16x16) Mul(y Int16x16) Int16x16 -// MulByPowOf2 multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VSCALEFPS, CPU Feature: AVX512F -func (x Float32x8) MulByPowOf2(y Float32x8) Float32x8 +// Asm: VPMULLW, CPU Feature: AVX512BW +func (x Int16x32) Mul(y Int16x32) Int16x32 -// MulByPowOf2 multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VSCALEFPS, CPU Feature: AVX512F -func (x Float32x16) MulByPowOf2(y Float32x16) Float32x16 +// Asm: VPMULLD, CPU Feature: AVX +func (x Int32x4) Mul(y Int32x4) Int32x4 -// MulByPowOf2 multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VSCALEFPD, CPU Feature: AVX512F -func (x Float64x2) MulByPowOf2(y Float64x2) Float64x2 +// Asm: VPMULLD, CPU Feature: AVX2 +func (x Int32x8) Mul(y Int32x8) Int32x8 -// MulByPowOf2 multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VSCALEFPD, CPU Feature: AVX512F -func (x Float64x4) MulByPowOf2(y Float64x4) Float64x4 +// Asm: VPMULLD, CPU Feature: AVX512F +func (x Int32x16) Mul(y Int32x16) Int32x16 -// MulByPowOf2 multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VSCALEFPD, CPU Feature: AVX512F -func (x Float64x8) MulByPowOf2(y Float64x8) Float64x8 +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Int64x2) Mul(y Int64x2) Int64x2 -/* MulByPowOf2Masked */ +// Mul multiplies corresponding elements of two vectors. 
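The masked residue forms above take the write mask as the trailing argument, after the constant prec. A sketch under the same assumptions as before (package imported as simd); the diff only says the operation "is applied selectively under a write mask", so the zeroing of unselected lanes is an assumption based on AVX-512 zeroing-masking.

package roundingexample

import "simd" // assumed import path for the experimental SIMD package

// floorResidueWhere computes the flooring residue only in the lanes
// selected by m; unselected lanes are assumed to come back zeroed.
func floorResidueWhere(x simd.Float64x4, m simd.Mask64x4) simd.Float64x4 {
	const prec = 0 // must be constant: round down to whole numbers
	return x.FloorScaledResidueMasked(prec, m) // VREDUCEPD under a mask
}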
+// +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Int64x4) Mul(y Int64x4) Int64x4 -// MulByPowOf2Masked multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// This operation is applied selectively under a write mask. -// -// Asm: VSCALEFPS, CPU Feature: AVX512F -func (x Float32x4) MulByPowOf2Masked(y Float32x4, mask Mask32x4) Float32x4 - -// MulByPowOf2Masked multiplies elements by a power of 2. -// -// This operation is applied selectively under a write mask. -// -// Asm: VSCALEFPS, CPU Feature: AVX512F -func (x Float32x8) MulByPowOf2Masked(y Float32x8, mask Mask32x8) Float32x8 - -// MulByPowOf2Masked multiplies elements by a power of 2. -// -// This operation is applied selectively under a write mask. -// -// Asm: VSCALEFPS, CPU Feature: AVX512F -func (x Float32x16) MulByPowOf2Masked(y Float32x16, mask Mask32x16) Float32x16 - -// MulByPowOf2Masked multiplies elements by a power of 2. -// -// This operation is applied selectively under a write mask. -// -// Asm: VSCALEFPD, CPU Feature: AVX512F -func (x Float64x2) MulByPowOf2Masked(y Float64x2, mask Mask64x2) Float64x2 - -// MulByPowOf2Masked multiplies elements by a power of 2. -// -// This operation is applied selectively under a write mask. -// -// Asm: VSCALEFPD, CPU Feature: AVX512F -func (x Float64x4) MulByPowOf2Masked(y Float64x4, mask Mask64x4) Float64x4 - -// MulByPowOf2Masked multiplies elements by a power of 2. -// -// This operation is applied selectively under a write mask. -// -// Asm: VSCALEFPD, CPU Feature: AVX512F -func (x Float64x8) MulByPowOf2Masked(y Float64x8, mask Mask64x8) Float64x8 +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Int64x8) Mul(y Int64x8) Int64x8 /* MulEvenWiden */ @@ -5691,118 +5696,6 @@ func (x Uint16x16) MulHighMasked(y Uint16x16, mask Mask16x16) Uint16x16 // Asm: VPMULHUW, CPU Feature: AVX512BW func (x Uint16x32) MulHighMasked(y Uint16x32, mask Mask16x32) Uint16x32 -/* MulLow */ - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLW, CPU Feature: AVX -func (x Int16x8) MulLow(y Int16x8) Int16x8 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLW, CPU Feature: AVX2 -func (x Int16x16) MulLow(y Int16x16) Int16x16 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLW, CPU Feature: AVX512BW -func (x Int16x32) MulLow(y Int16x32) Int16x32 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLD, CPU Feature: AVX -func (x Int32x4) MulLow(y Int32x4) Int32x4 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLD, CPU Feature: AVX2 -func (x Int32x8) MulLow(y Int32x8) Int32x8 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLD, CPU Feature: AVX512F -func (x Int32x16) MulLow(y Int32x16) Int32x16 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLQ, CPU Feature: AVX512DQ -func (x Int64x2) MulLow(y Int64x2) Int64x2 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLQ, CPU Feature: AVX512DQ -func (x Int64x4) MulLow(y Int64x4) Int64x4 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLQ, CPU Feature: AVX512DQ -func (x Int64x8) MulLow(y Int64x8) Int64x8 - -/* MulLowMasked */ - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. 
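After the rename, plain Mul on integer vectors takes over MulLow's role (whose removal continues below): VPMULLW/VPMULLD/VPMULLQ keep only the low half of each product, which is exactly Go's wrapping multiply on the element type. A sketch, assuming the package is imported as simd:

package mulexample

import "simd" // assumed import path for the experimental SIMD package

// lowMul multiplies lane by lane, keeping the low 32 bits of each
// product (VPMULLD). For every lane i the result equals x[i] * y[i]
// computed with Go's ordinary wrapping int32 arithmetic.
func lowMul(x, y simd.Int32x4) simd.Int32x4 {
	return x.Mul(y)
}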
-// -// Asm: VPMULLW, CPU Feature: AVX512BW -func (x Int16x8) MulLowMasked(y Int16x8, mask Mask16x8) Int16x8 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLW, CPU Feature: AVX512BW -func (x Int16x16) MulLowMasked(y Int16x16, mask Mask16x16) Int16x16 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLW, CPU Feature: AVX512BW -func (x Int16x32) MulLowMasked(y Int16x32, mask Mask16x32) Int16x32 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLD, CPU Feature: AVX512F -func (x Int32x4) MulLowMasked(y Int32x4, mask Mask32x4) Int32x4 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLD, CPU Feature: AVX512F -func (x Int32x8) MulLowMasked(y Int32x8, mask Mask32x8) Int32x8 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLD, CPU Feature: AVX512F -func (x Int32x16) MulLowMasked(y Int32x16, mask Mask32x16) Int32x16 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLQ, CPU Feature: AVX512DQ -func (x Int64x2) MulLowMasked(y Int64x2, mask Mask64x2) Int64x2 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLQ, CPU Feature: AVX512DQ -func (x Int64x4) MulLowMasked(y Int64x4, mask Mask64x4) Int64x4 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLQ, CPU Feature: AVX512DQ -func (x Int64x8) MulLowMasked(y Int64x8, mask Mask64x8) Int64x8 - /* MulMasked */ // MulMasked multiplies corresponding elements of two vectors. @@ -5847,6 +5740,69 @@ func (x Float64x4) MulMasked(y Float64x4, mask Mask64x4) Float64x4 // Asm: VMULPD, CPU Feature: AVX512F func (x Float64x8) MulMasked(y Float64x8, mask Mask64x8) Float64x8 +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLW, CPU Feature: AVX512BW +func (x Int16x8) MulMasked(y Int16x8, mask Mask16x8) Int16x8 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLW, CPU Feature: AVX512BW +func (x Int16x16) MulMasked(y Int16x16, mask Mask16x16) Int16x16 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLW, CPU Feature: AVX512BW +func (x Int16x32) MulMasked(y Int16x32, mask Mask16x32) Int16x32 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLD, CPU Feature: AVX512F +func (x Int32x4) MulMasked(y Int32x4, mask Mask32x4) Int32x4 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. 
+// +// Asm: VPMULLD, CPU Feature: AVX512F +func (x Int32x8) MulMasked(y Int32x8, mask Mask32x8) Int32x8 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLD, CPU Feature: AVX512F +func (x Int32x16) MulMasked(y Int32x16, mask Mask32x16) Int32x16 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Int64x2) MulMasked(y Int64x2, mask Mask64x2) Int64x2 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Int64x4) MulMasked(y Int64x4, mask Mask64x4) Int64x4 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Int64x8) MulMasked(y Int64x8, mask Mask64x8) Int64x8 + /* NotEqual */ // NotEqual compares for inequality. @@ -6465,154 +6421,6 @@ func (x Int16x16) PairDotProdMasked(y Int16x16, mask Mask16x16) Int32x8 // Asm: VPMADDWD, CPU Feature: AVX512BW func (x Int16x32) PairDotProdMasked(y Int16x32, mask Mask16x32) Int32x16 -/* PairwiseAdd */ - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VHADDPS, CPU Feature: AVX -func (x Float32x4) PairwiseAdd(y Float32x4) Float32x4 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VHADDPS, CPU Feature: AVX -func (x Float32x8) PairwiseAdd(y Float32x8) Float32x8 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VHADDPD, CPU Feature: AVX -func (x Float64x2) PairwiseAdd(y Float64x2) Float64x2 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VHADDPD, CPU Feature: AVX -func (x Float64x4) PairwiseAdd(y Float64x4) Float64x4 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDW, CPU Feature: AVX -func (x Int16x8) PairwiseAdd(y Int16x8) Int16x8 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDW, CPU Feature: AVX2 -func (x Int16x16) PairwiseAdd(y Int16x16) Int16x16 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDD, CPU Feature: AVX -func (x Int32x4) PairwiseAdd(y Int32x4) Int32x4 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. 
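The integer MulMasked methods above pair the same low-half product with a write mask. Mask construction is outside this hunk, so the sketch takes the mask as a parameter; as with the other Masked variants, zeroing of unselected lanes is an assumption.

package mulexample

import "simd" // assumed import path for the experimental SIMD package

// mulWhere multiplies only the lanes selected by m (VPMULLQ under a
// write mask); the remaining lanes are assumed zero in the result.
func mulWhere(x, y simd.Int64x4, m simd.Mask64x4) simd.Int64x4 {
	return x.MulMasked(y, m)
}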
-// -// Asm: VPHADDD, CPU Feature: AVX2 -func (x Int32x8) PairwiseAdd(y Int32x8) Int32x8 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDW, CPU Feature: AVX -func (x Uint16x8) PairwiseAdd(y Uint16x8) Uint16x8 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDW, CPU Feature: AVX2 -func (x Uint16x16) PairwiseAdd(y Uint16x16) Uint16x16 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDD, CPU Feature: AVX -func (x Uint32x4) PairwiseAdd(y Uint32x4) Uint32x4 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDD, CPU Feature: AVX2 -func (x Uint32x8) PairwiseAdd(y Uint32x8) Uint32x8 - -/* PairwiseSub */ - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VHSUBPS, CPU Feature: AVX -func (x Float32x4) PairwiseSub(y Float32x4) Float32x4 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VHSUBPS, CPU Feature: AVX -func (x Float32x8) PairwiseSub(y Float32x8) Float32x8 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VHSUBPD, CPU Feature: AVX -func (x Float64x2) PairwiseSub(y Float64x2) Float64x2 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VHSUBPD, CPU Feature: AVX -func (x Float64x4) PairwiseSub(y Float64x4) Float64x4 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBW, CPU Feature: AVX -func (x Int16x8) PairwiseSub(y Int16x8) Int16x8 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBW, CPU Feature: AVX2 -func (x Int16x16) PairwiseSub(y Int16x16) Int16x16 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBD, CPU Feature: AVX -func (x Int32x4) PairwiseSub(y Int32x4) Int32x4 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBD, CPU Feature: AVX2 -func (x Int32x8) PairwiseSub(y Int32x8) Int32x8 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] 
and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBW, CPU Feature: AVX -func (x Uint16x8) PairwiseSub(y Uint16x8) Uint16x8 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBW, CPU Feature: AVX2 -func (x Uint16x16) PairwiseSub(y Uint16x16) Uint16x16 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBD, CPU Feature: AVX -func (x Uint32x4) PairwiseSub(y Uint32x4) Uint32x4 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBD, CPU Feature: AVX2 -func (x Uint32x8) PairwiseSub(y Uint32x8) Uint32x8 - /* Permute */ // Permute performs a full permutation of vector x using indices: @@ -8547,167 +8355,205 @@ func (x Float64x2) Round() Float64x2 // Asm: VROUNDPD, CPU Feature: AVX func (x Float64x4) Round() Float64x4 -/* RoundWithPrecision */ +/* RoundScaled */ -// RoundWithPrecision rounds elements with specified precision. +// RoundScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) RoundWithPrecision(prec uint8) Float32x4 +func (x Float32x4) RoundScaled(prec uint8) Float32x4 -// RoundWithPrecision rounds elements with specified precision. +// RoundScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) RoundWithPrecision(prec uint8) Float32x8 +func (x Float32x8) RoundScaled(prec uint8) Float32x8 -// RoundWithPrecision rounds elements with specified precision. +// RoundScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) RoundWithPrecision(prec uint8) Float32x16 +func (x Float32x16) RoundScaled(prec uint8) Float32x16 -// RoundWithPrecision rounds elements with specified precision. +// RoundScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) RoundWithPrecision(prec uint8) Float64x2 +func (x Float64x2) RoundScaled(prec uint8) Float64x2 -// RoundWithPrecision rounds elements with specified precision. +// RoundScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) RoundWithPrecision(prec uint8) Float64x4 +func (x Float64x4) RoundScaled(prec uint8) Float64x4 -// RoundWithPrecision rounds elements with specified precision. +// RoundScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. 
// // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) RoundWithPrecision(prec uint8) Float64x8 +func (x Float64x8) RoundScaled(prec uint8) Float64x8 -/* RoundWithPrecisionMasked */ +/* RoundScaledMasked */ -// RoundWithPrecisionMasked rounds elements with specified precision. +// RoundScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) RoundWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 +func (x Float32x4) RoundScaledMasked(prec uint8, mask Mask32x4) Float32x4 -// RoundWithPrecisionMasked rounds elements with specified precision. +// RoundScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) RoundWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 +func (x Float32x8) RoundScaledMasked(prec uint8, mask Mask32x8) Float32x8 -// RoundWithPrecisionMasked rounds elements with specified precision. +// RoundScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) RoundWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 +func (x Float32x16) RoundScaledMasked(prec uint8, mask Mask32x16) Float32x16 -// RoundWithPrecisionMasked rounds elements with specified precision. +// RoundScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) RoundWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 +func (x Float64x2) RoundScaledMasked(prec uint8, mask Mask64x2) Float64x2 -// RoundWithPrecisionMasked rounds elements with specified precision. +// RoundScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) RoundWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 +func (x Float64x4) RoundScaledMasked(prec uint8, mask Mask64x4) Float64x4 -// RoundWithPrecisionMasked rounds elements with specified precision. +// RoundScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) RoundWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 +func (x Float64x8) RoundScaledMasked(prec uint8, mask Mask64x8) Float64x8 -/* SaturatedAdd */ +/* RoundScaledResidue */ -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidue computes the difference after rounding with specified precision. // -// Asm: VPADDSB, CPU Feature: AVX -func (x Int8x16) SaturatedAdd(y Int8x16) Int8x16 - -// SaturatedAdd adds corresponding elements of two vectors with saturation. 
+// prec is expected to be a constant, non-constant value will trigger a runtime panic. // -// Asm: VPADDSB, CPU Feature: AVX2 -func (x Int8x32) SaturatedAdd(y Int8x32) Int8x32 +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) RoundScaledResidue(prec uint8) Float32x4 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidue computes the difference after rounding with specified precision. // -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Int8x64) SaturatedAdd(y Int8x64) Int8x64 +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) RoundScaledResidue(prec uint8) Float32x8 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidue computes the difference after rounding with specified precision. // -// Asm: VPADDSW, CPU Feature: AVX -func (x Int16x8) SaturatedAdd(y Int16x8) Int16x8 +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) RoundScaledResidue(prec uint8) Float32x16 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidue computes the difference after rounding with specified precision. // -// Asm: VPADDSW, CPU Feature: AVX2 -func (x Int16x16) SaturatedAdd(y Int16x16) Int16x16 +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) RoundScaledResidue(prec uint8) Float64x2 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidue computes the difference after rounding with specified precision. // -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Int16x32) SaturatedAdd(y Int16x32) Int16x32 +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) RoundScaledResidue(prec uint8) Float64x4 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidue computes the difference after rounding with specified precision. // -// Asm: VPADDSB, CPU Feature: AVX -func (x Uint8x16) SaturatedAdd(y Uint8x16) Uint8x16 +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) RoundScaledResidue(prec uint8) Float64x8 -// SaturatedAdd adds corresponding elements of two vectors with saturation. -// -// Asm: VPADDSB, CPU Feature: AVX2 -func (x Uint8x32) SaturatedAdd(y Uint8x32) Uint8x32 +/* RoundScaledResidueMasked */ -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidueMasked computes the difference after rounding with specified precision. // -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Uint8x64) SaturatedAdd(y Uint8x64) Uint8x64 +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) RoundScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidueMasked computes the difference after rounding with specified precision. 
// -// Asm: VPADDSW, CPU Feature: AVX -func (x Uint16x8) SaturatedAdd(y Uint16x8) Uint16x8 +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) RoundScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidueMasked computes the difference after rounding with specified precision. // -// Asm: VPADDSW, CPU Feature: AVX2 -func (x Uint16x16) SaturatedAdd(y Uint16x16) Uint16x16 +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) RoundScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidueMasked computes the difference after rounding with specified precision. // -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Uint16x32) SaturatedAdd(y Uint16x32) Uint16x32 +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) RoundScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2 + +// RoundScaledResidueMasked computes the difference after rounding with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) RoundScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4 + +// RoundScaledResidueMasked computes the difference after rounding with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) RoundScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8 /* SaturatedAddDotProd */ @@ -8749,268 +8595,6 @@ func (x Int32x8) SaturatedAddDotProdMasked(y Int16x16, z Int16x16, mask Mask32x8 // Asm: VPDPWSSDS, CPU Feature: AVX512VNNI func (x Int32x16) SaturatedAddDotProdMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16 -/* SaturatedAddMasked */ - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Int8x16) SaturatedAddMasked(y Int8x16, mask Mask8x16) Int8x16 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Int8x32) SaturatedAddMasked(y Int8x32, mask Mask8x32) Int8x32 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Int8x64) SaturatedAddMasked(y Int8x64, mask Mask8x64) Int8x64 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. 
-// -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Int16x8) SaturatedAddMasked(y Int16x8, mask Mask16x8) Int16x8 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Int16x16) SaturatedAddMasked(y Int16x16, mask Mask16x16) Int16x16 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Int16x32) SaturatedAddMasked(y Int16x32, mask Mask16x32) Int16x32 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Uint8x16) SaturatedAddMasked(y Uint8x16, mask Mask8x16) Uint8x16 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Uint8x32) SaturatedAddMasked(y Uint8x32, mask Mask8x32) Uint8x32 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Uint8x64) SaturatedAddMasked(y Uint8x64, mask Mask8x64) Uint8x64 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Uint16x8) SaturatedAddMasked(y Uint16x8, mask Mask16x8) Uint16x8 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Uint16x16) SaturatedAddMasked(y Uint16x16, mask Mask16x16) Uint16x16 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Uint16x32) SaturatedAddMasked(y Uint16x32, mask Mask16x32) Uint16x32 - -/* SaturatedPairwiseAdd */ - -// SaturatedPairwiseAdd horizontally adds adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDSW, CPU Feature: AVX -func (x Int16x8) SaturatedPairwiseAdd(y Int16x8) Int16x8 - -// SaturatedPairwiseAdd horizontally adds adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDSW, CPU Feature: AVX2 -func (x Int16x16) SaturatedPairwiseAdd(y Int16x16) Int16x16 - -/* SaturatedPairwiseSub */ - -// SaturatedPairwiseSub horizontally subtracts adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBSW, CPU Feature: AVX -func (x Int16x8) SaturatedPairwiseSub(y Int16x8) Int16x8 - -// SaturatedPairwiseSub horizontally subtracts adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. 
-// -// Asm: VPHSUBSW, CPU Feature: AVX2 -func (x Int16x16) SaturatedPairwiseSub(y Int16x16) Int16x16 - -/* SaturatedSub */ - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSB, CPU Feature: AVX -func (x Int8x16) SaturatedSub(y Int8x16) Int8x16 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSB, CPU Feature: AVX2 -func (x Int8x32) SaturatedSub(y Int8x32) Int8x32 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSB, CPU Feature: AVX512BW -func (x Int8x64) SaturatedSub(y Int8x64) Int8x64 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSW, CPU Feature: AVX -func (x Int16x8) SaturatedSub(y Int16x8) Int16x8 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSW, CPU Feature: AVX2 -func (x Int16x16) SaturatedSub(y Int16x16) Int16x16 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSW, CPU Feature: AVX512BW -func (x Int16x32) SaturatedSub(y Int16x32) Int16x32 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSB, CPU Feature: AVX -func (x Uint8x16) SaturatedSub(y Uint8x16) Uint8x16 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSB, CPU Feature: AVX2 -func (x Uint8x32) SaturatedSub(y Uint8x32) Uint8x32 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSB, CPU Feature: AVX512BW -func (x Uint8x64) SaturatedSub(y Uint8x64) Uint8x64 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSW, CPU Feature: AVX -func (x Uint16x8) SaturatedSub(y Uint16x8) Uint16x8 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSW, CPU Feature: AVX2 -func (x Uint16x16) SaturatedSub(y Uint16x16) Uint16x16 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSW, CPU Feature: AVX512BW -func (x Uint16x32) SaturatedSub(y Uint16x32) Uint16x32 - -/* SaturatedSubMasked */ - -// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPSUBSB, CPU Feature: AVX512BW -func (x Int8x16) SaturatedSubMasked(y Int8x16, mask Mask8x16) Int8x16 - -// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPSUBSB, CPU Feature: AVX512BW -func (x Int8x32) SaturatedSubMasked(y Int8x32, mask Mask8x32) Int8x32 - -// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPSUBSB, CPU Feature: AVX512BW -func (x Int8x64) SaturatedSubMasked(y Int8x64, mask Mask8x64) Int8x64 - -// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPSUBSW, CPU Feature: AVX512BW -func (x Int16x8) SaturatedSubMasked(y Int16x8, mask Mask16x8) Int16x8 - -// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. 
-//
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Int16x16) SaturatedSubMasked(y Int16x16, mask Mask16x16) Int16x16
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Int16x32) SaturatedSubMasked(y Int16x32, mask Mask16x32) Int16x32
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Uint8x16) SaturatedSubMasked(y Uint8x16, mask Mask8x16) Uint8x16
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Uint8x32) SaturatedSubMasked(y Uint8x32, mask Mask8x32) Uint8x32
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Uint8x64) SaturatedSubMasked(y Uint8x64, mask Mask8x64) Uint8x64
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Uint16x8) SaturatedSubMasked(y Uint16x8, mask Mask16x8) Uint16x8
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Uint16x16) SaturatedSubMasked(y Uint16x16, mask Mask16x16) Uint16x16
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Uint16x32) SaturatedSubMasked(y Uint16x32, mask Mask16x32) Uint16x32
-
 /* SaturatedUnsignedSignedPairDotProd */

 // SaturatedUnsignedSignedPairDotProd multiplies the elements and adds the pairs together with saturation,
@@ -9097,6 +8681,82 @@ func (x Int8x32) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x32,
 // Asm: VPDPBUSDS, CPU Feature: AVX512VNNI
 func (x Int8x64) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x64, z Int32x16, mask Mask32x16) Int32x16

+/* Scale */
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x4) Scale(y Float32x4) Float32x4
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x8) Scale(y Float32x8) Float32x8
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x16) Scale(y Float32x16) Float32x16
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x2) Scale(y Float64x2) Float64x2
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x4) Scale(y Float64x4) Float64x4
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x8) Scale(y Float64x8) Float64x8
+
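The Scale methods map to VSCALEFPS/VSCALEFPD, which multiply each element of x by 2 raised to floor(y), elementwise. A minimal scalar sketch of that semantics; scaleRef is an illustrative helper, not part of the package, and NaN/Inf handling is ignored:

package main

import (
	"fmt"
	"math"
)

// scaleRef models the elementwise behavior of Scale: each element of x
// is multiplied by 2^floor(y[i]). Finite values with small exponents
// are assumed; the hardware instructions also define results for NaN
// and infinite inputs, which this sketch does not reproduce.
func scaleRef(x, y []float64) []float64 {
	out := make([]float64, len(x))
	for i := range x {
		out[i] = math.Ldexp(x[i], int(math.Floor(y[i])))
	}
	return out
}

func main() {
	fmt.Println(scaleRef([]float64{1.5, -3, 10}, []float64{3, 0.5, -1}))
	// Output: [12 -3 5]
}

ScaleMasked, next, computes the same product but only in the lanes selected by the write mask.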
+/* ScaleMasked */
+
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x4) ScaleMasked(y Float32x4, mask Mask32x4) Float32x4
+
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x8) ScaleMasked(y Float32x8, mask Mask32x8) Float32x8
+
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x16) ScaleMasked(y Float32x16, mask Mask32x16) Float32x16
+
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x2) ScaleMasked(y Float64x2, mask Mask64x2) Float64x2
+
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x4) ScaleMasked(y Float64x4, mask Mask64x4) Float64x4
+
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x8) ScaleMasked(y Float64x8, mask Mask64x8) Float64x8
+
 /* Set128 */

 // Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
@@ -11753,6 +11413,242 @@ func (x Uint64x4) SubMasked(y Uint64x4, mask Mask64x4) Uint64x4
 // Asm: VPSUBQ, CPU Feature: AVX512F
 func (x Uint64x8) SubMasked(y Uint64x8, mask Mask64x8) Uint64x8

+/* SubPairs */
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPS, CPU Feature: AVX
+func (x Float32x4) SubPairs(y Float32x4) Float32x4
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPS, CPU Feature: AVX
+func (x Float32x8) SubPairs(y Float32x8) Float32x8
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPD, CPU Feature: AVX
+func (x Float64x2) SubPairs(y Float64x2) Float64x2
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPD, CPU Feature: AVX
+func (x Float64x4) SubPairs(y Float64x4) Float64x4
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX
+func (x Int16x8) SubPairs(y Int16x8) Int16x8
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX2
+func (x Int16x16) SubPairs(y Int16x16) Int16x16
+
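The lane order in these comments (pair differences from y in the low half, from x in the high half) is easy to misread, so here is a toy scalar model; subPairsRef is an illustrative helper over a 4-lane vector, not a real simd type:

package main

import "fmt"

// subPairsRef models SubPairs on a 4-lane toy vector: adjacent-pair
// differences of y fill the low half of the result and adjacent-pair
// differences of x fill the high half, matching the documented order.
func subPairsRef(x, y [4]int16) [4]int16 {
	return [4]int16{
		y[0] - y[1], y[2] - y[3],
		x[0] - x[1], x[2] - x[3],
	}
}

func main() {
	x := [4]int16{10, 1, 20, 2}
	y := [4]int16{5, 3, 8, 8}
	fmt.Println(subPairsRef(x, y)) // [2 0 9 18]
}

SubPairsSaturated, below, has the same shape but clamps each difference to the element range instead of wrapping.

+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].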
+// +// Asm: VPHSUBD, CPU Feature: AVX +func (x Int32x4) SubPairs(y Int32x4) Int32x4 + +// SubPairs horizontally subtracts adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBD, CPU Feature: AVX2 +func (x Int32x8) SubPairs(y Int32x8) Int32x8 + +// SubPairs horizontally subtracts adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBW, CPU Feature: AVX +func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8 + +// SubPairs horizontally subtracts adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBW, CPU Feature: AVX2 +func (x Uint16x16) SubPairs(y Uint16x16) Uint16x16 + +// SubPairs horizontally subtracts adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBD, CPU Feature: AVX +func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4 + +// SubPairs horizontally subtracts adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBD, CPU Feature: AVX2 +func (x Uint32x8) SubPairs(y Uint32x8) Uint32x8 + +/* SubPairsSaturated */ + +// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBSW, CPU Feature: AVX +func (x Int16x8) SubPairsSaturated(y Int16x8) Int16x8 + +// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBSW, CPU Feature: AVX2 +func (x Int16x16) SubPairsSaturated(y Int16x16) Int16x16 + +/* SubSaturated */ + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSB, CPU Feature: AVX +func (x Int8x16) SubSaturated(y Int8x16) Int8x16 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSB, CPU Feature: AVX2 +func (x Int8x32) SubSaturated(y Int8x32) Int8x32 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Int8x64) SubSaturated(y Int8x64) Int8x64 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSW, CPU Feature: AVX +func (x Int16x8) SubSaturated(y Int16x8) Int16x8 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSW, CPU Feature: AVX2 +func (x Int16x16) SubSaturated(y Int16x16) Int16x16 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Int16x32) SubSaturated(y Int16x32) Int16x32 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSB, CPU Feature: AVX +func (x Uint8x16) SubSaturated(y Uint8x16) Uint8x16 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. 
+// +// Asm: VPSUBSB, CPU Feature: AVX2 +func (x Uint8x32) SubSaturated(y Uint8x32) Uint8x32 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Uint8x64) SubSaturated(y Uint8x64) Uint8x64 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSW, CPU Feature: AVX +func (x Uint16x8) SubSaturated(y Uint16x8) Uint16x8 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSW, CPU Feature: AVX2 +func (x Uint16x16) SubSaturated(y Uint16x16) Uint16x16 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Uint16x32) SubSaturated(y Uint16x32) Uint16x32 + +/* SubSaturatedMasked */ + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Int8x16) SubSaturatedMasked(y Int8x16, mask Mask8x16) Int8x16 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Int8x32) SubSaturatedMasked(y Int8x32, mask Mask8x32) Int8x32 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Int8x64) SubSaturatedMasked(y Int8x64, mask Mask8x64) Int8x64 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Int16x8) SubSaturatedMasked(y Int16x8, mask Mask16x8) Int16x8 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Int16x16) SubSaturatedMasked(y Int16x16, mask Mask16x16) Int16x16 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Int16x32) SubSaturatedMasked(y Int16x32, mask Mask16x32) Int16x32 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Uint8x16) SubSaturatedMasked(y Uint8x16, mask Mask8x16) Uint8x16 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Uint8x32) SubSaturatedMasked(y Uint8x32, mask Mask8x32) Uint8x32 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Uint8x64) SubSaturatedMasked(y Uint8x64, mask Mask8x64) Uint8x64 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. 
+// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Uint16x8) SubSaturatedMasked(y Uint16x8, mask Mask16x8) Uint16x8 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Uint16x16) SubSaturatedMasked(y Uint16x16, mask Mask16x16) Uint16x16 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Uint16x32) SubSaturatedMasked(y Uint16x32, mask Mask16x32) Uint16x32 + /* Trunc */ // Trunc truncates elements towards zero. @@ -11775,105 +11671,205 @@ func (x Float64x2) Trunc() Float64x2 // Asm: VROUNDPD, CPU Feature: AVX func (x Float64x4) Trunc() Float64x4 -/* TruncWithPrecision */ +/* TruncScaled */ -// TruncWithPrecision truncates elements with specified precision. +// TruncScaled truncates elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) TruncWithPrecision(prec uint8) Float32x4 +func (x Float32x4) TruncScaled(prec uint8) Float32x4 -// TruncWithPrecision truncates elements with specified precision. +// TruncScaled truncates elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) TruncWithPrecision(prec uint8) Float32x8 +func (x Float32x8) TruncScaled(prec uint8) Float32x8 -// TruncWithPrecision truncates elements with specified precision. +// TruncScaled truncates elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) TruncWithPrecision(prec uint8) Float32x16 +func (x Float32x16) TruncScaled(prec uint8) Float32x16 -// TruncWithPrecision truncates elements with specified precision. +// TruncScaled truncates elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) TruncWithPrecision(prec uint8) Float64x2 +func (x Float64x2) TruncScaled(prec uint8) Float64x2 -// TruncWithPrecision truncates elements with specified precision. +// TruncScaled truncates elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) TruncWithPrecision(prec uint8) Float64x4 +func (x Float64x4) TruncScaled(prec uint8) Float64x4 -// TruncWithPrecision truncates elements with specified precision. +// TruncScaled truncates elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) TruncWithPrecision(prec uint8) Float64x8 +func (x Float64x8) TruncScaled(prec uint8) Float64x8 -/* TruncWithPrecisionMasked */ +/* TruncScaledMasked */ -// TruncWithPrecisionMasked truncates elements with specified precision. +// TruncScaledMasked truncates elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. 
// // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) TruncWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 +func (x Float32x4) TruncScaledMasked(prec uint8, mask Mask32x4) Float32x4 -// TruncWithPrecisionMasked truncates elements with specified precision. +// TruncScaledMasked truncates elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) TruncWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 +func (x Float32x8) TruncScaledMasked(prec uint8, mask Mask32x8) Float32x8 -// TruncWithPrecisionMasked truncates elements with specified precision. +// TruncScaledMasked truncates elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) TruncWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 +func (x Float32x16) TruncScaledMasked(prec uint8, mask Mask32x16) Float32x16 -// TruncWithPrecisionMasked truncates elements with specified precision. +// TruncScaledMasked truncates elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) TruncWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 +func (x Float64x2) TruncScaledMasked(prec uint8, mask Mask64x2) Float64x2 -// TruncWithPrecisionMasked truncates elements with specified precision. +// TruncScaledMasked truncates elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) TruncWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 +func (x Float64x4) TruncScaledMasked(prec uint8, mask Mask64x4) Float64x4 -// TruncWithPrecisionMasked truncates elements with specified precision. +// TruncScaledMasked truncates elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) TruncWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 +func (x Float64x8) TruncScaledMasked(prec uint8, mask Mask64x8) Float64x8 + +/* TruncScaledResidue */ + +// TruncScaledResidue computes the difference after truncating with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) TruncScaledResidue(prec uint8) Float32x4 + +// TruncScaledResidue computes the difference after truncating with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) TruncScaledResidue(prec uint8) Float32x8 + +// TruncScaledResidue computes the difference after truncating with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
+// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) TruncScaledResidue(prec uint8) Float32x16 + +// TruncScaledResidue computes the difference after truncating with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) TruncScaledResidue(prec uint8) Float64x2 + +// TruncScaledResidue computes the difference after truncating with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) TruncScaledResidue(prec uint8) Float64x4 + +// TruncScaledResidue computes the difference after truncating with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) TruncScaledResidue(prec uint8) Float64x8 + +/* TruncScaledResidueMasked */ + +// TruncScaledResidueMasked computes the difference after truncating with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) TruncScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4 + +// TruncScaledResidueMasked computes the difference after truncating with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) TruncScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8 + +// TruncScaledResidueMasked computes the difference after truncating with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) TruncScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16 + +// TruncScaledResidueMasked computes the difference after truncating with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) TruncScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2 + +// TruncScaledResidueMasked computes the difference after truncating with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) TruncScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4 + +// TruncScaledResidueMasked computes the difference after truncating with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
+// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) TruncScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8 /* UnsignedSignedQuadDotProdAccumulate */ diff --git a/src/simd/unary_test.go b/src/simd/unary_test.go index 4263b81cd73..c9fdfff0ffc 100644 --- a/src/simd/unary_test.go +++ b/src/simd/unary_test.go @@ -89,20 +89,20 @@ func TestToInt32(t *testing.T) { testFloat32x8UnaryToInt32(t, simd.Float32x8.ConvertToInt32, toInt32Slice[float32]) } -func TestDiffWithCeilWithPrecision(t *testing.T) { +func TestCeilScaledResidue(t *testing.T) { if !simd.HasAVX512() { t.Skip("Needs AVX512") } testFloat64x8UnaryFlaky(t, - func(x simd.Float64x8) simd.Float64x8 { return x.DiffWithCeilWithPrecision(0) }, + func(x simd.Float64x8) simd.Float64x8 { return x.CeilScaledResidue(0) }, map1(ceilResidueForPrecision[float64](0)), 0.001) testFloat64x8UnaryFlaky(t, - func(x simd.Float64x8) simd.Float64x8 { return x.DiffWithCeilWithPrecision(1) }, + func(x simd.Float64x8) simd.Float64x8 { return x.CeilScaledResidue(1) }, map1(ceilResidueForPrecision[float64](1)), 0.001) testFloat64x8Unary(t, - func(x simd.Float64x8) simd.Float64x8 { return x.Sub(x.CeilWithPrecision(0)) }, + func(x simd.Float64x8) simd.Float64x8 { return x.Sub(x.CeilScaled(0)) }, map1[float64](func(x float64) float64 { return x - math.Ceil(x) })) }
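With the rename, the scalar identity these tests exercise is worth spelling out: CeilScaledResidue(prec) is x minus CeilScaled(prec), elementwise, i.e. the part discarded when x is rounded up on a grid of spacing 2^-prec; at prec = 0 this reduces to the x - math.Ceil(x) check above. A rough scalar model follows; the helper names are illustrative, and VREDUCEPS/VREDUCEPD edge cases for very large inputs are ignored:

package main

import (
	"fmt"
	"math"
)

// ceilScaledResidueRef models CeilScaledResidue(prec) for one element:
// snap x up to a grid of spacing 2^-prec, then return what the
// rounding discarded. At prec == 0 this is x - math.Ceil(x).
func ceilScaledResidueRef(x float64, prec uint8) float64 {
	scale := math.Ldexp(1, int(prec)) // 2^prec
	return x - math.Ceil(x*scale)/scale
}

// truncScaledResidueRef is the TruncScaledResidue analogue: the grid
// point is chosen toward zero instead of toward +Inf.
func truncScaledResidueRef(x float64, prec uint8) float64 {
	scale := math.Ldexp(1, int(prec))
	return x - math.Trunc(x*scale)/scale
}

func main() {
	fmt.Println(ceilScaledResidueRef(2.3, 1))  // ≈ -0.2 (2.3 - 2.5)
	fmt.Println(truncScaledResidueRef(2.3, 1)) // ≈ 0.3 (2.3 - 2.0)
}

The 0.001 tolerance in testFloat64x8UnaryFlaky presumably absorbs the last-bit differences between this kind of double rounding through x*scale and the residue the hardware computes directly.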