[dev.simd] simd, cmd/compile: rename some methods

generated by simdgen CL 692556

these are the "easy" ones
SaturatedOp -> OpSaturated
PairwiseOp -> OpPairs
OpWithPrecision -> OpScaled
DiffWithOpWithPrecision -> OpScaledResidue

Change-Id: I036bf89c0690bcf9922c376d62cef48392942af3
Reviewed-on: https://go-review.googlesource.com/c/go/+/692357
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2025-12-08 06:10:04 +00:00 · 2025-08-01 15:58:29 -04:00 · 2025-08-01 15:58:29 -04:00 · 6b9b59e144
commit 6b9b59e144
parent d375b95357
9 changed files with 4809 additions and 4813 deletions
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -80,6 +80,22 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPADDQ128,
 		ssa.OpAMD64VPADDQ256,
 		ssa.OpAMD64VPADDQ512,
+		ssa.OpAMD64VHADDPS128,
+		ssa.OpAMD64VHADDPS256,
+		ssa.OpAMD64VHADDPD128,
+		ssa.OpAMD64VHADDPD256,
+		ssa.OpAMD64VPHADDW128,
+		ssa.OpAMD64VPHADDW256,
+		ssa.OpAMD64VPHADDD128,
+		ssa.OpAMD64VPHADDD256,
+		ssa.OpAMD64VPHADDSW128,
+		ssa.OpAMD64VPHADDSW256,
+		ssa.OpAMD64VPADDSB128,
+		ssa.OpAMD64VPADDSB256,
+		ssa.OpAMD64VPADDSB512,
+		ssa.OpAMD64VPADDSW128,
+		ssa.OpAMD64VPADDSW256,
+		ssa.OpAMD64VPADDSW512,
 		ssa.OpAMD64VADDSUBPS128,
 		ssa.OpAMD64VADDSUBPS256,
 		ssa.OpAMD64VADDSUBPD128,
@@ -189,12 +205,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VMULPD128,
 		ssa.OpAMD64VMULPD256,
 		ssa.OpAMD64VMULPD512,
-		ssa.OpAMD64VSCALEFPS128,
-		ssa.OpAMD64VSCALEFPS256,
-		ssa.OpAMD64VSCALEFPS512,
-		ssa.OpAMD64VSCALEFPD128,
-		ssa.OpAMD64VSCALEFPD256,
-		ssa.OpAMD64VSCALEFPD512,
+		ssa.OpAMD64VPMULLW128,
+		ssa.OpAMD64VPMULLW256,
+		ssa.OpAMD64VPMULLW512,
+		ssa.OpAMD64VPMULLD128,
+		ssa.OpAMD64VPMULLD256,
+		ssa.OpAMD64VPMULLD512,
+		ssa.OpAMD64VPMULLQ128,
+		ssa.OpAMD64VPMULLQ256,
+		ssa.OpAMD64VPMULLQ512,
 		ssa.OpAMD64VPMULDQ128,
 		ssa.OpAMD64VPMULDQ256,
 		ssa.OpAMD64VPMULDQ512,
@@ -207,15 +226,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULHUW128,
 		ssa.OpAMD64VPMULHUW256,
 		ssa.OpAMD64VPMULHUW512,
-		ssa.OpAMD64VPMULLW128,
-		ssa.OpAMD64VPMULLW256,
-		ssa.OpAMD64VPMULLW512,
-		ssa.OpAMD64VPMULLD128,
-		ssa.OpAMD64VPMULLD256,
-		ssa.OpAMD64VPMULLD512,
-		ssa.OpAMD64VPMULLQ128,
-		ssa.OpAMD64VPMULLQ256,
-		ssa.OpAMD64VPMULLQ512,
 		ssa.OpAMD64VPOR128,
 		ssa.OpAMD64VPOR256,
 		ssa.OpAMD64VPORD512,
@@ -223,22 +233,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMADDWD128,
 		ssa.OpAMD64VPMADDWD256,
 		ssa.OpAMD64VPMADDWD512,
-		ssa.OpAMD64VHADDPS128,
-		ssa.OpAMD64VHADDPS256,
-		ssa.OpAMD64VHADDPD128,
-		ssa.OpAMD64VHADDPD256,
-		ssa.OpAMD64VPHADDW128,
-		ssa.OpAMD64VPHADDW256,
-		ssa.OpAMD64VPHADDD128,
-		ssa.OpAMD64VPHADDD256,
-		ssa.OpAMD64VHSUBPS128,
-		ssa.OpAMD64VHSUBPS256,
-		ssa.OpAMD64VHSUBPD128,
-		ssa.OpAMD64VHSUBPD256,
-		ssa.OpAMD64VPHSUBW128,
-		ssa.OpAMD64VPHSUBW256,
-		ssa.OpAMD64VPHSUBD128,
-		ssa.OpAMD64VPHSUBD256,
 		ssa.OpAMD64VPERMB128,
 		ssa.OpAMD64VPERMB256,
 		ssa.OpAMD64VPERMB512,
@@ -265,25 +259,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPRORVQ128,
 		ssa.OpAMD64VPRORVQ256,
 		ssa.OpAMD64VPRORVQ512,
-		ssa.OpAMD64VPADDSB128,
-		ssa.OpAMD64VPADDSB256,
-		ssa.OpAMD64VPADDSB512,
-		ssa.OpAMD64VPADDSW128,
-		ssa.OpAMD64VPADDSW256,
-		ssa.OpAMD64VPADDSW512,
-		ssa.OpAMD64VPHADDSW128,
-		ssa.OpAMD64VPHADDSW256,
-		ssa.OpAMD64VPHSUBSW128,
-		ssa.OpAMD64VPHSUBSW256,
-		ssa.OpAMD64VPSUBSB128,
-		ssa.OpAMD64VPSUBSB256,
-		ssa.OpAMD64VPSUBSB512,
-		ssa.OpAMD64VPSUBSW128,
-		ssa.OpAMD64VPSUBSW256,
-		ssa.OpAMD64VPSUBSW512,
 		ssa.OpAMD64VPMADDUBSW128,
 		ssa.OpAMD64VPMADDUBSW256,
 		ssa.OpAMD64VPMADDUBSW512,
+		ssa.OpAMD64VSCALEFPS128,
+		ssa.OpAMD64VSCALEFPS256,
+		ssa.OpAMD64VSCALEFPS512,
+		ssa.OpAMD64VSCALEFPD128,
+		ssa.OpAMD64VSCALEFPD256,
+		ssa.OpAMD64VSCALEFPD512,
 		ssa.OpAMD64VPSLLVW128,
 		ssa.OpAMD64VPSLLVW256,
 		ssa.OpAMD64VPSLLVW512,
@@ -335,6 +319,22 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPSUBQ128,
 		ssa.OpAMD64VPSUBQ256,
 		ssa.OpAMD64VPSUBQ512,
+		ssa.OpAMD64VHSUBPS128,
+		ssa.OpAMD64VHSUBPS256,
+		ssa.OpAMD64VHSUBPD128,
+		ssa.OpAMD64VHSUBPD256,
+		ssa.OpAMD64VPHSUBW128,
+		ssa.OpAMD64VPHSUBW256,
+		ssa.OpAMD64VPHSUBD128,
+		ssa.OpAMD64VPHSUBD256,
+		ssa.OpAMD64VPHSUBSW128,
+		ssa.OpAMD64VPHSUBSW256,
+		ssa.OpAMD64VPSUBSB128,
+		ssa.OpAMD64VPSUBSB256,
+		ssa.OpAMD64VPSUBSB512,
+		ssa.OpAMD64VPSUBSW128,
+		ssa.OpAMD64VPSUBSW256,
+		ssa.OpAMD64VPSUBSW512,
 		ssa.OpAMD64VPXOR128,
 		ssa.OpAMD64VPXOR256,
 		ssa.OpAMD64VPXORD512,
@@ -369,6 +369,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPADDQMasked128,
 		ssa.OpAMD64VPADDQMasked256,
 		ssa.OpAMD64VPADDQMasked512,
+		ssa.OpAMD64VPADDSBMasked128,
+		ssa.OpAMD64VPADDSBMasked256,
+		ssa.OpAMD64VPADDSBMasked512,
+		ssa.OpAMD64VPADDSWMasked128,
+		ssa.OpAMD64VPADDSWMasked256,
+		ssa.OpAMD64VPADDSWMasked512,
 		ssa.OpAMD64VPANDDMasked128,
 		ssa.OpAMD64VPANDDMasked256,
 		ssa.OpAMD64VPANDDMasked512,
@@ -456,12 +462,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMINUQMasked128,
 		ssa.OpAMD64VPMINUQMasked256,
 		ssa.OpAMD64VPMINUQMasked512,
-		ssa.OpAMD64VSCALEFPSMasked128,
-		ssa.OpAMD64VSCALEFPSMasked256,
-		ssa.OpAMD64VSCALEFPSMasked512,
-		ssa.OpAMD64VSCALEFPDMasked128,
-		ssa.OpAMD64VSCALEFPDMasked256,
-		ssa.OpAMD64VSCALEFPDMasked512,
 		ssa.OpAMD64VPMULDQMasked128,
 		ssa.OpAMD64VPMULDQMasked256,
 		ssa.OpAMD64VPMULDQMasked512,
@@ -474,6 +474,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULHUWMasked128,
 		ssa.OpAMD64VPMULHUWMasked256,
 		ssa.OpAMD64VPMULHUWMasked512,
+		ssa.OpAMD64VMULPSMasked128,
+		ssa.OpAMD64VMULPSMasked256,
+		ssa.OpAMD64VMULPSMasked512,
+		ssa.OpAMD64VMULPDMasked128,
+		ssa.OpAMD64VMULPDMasked256,
+		ssa.OpAMD64VMULPDMasked512,
 		ssa.OpAMD64VPMULLWMasked128,
 		ssa.OpAMD64VPMULLWMasked256,
 		ssa.OpAMD64VPMULLWMasked512,
@@ -483,12 +489,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULLQMasked128,
 		ssa.OpAMD64VPMULLQMasked256,
 		ssa.OpAMD64VPMULLQMasked512,
-		ssa.OpAMD64VMULPSMasked128,
-		ssa.OpAMD64VMULPSMasked256,
-		ssa.OpAMD64VMULPSMasked512,
-		ssa.OpAMD64VMULPDMasked128,
-		ssa.OpAMD64VMULPDMasked256,
-		ssa.OpAMD64VMULPDMasked512,
 		ssa.OpAMD64VPORDMasked128,
 		ssa.OpAMD64VPORDMasked256,
 		ssa.OpAMD64VPORDMasked512,
@@ -524,21 +524,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPRORVQMasked128,
 		ssa.OpAMD64VPRORVQMasked256,
 		ssa.OpAMD64VPRORVQMasked512,
-		ssa.OpAMD64VPADDSBMasked128,
-		ssa.OpAMD64VPADDSBMasked256,
-		ssa.OpAMD64VPADDSBMasked512,
-		ssa.OpAMD64VPADDSWMasked128,
-		ssa.OpAMD64VPADDSWMasked256,
-		ssa.OpAMD64VPADDSWMasked512,
-		ssa.OpAMD64VPSUBSBMasked128,
-		ssa.OpAMD64VPSUBSBMasked256,
-		ssa.OpAMD64VPSUBSBMasked512,
-		ssa.OpAMD64VPSUBSWMasked128,
-		ssa.OpAMD64VPSUBSWMasked256,
-		ssa.OpAMD64VPSUBSWMasked512,
 		ssa.OpAMD64VPMADDUBSWMasked128,
 		ssa.OpAMD64VPMADDUBSWMasked256,
 		ssa.OpAMD64VPMADDUBSWMasked512,
+		ssa.OpAMD64VSCALEFPSMasked128,
+		ssa.OpAMD64VSCALEFPSMasked256,
+		ssa.OpAMD64VSCALEFPSMasked512,
+		ssa.OpAMD64VSCALEFPDMasked128,
+		ssa.OpAMD64VSCALEFPDMasked256,
+		ssa.OpAMD64VSCALEFPDMasked512,
 		ssa.OpAMD64VPSLLVWMasked128,
 		ssa.OpAMD64VPSLLVWMasked256,
 		ssa.OpAMD64VPSLLVWMasked512,
@@ -584,6 +578,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPSUBQMasked128,
 		ssa.OpAMD64VPSUBQMasked256,
 		ssa.OpAMD64VPSUBQMasked512,
+		ssa.OpAMD64VPSUBSBMasked128,
+		ssa.OpAMD64VPSUBSBMasked256,
+		ssa.OpAMD64VPSUBSBMasked512,
+		ssa.OpAMD64VPSUBSWMasked128,
+		ssa.OpAMD64VPSUBSWMasked256,
+		ssa.OpAMD64VPSUBSWMasked512,
 		ssa.OpAMD64VPXORDMasked128,
 		ssa.OpAMD64VPXORDMasked256,
 		ssa.OpAMD64VPXORDMasked512,
@@ -1085,6 +1085,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPADDQMasked128,
 		ssa.OpAMD64VPADDQMasked256,
 		ssa.OpAMD64VPADDQMasked512,
+		ssa.OpAMD64VPADDSBMasked128,
+		ssa.OpAMD64VPADDSBMasked256,
+		ssa.OpAMD64VPADDSBMasked512,
+		ssa.OpAMD64VPADDSWMasked128,
+		ssa.OpAMD64VPADDSWMasked256,
+		ssa.OpAMD64VPADDSWMasked512,
 		ssa.OpAMD64VPANDDMasked128,
 		ssa.OpAMD64VPANDDMasked256,
 		ssa.OpAMD64VPANDDMasked512,
@@ -1121,6 +1127,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VRNDSCALEPDMasked128,
 		ssa.OpAMD64VRNDSCALEPDMasked256,
 		ssa.OpAMD64VRNDSCALEPDMasked512,
+		ssa.OpAMD64VREDUCEPSMasked128,
+		ssa.OpAMD64VREDUCEPSMasked256,
+		ssa.OpAMD64VREDUCEPSMasked512,
+		ssa.OpAMD64VREDUCEPDMasked128,
+		ssa.OpAMD64VREDUCEPDMasked256,
+		ssa.OpAMD64VREDUCEPDMasked512,
 		ssa.OpAMD64VCOMPRESSPSMasked128,
 		ssa.OpAMD64VCOMPRESSPSMasked256,
 		ssa.OpAMD64VCOMPRESSPSMasked512,
@@ -1145,12 +1157,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VCVTPS2UDQMasked128,
 		ssa.OpAMD64VCVTPS2UDQMasked256,
 		ssa.OpAMD64VCVTPS2UDQMasked512,
-		ssa.OpAMD64VREDUCEPSMasked128,
-		ssa.OpAMD64VREDUCEPSMasked256,
-		ssa.OpAMD64VREDUCEPSMasked512,
-		ssa.OpAMD64VREDUCEPDMasked128,
-		ssa.OpAMD64VREDUCEPDMasked256,
-		ssa.OpAMD64VREDUCEPDMasked512,
 		ssa.OpAMD64VDIVPSMasked128,
 		ssa.OpAMD64VDIVPSMasked256,
 		ssa.OpAMD64VDIVPSMasked512,
@@ -1244,12 +1250,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMINUQMasked128,
 		ssa.OpAMD64VPMINUQMasked256,
 		ssa.OpAMD64VPMINUQMasked512,
-		ssa.OpAMD64VSCALEFPSMasked128,
-		ssa.OpAMD64VSCALEFPSMasked256,
-		ssa.OpAMD64VSCALEFPSMasked512,
-		ssa.OpAMD64VSCALEFPDMasked128,
-		ssa.OpAMD64VSCALEFPDMasked256,
-		ssa.OpAMD64VSCALEFPDMasked512,
 		ssa.OpAMD64VPMULDQMasked128,
 		ssa.OpAMD64VPMULDQMasked256,
 		ssa.OpAMD64VPMULDQMasked512,
@@ -1262,6 +1262,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULHUWMasked128,
 		ssa.OpAMD64VPMULHUWMasked256,
 		ssa.OpAMD64VPMULHUWMasked512,
+		ssa.OpAMD64VMULPSMasked128,
+		ssa.OpAMD64VMULPSMasked256,
+		ssa.OpAMD64VMULPSMasked512,
+		ssa.OpAMD64VMULPDMasked128,
+		ssa.OpAMD64VMULPDMasked256,
+		ssa.OpAMD64VMULPDMasked512,
 		ssa.OpAMD64VPMULLWMasked128,
 		ssa.OpAMD64VPMULLWMasked256,
 		ssa.OpAMD64VPMULLWMasked512,
@@ -1271,12 +1277,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULLQMasked128,
 		ssa.OpAMD64VPMULLQMasked256,
 		ssa.OpAMD64VPMULLQMasked512,
-		ssa.OpAMD64VMULPSMasked128,
-		ssa.OpAMD64VMULPSMasked256,
-		ssa.OpAMD64VMULPSMasked512,
-		ssa.OpAMD64VMULPDMasked128,
-		ssa.OpAMD64VMULPDMasked256,
-		ssa.OpAMD64VMULPDMasked512,
 		ssa.OpAMD64VPORDMasked128,
 		ssa.OpAMD64VPORDMasked256,
 		ssa.OpAMD64VPORDMasked512,
@@ -1357,24 +1357,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPDPWSSDSMasked128,
 		ssa.OpAMD64VPDPWSSDSMasked256,
 		ssa.OpAMD64VPDPWSSDSMasked512,
-		ssa.OpAMD64VPADDSBMasked128,
-		ssa.OpAMD64VPADDSBMasked256,
-		ssa.OpAMD64VPADDSBMasked512,
-		ssa.OpAMD64VPADDSWMasked128,
-		ssa.OpAMD64VPADDSWMasked256,
-		ssa.OpAMD64VPADDSWMasked512,
-		ssa.OpAMD64VPSUBSBMasked128,
-		ssa.OpAMD64VPSUBSBMasked256,
-		ssa.OpAMD64VPSUBSBMasked512,
-		ssa.OpAMD64VPSUBSWMasked128,
-		ssa.OpAMD64VPSUBSWMasked256,
-		ssa.OpAMD64VPSUBSWMasked512,
 		ssa.OpAMD64VPMADDUBSWMasked128,
 		ssa.OpAMD64VPMADDUBSWMasked256,
 		ssa.OpAMD64VPMADDUBSWMasked512,
 		ssa.OpAMD64VPDPBUSDSMasked128,
 		ssa.OpAMD64VPDPBUSDSMasked256,
 		ssa.OpAMD64VPDPBUSDSMasked512,
+		ssa.OpAMD64VSCALEFPSMasked128,
+		ssa.OpAMD64VSCALEFPSMasked256,
+		ssa.OpAMD64VSCALEFPSMasked512,
+		ssa.OpAMD64VSCALEFPDMasked128,
+		ssa.OpAMD64VSCALEFPDMasked256,
+		ssa.OpAMD64VSCALEFPDMasked512,
 		ssa.OpAMD64VPSHLDWMasked128,
 		ssa.OpAMD64VPSHLDWMasked256,
 		ssa.OpAMD64VPSHLDWMasked512,
@@ -1489,6 +1483,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPSUBQMasked128,
 		ssa.OpAMD64VPSUBQMasked256,
 		ssa.OpAMD64VPSUBQMasked512,
+		ssa.OpAMD64VPSUBSBMasked128,
+		ssa.OpAMD64VPSUBSBMasked256,
+		ssa.OpAMD64VPSUBSBMasked512,
+		ssa.OpAMD64VPSUBSWMasked128,
+		ssa.OpAMD64VPSUBSWMasked256,
+		ssa.OpAMD64VPSUBSWMasked512,
 		ssa.OpAMD64VPDPBUSDMasked128,
 		ssa.OpAMD64VPDPBUSDMasked256,
 		ssa.OpAMD64VPDPBUSDMasked512,
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -90,6 +90,44 @@
 (AddMaskedUint64x2 x y mask) => (VPADDQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
 (AddMaskedUint64x4 x y mask) => (VPADDQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
 (AddMaskedUint64x8 x y mask) => (VPADDQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(AddPairsFloat32x4 ...) => (VHADDPS128 ...)
+(AddPairsFloat32x8 ...) => (VHADDPS256 ...)
+(AddPairsFloat64x2 ...) => (VHADDPD128 ...)
+(AddPairsFloat64x4 ...) => (VHADDPD256 ...)
+(AddPairsInt16x8 ...) => (VPHADDW128 ...)
+(AddPairsInt16x16 ...) => (VPHADDW256 ...)
+(AddPairsInt32x4 ...) => (VPHADDD128 ...)
+(AddPairsInt32x8 ...) => (VPHADDD256 ...)
+(AddPairsUint16x8 ...) => (VPHADDW128 ...)
+(AddPairsUint16x16 ...) => (VPHADDW256 ...)
+(AddPairsUint32x4 ...) => (VPHADDD128 ...)
+(AddPairsUint32x8 ...) => (VPHADDD256 ...)
+(AddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...)
+(AddPairsSaturatedInt16x16 ...) => (VPHADDSW256 ...)
+(AddSaturatedInt8x16 ...) => (VPADDSB128 ...)
+(AddSaturatedInt8x32 ...) => (VPADDSB256 ...)
+(AddSaturatedInt8x64 ...) => (VPADDSB512 ...)
+(AddSaturatedInt16x8 ...) => (VPADDSW128 ...)
+(AddSaturatedInt16x16 ...) => (VPADDSW256 ...)
+(AddSaturatedInt16x32 ...) => (VPADDSW512 ...)
+(AddSaturatedUint8x16 ...) => (VPADDSB128 ...)
+(AddSaturatedUint8x32 ...) => (VPADDSB256 ...)
+(AddSaturatedUint8x64 ...) => (VPADDSB512 ...)
+(AddSaturatedUint16x8 ...) => (VPADDSW128 ...)
+(AddSaturatedUint16x16 ...) => (VPADDSW256 ...)
+(AddSaturatedUint16x32 ...) => (VPADDSW512 ...)
+(AddSaturatedMaskedInt8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+(AddSaturatedMaskedInt8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+(AddSaturatedMaskedInt8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(AddSaturatedMaskedInt16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(AddSaturatedMaskedInt16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(AddSaturatedMaskedInt16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+(AddSaturatedMaskedUint8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+(AddSaturatedMaskedUint8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+(AddSaturatedMaskedUint8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(AddSaturatedMaskedUint16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(AddSaturatedMaskedUint16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(AddSaturatedMaskedUint16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
 (AddSubFloat32x4 ...) => (VADDSUBPS128 ...)
 (AddSubFloat32x8 ...) => (VADDSUBPS256 ...)
 (AddSubFloat64x2 ...) => (VADDSUBPD128 ...)
@@ -206,18 +244,30 @@
 (CeilFloat32x8 x) => (VROUNDPS256 [2] x)
 (CeilFloat64x2 x) => (VROUNDPD128 [2] x)
 (CeilFloat64x4 x) => (VROUNDPD256 [2] x)
-(CeilWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+2] x)
-(CeilWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+2] x)
-(CeilWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+2] x)
-(CeilWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+2] x)
-(CeilWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+2] x)
-(CeilWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+2] x)
-(CeilWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(CeilWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+2] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(CeilWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+2] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(CeilWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+2] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(CeilWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+2] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(CeilWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+2] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(CeilScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+2] x)
+(CeilScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+2] x)
+(CeilScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+2] x)
+(CeilScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+2] x)
+(CeilScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+2] x)
+(CeilScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+2] x)
+(CeilScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(CeilScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+2] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(CeilScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+2] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(CeilScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+2] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(CeilScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+2] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(CeilScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+2] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(CeilScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+2] x)
+(CeilScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+2] x)
+(CeilScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+2] x)
+(CeilScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+2] x)
+(CeilScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+2] x)
+(CeilScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+2] x)
+(CeilScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(CeilScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(CeilScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(CeilScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(CeilScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(CeilScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM <types.TypeMask> mask))
 (CompressFloat32x4 x mask) => (VCOMPRESSPSMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
 (CompressFloat32x8 x mask) => (VCOMPRESSPSMasked256 x (VPMOVVec32x8ToM <types.TypeMask> mask))
 (CompressFloat32x16 x mask) => (VCOMPRESSPSMasked512 x (VPMOVVec32x16ToM <types.TypeMask> mask))
@@ -260,54 +310,6 @@
 (ConvertToUint32MaskedFloat32x4 x mask) => (VCVTPS2UDQMasked128 x (VPMOVVec32x4ToM <types.TypeMask> mask))
 (ConvertToUint32MaskedFloat32x8 x mask) => (VCVTPS2UDQMasked256 x (VPMOVVec32x8ToM <types.TypeMask> mask))
 (ConvertToUint32MaskedFloat32x16 x mask) => (VCVTPS2UDQMasked512 x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(DiffWithCeilWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+2] x)
-(DiffWithCeilWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+2] x)
-(DiffWithCeilWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+2] x)
-(DiffWithCeilWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+2] x)
-(DiffWithCeilWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+2] x)
-(DiffWithCeilWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+2] x)
-(DiffWithCeilWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(DiffWithCeilWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(DiffWithCeilWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(DiffWithCeilWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(DiffWithCeilWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(DiffWithCeilWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM <types.TypeMask> mask))
-(DiffWithFloorWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+1] x)
-(DiffWithFloorWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+1] x)
-(DiffWithFloorWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+1] x)
-(DiffWithFloorWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+1] x)
-(DiffWithFloorWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+1] x)
-(DiffWithFloorWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+1] x)
-(DiffWithFloorWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(DiffWithFloorWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(DiffWithFloorWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(DiffWithFloorWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(DiffWithFloorWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(DiffWithFloorWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM <types.TypeMask> mask))
-(DiffWithRoundWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+0] x)
-(DiffWithRoundWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+0] x)
-(DiffWithRoundWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+0] x)
-(DiffWithRoundWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+0] x)
-(DiffWithRoundWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+0] x)
-(DiffWithRoundWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+0] x)
-(DiffWithRoundWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(DiffWithRoundWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(DiffWithRoundWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(DiffWithRoundWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(DiffWithRoundWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(DiffWithRoundWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM <types.TypeMask> mask))
-(DiffWithTruncWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+3] x)
-(DiffWithTruncWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+3] x)
-(DiffWithTruncWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+3] x)
-(DiffWithTruncWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+3] x)
-(DiffWithTruncWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+3] x)
-(DiffWithTruncWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+3] x)
-(DiffWithTruncWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(DiffWithTruncWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(DiffWithTruncWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(DiffWithTruncWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(DiffWithTruncWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(DiffWithTruncWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM <types.TypeMask> mask))
 (DivFloat32x4 ...) => (VDIVPS128 ...)
 (DivFloat32x8 ...) => (VDIVPS256 ...)
 (DivFloat32x16 ...) => (VDIVPS512 ...)
@@ -387,18 +389,30 @@
 (FloorFloat32x8 x) => (VROUNDPS256 [1] x)
 (FloorFloat64x2 x) => (VROUNDPD128 [1] x)
 (FloorFloat64x4 x) => (VROUNDPD256 [1] x)
-(FloorWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+1] x)
-(FloorWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+1] x)
-(FloorWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+1] x)
-(FloorWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+1] x)
-(FloorWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+1] x)
-(FloorWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+1] x)
-(FloorWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+1] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(FloorWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+1] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(FloorWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+1] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(FloorWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+1] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(FloorWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+1] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(FloorWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+1] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(FloorScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+1] x)
+(FloorScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+1] x)
+(FloorScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+1] x)
+(FloorScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+1] x)
+(FloorScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+1] x)
+(FloorScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+1] x)
+(FloorScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+1] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(FloorScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+1] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(FloorScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+1] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(FloorScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+1] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(FloorScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+1] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(FloorScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+1] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(FloorScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+1] x)
+(FloorScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+1] x)
+(FloorScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+1] x)
+(FloorScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+1] x)
+(FloorScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+1] x)
+(FloorScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+1] x)
+(FloorScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(FloorScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(FloorScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(FloorScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(FloorScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(FloorScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM <types.TypeMask> mask))
 (FusedMultiplyAddFloat32x4 ...) => (VFMADD213PS128 ...)
 (FusedMultiplyAddFloat32x8 ...) => (VFMADD213PS256 ...)
 (FusedMultiplyAddFloat32x16 ...) => (VFMADD213PS512 ...)
@@ -849,18 +863,15 @@
 (MulFloat64x2 ...) => (VMULPD128 ...)
 (MulFloat64x4 ...) => (VMULPD256 ...)
 (MulFloat64x8 ...) => (VMULPD512 ...)
-(MulByPowOf2Float32x4 ...) => (VSCALEFPS128 ...)
-(MulByPowOf2Float32x8 ...) => (VSCALEFPS256 ...)
-(MulByPowOf2Float32x16 ...) => (VSCALEFPS512 ...)
-(MulByPowOf2Float64x2 ...) => (VSCALEFPD128 ...)
-(MulByPowOf2Float64x4 ...) => (VSCALEFPD256 ...)
-(MulByPowOf2Float64x8 ...) => (VSCALEFPD512 ...)
-(MulByPowOf2MaskedFloat32x4 x y mask) => (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
-(MulByPowOf2MaskedFloat32x8 x y mask) => (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
-(MulByPowOf2MaskedFloat32x16 x y mask) => (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
-(MulByPowOf2MaskedFloat64x2 x y mask) => (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
-(MulByPowOf2MaskedFloat64x4 x y mask) => (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
-(MulByPowOf2MaskedFloat64x8 x y mask) => (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(MulInt16x8 ...) => (VPMULLW128 ...)
+(MulInt16x16 ...) => (VPMULLW256 ...)
+(MulInt16x32 ...) => (VPMULLW512 ...)
+(MulInt32x4 ...) => (VPMULLD128 ...)
+(MulInt32x8 ...) => (VPMULLD256 ...)
+(MulInt32x16 ...) => (VPMULLD512 ...)
+(MulInt64x2 ...) => (VPMULLQ128 ...)
+(MulInt64x4 ...) => (VPMULLQ256 ...)
+(MulInt64x8 ...) => (VPMULLQ512 ...)
 (MulEvenWidenInt32x4 ...) => (VPMULDQ128 ...)
 (MulEvenWidenInt32x8 ...) => (VPMULDQ256 ...)
 (MulEvenWidenInt64x2 ...) => (VPMULDQ128 ...)
@@ -889,30 +900,21 @@
 (MulHighMaskedUint16x8 x y mask) => (VPMULHUWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
 (MulHighMaskedUint16x16 x y mask) => (VPMULHUWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
 (MulHighMaskedUint16x32 x y mask) => (VPMULHUWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-(MulLowInt16x8 ...) => (VPMULLW128 ...)
-(MulLowInt16x16 ...) => (VPMULLW256 ...)
-(MulLowInt16x32 ...) => (VPMULLW512 ...)
-(MulLowInt32x4 ...) => (VPMULLD128 ...)
-(MulLowInt32x8 ...) => (VPMULLD256 ...)
-(MulLowInt32x16 ...) => (VPMULLD512 ...)
-(MulLowInt64x2 ...) => (VPMULLQ128 ...)
-(MulLowInt64x4 ...) => (VPMULLQ256 ...)
-(MulLowInt64x8 ...) => (VPMULLQ512 ...)
-(MulLowMaskedInt16x8 x y mask) => (VPMULLWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-(MulLowMaskedInt16x16 x y mask) => (VPMULLWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-(MulLowMaskedInt16x32 x y mask) => (VPMULLWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-(MulLowMaskedInt32x4 x y mask) => (VPMULLDMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
-(MulLowMaskedInt32x8 x y mask) => (VPMULLDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
-(MulLowMaskedInt32x16 x y mask) => (VPMULLDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
-(MulLowMaskedInt64x2 x y mask) => (VPMULLQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
-(MulLowMaskedInt64x4 x y mask) => (VPMULLQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
-(MulLowMaskedInt64x8 x y mask) => (VPMULLQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
 (MulMaskedFloat32x4 x y mask) => (VMULPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
 (MulMaskedFloat32x8 x y mask) => (VMULPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
 (MulMaskedFloat32x16 x y mask) => (VMULPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
 (MulMaskedFloat64x2 x y mask) => (VMULPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
 (MulMaskedFloat64x4 x y mask) => (VMULPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
 (MulMaskedFloat64x8 x y mask) => (VMULPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(MulMaskedInt16x8 x y mask) => (VPMULLWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(MulMaskedInt16x16 x y mask) => (VPMULLWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(MulMaskedInt16x32 x y mask) => (VPMULLWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+(MulMaskedInt32x4 x y mask) => (VPMULLDMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
+(MulMaskedInt32x8 x y mask) => (VPMULLDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+(MulMaskedInt32x16 x y mask) => (VPMULLDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+(MulMaskedInt64x2 x y mask) => (VPMULLQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
+(MulMaskedInt64x4 x y mask) => (VPMULLQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+(MulMaskedInt64x8 x y mask) => (VPMULLQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
 (NotEqualFloat32x4 x y) => (VCMPPS128 [4] x y)
 (NotEqualFloat32x8 x y) => (VCMPPS256 [4] x y)
 (NotEqualFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [4] x y))
@ -1015,30 +1017,6 @@
 (PairDotProdMaskedInt16x8 x y mask) => (VPMADDWDMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
 (PairDotProdMaskedInt16x16 x y mask) => (VPMADDWDMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
 (PairDotProdMaskedInt16x32 x y mask) => (VPMADDWDMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-(PairwiseAddFloat32x4 ...) => (VHADDPS128 ...)
-(PairwiseAddFloat32x8 ...) => (VHADDPS256 ...)
-(PairwiseAddFloat64x2 ...) => (VHADDPD128 ...)
-(PairwiseAddFloat64x4 ...) => (VHADDPD256 ...)
-(PairwiseAddInt16x8 ...) => (VPHADDW128 ...)
-(PairwiseAddInt16x16 ...) => (VPHADDW256 ...)
-(PairwiseAddInt32x4 ...) => (VPHADDD128 ...)
-(PairwiseAddInt32x8 ...) => (VPHADDD256 ...)
-(PairwiseAddUint16x8 ...) => (VPHADDW128 ...)
-(PairwiseAddUint16x16 ...) => (VPHADDW256 ...)
-(PairwiseAddUint32x4 ...) => (VPHADDD128 ...)
-(PairwiseAddUint32x8 ...) => (VPHADDD256 ...)
-(PairwiseSubFloat32x4 ...) => (VHSUBPS128 ...)
-(PairwiseSubFloat32x8 ...) => (VHSUBPS256 ...)
-(PairwiseSubFloat64x2 ...) => (VHSUBPD128 ...)
-(PairwiseSubFloat64x4 ...) => (VHSUBPD256 ...)
-(PairwiseSubInt16x8 ...) => (VPHSUBW128 ...)
-(PairwiseSubInt16x16 ...) => (VPHSUBW256 ...)
-(PairwiseSubInt32x4 ...) => (VPHSUBD128 ...)
-(PairwiseSubInt32x8 ...) => (VPHSUBD256 ...)
-(PairwiseSubUint16x8 ...) => (VPHSUBW128 ...)
-(PairwiseSubUint16x16 ...) => (VPHSUBW256 ...)
-(PairwiseSubUint32x4 ...) => (VPHSUBD128 ...)
-(PairwiseSubUint32x8 ...) => (VPHSUBD256 ...)
 (PermuteFloat32x8 ...) => (VPERMPS256 ...)
 (PermuteFloat32x16 ...) => (VPERMPS512 ...)
 (PermuteFloat64x4 ...) => (VPERMPD256 ...)
@ -1295,76 +1273,36 @@
 (RoundFloat32x8 x) => (VROUNDPS256 [0] x)
 (RoundFloat64x2 x) => (VROUNDPD128 [0] x)
 (RoundFloat64x4 x) => (VROUNDPD256 [0] x)
-(RoundWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+0] x)
-(RoundWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+0] x)
-(RoundWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+0] x)
-(RoundWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+0] x)
-(RoundWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+0] x)
-(RoundWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+0] x)
-(RoundWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(RoundWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(RoundWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(RoundWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(RoundWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(RoundWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM <types.TypeMask> mask))
-(SaturatedAddInt8x16 ...) => (VPADDSB128 ...)
-(SaturatedAddInt8x32 ...) => (VPADDSB256 ...)
-(SaturatedAddInt8x64 ...) => (VPADDSB512 ...)
-(SaturatedAddInt16x8 ...) => (VPADDSW128 ...)
-(SaturatedAddInt16x16 ...) => (VPADDSW256 ...)
-(SaturatedAddInt16x32 ...) => (VPADDSW512 ...)
-(SaturatedAddUint8x16 ...) => (VPADDSB128 ...)
-(SaturatedAddUint8x32 ...) => (VPADDSB256 ...)
-(SaturatedAddUint8x64 ...) => (VPADDSB512 ...)
-(SaturatedAddUint16x8 ...) => (VPADDSW128 ...)
-(SaturatedAddUint16x16 ...) => (VPADDSW256 ...)
-(SaturatedAddUint16x32 ...) => (VPADDSW512 ...)
+(RoundScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+0] x)
+(RoundScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+0] x)
+(RoundScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+0] x)
+(RoundScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+0] x)
+(RoundScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+0] x)
+(RoundScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+0] x)
+(RoundScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(RoundScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(RoundScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(RoundScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(RoundScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(RoundScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(RoundScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+0] x)
+(RoundScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+0] x)
+(RoundScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+0] x)
+(RoundScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+0] x)
+(RoundScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+0] x)
+(RoundScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+0] x)
+(RoundScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(RoundScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(RoundScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(RoundScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(RoundScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(RoundScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM <types.TypeMask> mask))
 (SaturatedAddDotProdInt32x4 ...) => (VPDPWSSDS128 ...)
 (SaturatedAddDotProdInt32x8 ...) => (VPDPWSSDS256 ...)
 (SaturatedAddDotProdInt32x16 ...) => (VPDPWSSDS512 ...)
 (SaturatedAddDotProdMaskedInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
 (SaturatedAddDotProdMaskedInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
 (SaturatedAddDotProdMaskedInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
-(SaturatedAddMaskedInt8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
-(SaturatedAddMaskedInt8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
-(SaturatedAddMaskedInt8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
-(SaturatedAddMaskedInt16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-(SaturatedAddMaskedInt16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-(SaturatedAddMaskedInt16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-(SaturatedAddMaskedUint8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
-(SaturatedAddMaskedUint8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
-(SaturatedAddMaskedUint8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
-(SaturatedAddMaskedUint16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-(SaturatedAddMaskedUint16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-(SaturatedAddMaskedUint16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-(SaturatedPairwiseAddInt16x8 ...) => (VPHADDSW128 ...)
-(SaturatedPairwiseAddInt16x16 ...) => (VPHADDSW256 ...)
-(SaturatedPairwiseSubInt16x8 ...) => (VPHSUBSW128 ...)
-(SaturatedPairwiseSubInt16x16 ...) => (VPHSUBSW256 ...)
-(SaturatedSubInt8x16 ...) => (VPSUBSB128 ...)
-(SaturatedSubInt8x32 ...) => (VPSUBSB256 ...)
-(SaturatedSubInt8x64 ...) => (VPSUBSB512 ...)
-(SaturatedSubInt16x8 ...) => (VPSUBSW128 ...)
-(SaturatedSubInt16x16 ...) => (VPSUBSW256 ...)
-(SaturatedSubInt16x32 ...) => (VPSUBSW512 ...)
-(SaturatedSubUint8x16 ...) => (VPSUBSB128 ...)
-(SaturatedSubUint8x32 ...) => (VPSUBSB256 ...)
-(SaturatedSubUint8x64 ...) => (VPSUBSB512 ...)
-(SaturatedSubUint16x8 ...) => (VPSUBSW128 ...)
-(SaturatedSubUint16x16 ...) => (VPSUBSW256 ...)
-(SaturatedSubUint16x32 ...) => (VPSUBSW512 ...)
-(SaturatedSubMaskedInt8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
-(SaturatedSubMaskedInt8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
-(SaturatedSubMaskedInt8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
-(SaturatedSubMaskedInt16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-(SaturatedSubMaskedInt16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-(SaturatedSubMaskedInt16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
-(SaturatedSubMaskedUint8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
-(SaturatedSubMaskedUint8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
-(SaturatedSubMaskedUint8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
-(SaturatedSubMaskedUint16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
-(SaturatedSubMaskedUint16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
-(SaturatedSubMaskedUint16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
 (SaturatedUnsignedSignedPairDotProdUint8x16 ...) => (VPMADDUBSW128 ...)
 (SaturatedUnsignedSignedPairDotProdUint8x32 ...) => (VPMADDUBSW256 ...)
 (SaturatedUnsignedSignedPairDotProdUint8x64 ...) => (VPMADDUBSW512 ...)
@ -1377,6 +1315,18 @@
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
+(ScaleFloat32x4 ...) => (VSCALEFPS128 ...)
+(ScaleFloat32x8 ...) => (VSCALEFPS256 ...)
+(ScaleFloat32x16 ...) => (VSCALEFPS512 ...)
+(ScaleFloat64x2 ...) => (VSCALEFPD128 ...)
+(ScaleFloat64x4 ...) => (VSCALEFPD256 ...)
+(ScaleFloat64x8 ...) => (VSCALEFPD512 ...)
+(ScaleMaskedFloat32x4 x y mask) => (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
+(ScaleMaskedFloat32x8 x y mask) => (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
+(ScaleMaskedFloat32x16 x y mask) => (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
+(ScaleMaskedFloat64x2 x y mask) => (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
+(ScaleMaskedFloat64x4 x y mask) => (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
+(ScaleMaskedFloat64x8 x y mask) => (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
 (Set128Float32x8 ...) => (VINSERTF128256 ...)
 (Set128Float64x4 ...) => (VINSERTF128256 ...)
 (Set128Int8x32 ...) => (VINSERTI128256 ...)
@ -1761,22 +1711,72 @@
 (SubMaskedUint64x2 x y mask) => (VPSUBQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
 (SubMaskedUint64x4 x y mask) => (VPSUBQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
 (SubMaskedUint64x8 x y mask) => (VPSUBQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(SubPairsFloat32x4 ...) => (VHSUBPS128 ...)
+(SubPairsFloat32x8 ...) => (VHSUBPS256 ...)
+(SubPairsFloat64x2 ...) => (VHSUBPD128 ...)
+(SubPairsFloat64x4 ...) => (VHSUBPD256 ...)
+(SubPairsInt16x8 ...) => (VPHSUBW128 ...)
+(SubPairsInt16x16 ...) => (VPHSUBW256 ...)
+(SubPairsInt32x4 ...) => (VPHSUBD128 ...)
+(SubPairsInt32x8 ...) => (VPHSUBD256 ...)
+(SubPairsUint16x8 ...) => (VPHSUBW128 ...)
+(SubPairsUint16x16 ...) => (VPHSUBW256 ...)
+(SubPairsUint32x4 ...) => (VPHSUBD128 ...)
+(SubPairsUint32x8 ...) => (VPHSUBD256 ...)
+(SubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...)
+(SubPairsSaturatedInt16x16 ...) => (VPHSUBSW256 ...)
+(SubSaturatedInt8x16 ...) => (VPSUBSB128 ...)
+(SubSaturatedInt8x32 ...) => (VPSUBSB256 ...)
+(SubSaturatedInt8x64 ...) => (VPSUBSB512 ...)
+(SubSaturatedInt16x8 ...) => (VPSUBSW128 ...)
+(SubSaturatedInt16x16 ...) => (VPSUBSW256 ...)
+(SubSaturatedInt16x32 ...) => (VPSUBSW512 ...)
+(SubSaturatedUint8x16 ...) => (VPSUBSB128 ...)
+(SubSaturatedUint8x32 ...) => (VPSUBSB256 ...)
+(SubSaturatedUint8x64 ...) => (VPSUBSB512 ...)
+(SubSaturatedUint16x8 ...) => (VPSUBSW128 ...)
+(SubSaturatedUint16x16 ...) => (VPSUBSW256 ...)
+(SubSaturatedUint16x32 ...) => (VPSUBSW512 ...)
+(SubSaturatedMaskedInt8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+(SubSaturatedMaskedInt8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+(SubSaturatedMaskedInt8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(SubSaturatedMaskedInt16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(SubSaturatedMaskedInt16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(SubSaturatedMaskedInt16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
+(SubSaturatedMaskedUint8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
+(SubSaturatedMaskedUint8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
+(SubSaturatedMaskedUint8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
+(SubSaturatedMaskedUint16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
+(SubSaturatedMaskedUint16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
+(SubSaturatedMaskedUint16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
 (TruncFloat32x4 x) => (VROUNDPS128 [3] x)
 (TruncFloat32x8 x) => (VROUNDPS256 [3] x)
 (TruncFloat64x2 x) => (VROUNDPD128 [3] x)
 (TruncFloat64x4 x) => (VROUNDPD256 [3] x)
-(TruncWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+3] x)
-(TruncWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+3] x)
-(TruncWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+3] x)
-(TruncWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+3] x)
-(TruncWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+3] x)
-(TruncWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+3] x)
-(TruncWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+3] x (VPMOVVec32x4ToM <types.TypeMask> mask))
-(TruncWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+3] x (VPMOVVec32x8ToM <types.TypeMask> mask))
-(TruncWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+3] x (VPMOVVec32x16ToM <types.TypeMask> mask))
-(TruncWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM <types.TypeMask> mask))
-(TruncWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM <types.TypeMask> mask))
-(TruncWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(TruncScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+3] x)
+(TruncScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+3] x)
+(TruncScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+3] x)
+(TruncScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+3] x)
+(TruncScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+3] x)
+(TruncScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+3] x)
+(TruncScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+3] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(TruncScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+3] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(TruncScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+3] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(TruncScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(TruncScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(TruncScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM <types.TypeMask> mask))
+(TruncScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+3] x)
+(TruncScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+3] x)
+(TruncScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+3] x)
+(TruncScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+3] x)
+(TruncScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+3] x)
+(TruncScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+3] x)
+(TruncScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM <types.TypeMask> mask))
+(TruncScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM <types.TypeMask> mask))
+(TruncScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM <types.TypeMask> mask))
+(TruncScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM <types.TypeMask> mask))
+(TruncScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM <types.TypeMask> mask))
+(TruncScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM <types.TypeMask> mask))
 (UnsignedSignedQuadDotProdAccumulateInt32x4 ...) => (VPDPBUSD128 ...)
 (UnsignedSignedQuadDotProdAccumulateInt32x8 ...) => (VPDPBUSD256 ...)
 (UnsignedSignedQuadDotProdAccumulateInt32x16 ...) => (VPDPBUSD512 ...)
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@ -81,6 +81,44 @@ func simdGenericOps() []opData {
 		{name: "AddMaskedUint64x2", argLength: 3, commutative: true},
 		{name: "AddMaskedUint64x4", argLength: 3, commutative: true},
 		{name: "AddMaskedUint64x8", argLength: 3, commutative: true},
+		{name: "AddPairsFloat32x4", argLength: 2, commutative: false},
+		{name: "AddPairsFloat32x8", argLength: 2, commutative: false},
+		{name: "AddPairsFloat64x2", argLength: 2, commutative: false},
+		{name: "AddPairsFloat64x4", argLength: 2, commutative: false},
+		{name: "AddPairsInt16x8", argLength: 2, commutative: false},
+		{name: "AddPairsInt16x16", argLength: 2, commutative: false},
+		{name: "AddPairsInt32x4", argLength: 2, commutative: false},
+		{name: "AddPairsInt32x8", argLength: 2, commutative: false},
+		{name: "AddPairsSaturatedInt16x8", argLength: 2, commutative: false},
+		{name: "AddPairsSaturatedInt16x16", argLength: 2, commutative: false},
+		{name: "AddPairsUint16x8", argLength: 2, commutative: false},
+		{name: "AddPairsUint16x16", argLength: 2, commutative: false},
+		{name: "AddPairsUint32x4", argLength: 2, commutative: false},
+		{name: "AddPairsUint32x8", argLength: 2, commutative: false},
+		{name: "AddSaturatedInt8x16", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt8x32", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt8x64", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt16x8", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt16x16", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt16x32", argLength: 2, commutative: true},
+		{name: "AddSaturatedMaskedInt8x16", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt8x32", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt8x64", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt16x8", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt16x16", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt16x32", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint8x16", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint8x32", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint8x64", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint16x8", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint16x16", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint16x32", argLength: 3, commutative: true},
+		{name: "AddSaturatedUint8x16", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint8x32", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint8x64", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint16x8", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint16x16", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint16x32", argLength: 2, commutative: true},
 		{name: "AddSubFloat32x4", argLength: 2, commutative: false},
 		{name: "AddSubFloat32x8", argLength: 2, commutative: false},
 		{name: "AddSubFloat64x2", argLength: 2, commutative: false},
@ -744,18 +782,6 @@ func simdGenericOps() []opData {
 		{name: "MinUint64x2", argLength: 2, commutative: true},
 		{name: "MinUint64x4", argLength: 2, commutative: true},
 		{name: "MinUint64x8", argLength: 2, commutative: true},
-		{name: "MulByPowOf2Float32x4", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float32x8", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float32x16", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float64x2", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float64x4", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float64x8", argLength: 2, commutative: false},
-		{name: "MulByPowOf2MaskedFloat32x4", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat32x8", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat32x16", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat64x2", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat64x4", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat64x8", argLength: 3, commutative: false},
 		{name: "MulEvenWidenInt32x4", argLength: 2, commutative: true},
 		{name: "MulEvenWidenInt32x8", argLength: 2, commutative: true},
 		{name: "MulEvenWidenInt64x2", argLength: 2, commutative: true},
@ -790,30 +816,30 @@ func simdGenericOps() []opData {
 		{name: "MulHighUint16x8", argLength: 2, commutative: true},
 		{name: "MulHighUint16x16", argLength: 2, commutative: true},
 		{name: "MulHighUint16x32", argLength: 2, commutative: true},
-		{name: "MulLowInt16x8", argLength: 2, commutative: true},
-		{name: "MulLowInt16x16", argLength: 2, commutative: true},
-		{name: "MulLowInt16x32", argLength: 2, commutative: true},
-		{name: "MulLowInt32x4", argLength: 2, commutative: true},
-		{name: "MulLowInt32x8", argLength: 2, commutative: true},
-		{name: "MulLowInt32x16", argLength: 2, commutative: true},
-		{name: "MulLowInt64x2", argLength: 2, commutative: true},
-		{name: "MulLowInt64x4", argLength: 2, commutative: true},
-		{name: "MulLowInt64x8", argLength: 2, commutative: true},
-		{name: "MulLowMaskedInt16x8", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt16x16", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt16x32", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt32x4", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt32x8", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt32x16", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt64x2", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt64x4", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt64x8", argLength: 3, commutative: true},
+		{name: "MulInt16x8", argLength: 2, commutative: true},
+		{name: "MulInt16x16", argLength: 2, commutative: true},
+		{name: "MulInt16x32", argLength: 2, commutative: true},
+		{name: "MulInt32x4", argLength: 2, commutative: true},
+		{name: "MulInt32x8", argLength: 2, commutative: true},
+		{name: "MulInt32x16", argLength: 2, commutative: true},
+		{name: "MulInt64x2", argLength: 2, commutative: true},
+		{name: "MulInt64x4", argLength: 2, commutative: true},
+		{name: "MulInt64x8", argLength: 2, commutative: true},
 		{name: "MulMaskedFloat32x4", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat32x8", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat32x16", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat64x2", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat64x4", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat64x8", argLength: 3, commutative: true},
+		{name: "MulMaskedInt16x8", argLength: 3, commutative: true},
+		{name: "MulMaskedInt16x16", argLength: 3, commutative: true},
+		{name: "MulMaskedInt16x32", argLength: 3, commutative: true},
+		{name: "MulMaskedInt32x4", argLength: 3, commutative: true},
+		{name: "MulMaskedInt32x8", argLength: 3, commutative: true},
+		{name: "MulMaskedInt32x16", argLength: 3, commutative: true},
+		{name: "MulMaskedInt64x2", argLength: 3, commutative: true},
+		{name: "MulMaskedInt64x4", argLength: 3, commutative: true},
+		{name: "MulMaskedInt64x8", argLength: 3, commutative: true},
 		{name: "NotEqualFloat32x4", argLength: 2, commutative: true},
 		{name: "NotEqualFloat32x8", argLength: 2, commutative: true},
 		{name: "NotEqualFloat32x16", argLength: 2, commutative: true},
@ -916,30 +942,6 @@ func simdGenericOps() []opData {
 		{name: "PairDotProdMaskedInt16x8", argLength: 3, commutative: false},
 		{name: "PairDotProdMaskedInt16x16", argLength: 3, commutative: false},
 		{name: "PairDotProdMaskedInt16x32", argLength: 3, commutative: false},
-		{name: "PairwiseAddFloat32x4", argLength: 2, commutative: false},
-		{name: "PairwiseAddFloat32x8", argLength: 2, commutative: false},
-		{name: "PairwiseAddFloat64x2", argLength: 2, commutative: false},
-		{name: "PairwiseAddFloat64x4", argLength: 2, commutative: false},
-		{name: "PairwiseAddInt16x8", argLength: 2, commutative: false},
-		{name: "PairwiseAddInt16x16", argLength: 2, commutative: false},
-		{name: "PairwiseAddInt32x4", argLength: 2, commutative: false},
-		{name: "PairwiseAddInt32x8", argLength: 2, commutative: false},
-		{name: "PairwiseAddUint16x8", argLength: 2, commutative: false},
-		{name: "PairwiseAddUint16x16", argLength: 2, commutative: false},
-		{name: "PairwiseAddUint32x4", argLength: 2, commutative: false},
-		{name: "PairwiseAddUint32x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubFloat32x4", argLength: 2, commutative: false},
-		{name: "PairwiseSubFloat32x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubFloat64x2", argLength: 2, commutative: false},
-		{name: "PairwiseSubFloat64x4", argLength: 2, commutative: false},
-		{name: "PairwiseSubInt16x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubInt16x16", argLength: 2, commutative: false},
-		{name: "PairwiseSubInt32x4", argLength: 2, commutative: false},
-		{name: "PairwiseSubInt32x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubUint16x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubUint16x16", argLength: 2, commutative: false},
-		{name: "PairwiseSubUint32x4", argLength: 2, commutative: false},
-		{name: "PairwiseSubUint32x8", argLength: 2, commutative: false},
 		{name: "Permute2Float32x4", argLength: 3, commutative: false},
 		{name: "Permute2Float32x8", argLength: 3, commutative: false},
 		{name: "Permute2Float32x16", argLength: 3, commutative: false},
@ -1154,58 +1156,6 @@ func simdGenericOps() []opData {
 		{name: "SaturatedAddDotProdMaskedInt32x4", argLength: 4, commutative: false},
 		{name: "SaturatedAddDotProdMaskedInt32x8", argLength: 4, commutative: false},
 		{name: "SaturatedAddDotProdMaskedInt32x16", argLength: 4, commutative: false},
-		{name: "SaturatedAddInt8x16", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt8x32", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt8x64", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt16x8", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt16x16", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt16x32", argLength: 2, commutative: true},
-		{name: "SaturatedAddMaskedInt8x16", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt8x32", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt8x64", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt16x8", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt16x16", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt16x32", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint8x16", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint8x32", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint8x64", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint16x8", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint16x16", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint16x32", argLength: 3, commutative: true},
-		{name: "SaturatedAddUint8x16", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint8x32", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint8x64", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint16x8", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint16x16", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint16x32", argLength: 2, commutative: true},
-		{name: "SaturatedPairwiseAddInt16x8", argLength: 2, commutative: false},
-		{name: "SaturatedPairwiseAddInt16x16", argLength: 2, commutative: false},
-		{name: "SaturatedPairwiseSubInt16x8", argLength: 2, commutative: false},
-		{name: "SaturatedPairwiseSubInt16x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt8x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt8x32", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt8x64", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt16x8", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt16x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt16x32", argLength: 2, commutative: false},
-		{name: "SaturatedSubMaskedInt8x16", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt8x32", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt8x64", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt16x8", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt16x16", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt16x32", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint8x16", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint8x32", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint8x64", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint16x8", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint16x16", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint16x32", argLength: 3, commutative: false},
-		{name: "SaturatedSubUint8x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint8x32", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint8x64", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint16x8", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint16x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint16x32", argLength: 2, commutative: false},
 		{name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x16", argLength: 3, commutative: false},
 		{name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x32", argLength: 3, commutative: false},
 		{name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x64", argLength: 3, commutative: false},
@ -1218,6 +1168,18 @@ func simdGenericOps() []opData {
 		{name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4", argLength: 4, commutative: false},
 		{name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8", argLength: 4, commutative: false},
 		{name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16", argLength: 4, commutative: false},
+		{name: "ScaleFloat32x4", argLength: 2, commutative: false},
+		{name: "ScaleFloat32x8", argLength: 2, commutative: false},
+		{name: "ScaleFloat32x16", argLength: 2, commutative: false},
+		{name: "ScaleFloat64x2", argLength: 2, commutative: false},
+		{name: "ScaleFloat64x4", argLength: 2, commutative: false},
+		{name: "ScaleFloat64x8", argLength: 2, commutative: false},
+		{name: "ScaleMaskedFloat32x4", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat32x8", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat32x16", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat64x2", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat64x4", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat64x8", argLength: 3, commutative: false},
 		{name: "ShiftAllLeftInt16x8", argLength: 2, commutative: false},
 		{name: "ShiftAllLeftInt16x16", argLength: 2, commutative: false},
 		{name: "ShiftAllLeftInt16x32", argLength: 2, commutative: false},
@ -1500,6 +1462,44 @@ func simdGenericOps() []opData {
 		{name: "SubMaskedUint64x2", argLength: 3, commutative: false},
 		{name: "SubMaskedUint64x4", argLength: 3, commutative: false},
 		{name: "SubMaskedUint64x8", argLength: 3, commutative: false},
+		{name: "SubPairsFloat32x4", argLength: 2, commutative: false},
+		{name: "SubPairsFloat32x8", argLength: 2, commutative: false},
+		{name: "SubPairsFloat64x2", argLength: 2, commutative: false},
+		{name: "SubPairsFloat64x4", argLength: 2, commutative: false},
+		{name: "SubPairsInt16x8", argLength: 2, commutative: false},
+		{name: "SubPairsInt16x16", argLength: 2, commutative: false},
+		{name: "SubPairsInt32x4", argLength: 2, commutative: false},
+		{name: "SubPairsInt32x8", argLength: 2, commutative: false},
+		{name: "SubPairsSaturatedInt16x8", argLength: 2, commutative: false},
+		{name: "SubPairsSaturatedInt16x16", argLength: 2, commutative: false},
+		{name: "SubPairsUint16x8", argLength: 2, commutative: false},
+		{name: "SubPairsUint16x16", argLength: 2, commutative: false},
+		{name: "SubPairsUint32x4", argLength: 2, commutative: false},
+		{name: "SubPairsUint32x8", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt8x16", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt8x32", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt8x64", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt16x8", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt16x16", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt16x32", argLength: 2, commutative: false},
+		{name: "SubSaturatedMaskedInt8x16", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt8x32", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt8x64", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt16x8", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt16x16", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt16x32", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint8x16", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint8x32", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint8x64", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint16x8", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint16x16", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint16x32", argLength: 3, commutative: false},
+		{name: "SubSaturatedUint8x16", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint8x32", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint8x64", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint16x8", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint16x16", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint16x32", argLength: 2, commutative: false},
 		{name: "SubUint8x16", argLength: 2, commutative: false},
 		{name: "SubUint8x32", argLength: 2, commutative: false},
 		{name: "SubUint8x64", argLength: 2, commutative: false},
@ -1558,78 +1558,54 @@ func simdGenericOps() []opData {
 		{name: "XorUint64x2", argLength: 2, commutative: true},
 		{name: "XorUint64x4", argLength: 2, commutative: true},
 		{name: "XorUint64x8", argLength: 2, commutative: true},
-		{name: "CeilWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithRoundWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithRoundWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithRoundWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithRoundWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithRoundWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithRoundWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithRoundWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithRoundWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithRoundWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithRoundWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithRoundWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithRoundWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithTruncWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithTruncWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithTruncWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithTruncWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithTruncWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithTruncWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithTruncWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithTruncWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithTruncWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithTruncWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithTruncWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithTruncWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "FloorWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "FloorWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "FloorWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "FloorWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "FloorWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "FloorWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "FloorWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "FloorWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "FloorWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "FloorWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "FloorWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "FloorWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "CeilScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "CeilScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "CeilScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "CeilScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "CeilScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "CeilScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "CeilScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "CeilScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "CeilScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "CeilScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "CeilScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "CeilScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "CeilScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "CeilScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "CeilScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "CeilScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "CeilScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "CeilScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "CeilScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "CeilScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "CeilScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "CeilScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "CeilScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "CeilScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "FloorScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "FloorScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "FloorScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "FloorScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "FloorScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "FloorScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "FloorScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "FloorScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "FloorScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "FloorScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "FloorScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "FloorScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "FloorScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "FloorScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "FloorScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "FloorScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "FloorScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "FloorScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "FloorScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "FloorScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "FloorScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "FloorScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "FloorScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "FloorScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "GaloisFieldAffineTransformInverseMaskedUint8x16", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "GaloisFieldAffineTransformInverseMaskedUint8x32", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "GaloisFieldAffineTransformInverseMaskedUint8x64", argLength: 3, commutative: false, aux: "Int8"},
@ -1708,18 +1684,30 @@ func simdGenericOps() []opData {
 		{name: "RotateAllRightUint64x2", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "RotateAllRightUint64x4", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "RotateAllRightUint64x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "RoundWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "RoundWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "RoundWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "RoundWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "RoundWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "RoundWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "RoundWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "RoundWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "RoundWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "RoundWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "RoundWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "RoundWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "RoundScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "RoundScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "RoundScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "RoundScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "RoundScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "RoundScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "RoundScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "RoundScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "RoundScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "RoundScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "RoundScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "RoundScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "RoundScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "RoundScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "RoundScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "RoundScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "RoundScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "RoundScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "RoundScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "RoundScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "RoundScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "RoundScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "RoundScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "RoundScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "Set128Float32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "Set128Float64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "Set128Int8x32", argLength: 2, commutative: false, aux: "Int8"},
@ -1810,17 +1798,29 @@ func simdGenericOps() []opData {
 		{name: "ShiftAllRightConcatUint64x2", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllRightConcatUint64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllRightConcatUint64x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "TruncWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "TruncWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "TruncWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "TruncWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "TruncWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "TruncWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "TruncWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "TruncWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "TruncWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "TruncWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "TruncWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "TruncWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "TruncScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "TruncScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "TruncScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "TruncScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "TruncScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "TruncScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "TruncScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "TruncScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "TruncScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "TruncScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "TruncScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "TruncScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "TruncScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "TruncScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "TruncScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "TruncScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "TruncScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "TruncScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "TruncScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "TruncScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "TruncScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "TruncScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "TruncScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "TruncScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
 	}
 }
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@ -101,6 +101,44 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint64x2.AddMasked", opLen3(ssa.OpAddMaskedUint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint64x4.AddMasked", opLen3(ssa.OpAddMaskedUint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x8.AddMasked", opLen3(ssa.OpAddMaskedUint64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float32x8.AddPairs", opLen2(ssa.OpAddPairsFloat32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float64x4.AddPairs", opLen2(ssa.OpAddPairsFloat64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x8.AddPairs", opLen2(ssa.OpAddPairsInt16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.AddPairs", opLen2(ssa.OpAddPairsInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x4.AddPairs", opLen2(ssa.OpAddPairsInt32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x8.AddPairs", opLen2(ssa.OpAddPairsInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x8.AddPairs", opLen2(ssa.OpAddPairsUint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x16.AddPairs", opLen2(ssa.OpAddPairsUint16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint32x4.AddPairs", opLen2(ssa.OpAddPairsUint32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x8.AddPairs", opLen2(ssa.OpAddPairsUint32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x8.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int8x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int8x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int8x64.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int16x8.AddSaturated", opLen2(ssa.OpAddSaturatedInt16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint8x16.AddSaturated", opLen2(ssa.OpAddSaturatedUint8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint8x32.AddSaturated", opLen2(ssa.OpAddSaturatedUint8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint8x64.AddSaturated", opLen2(ssa.OpAddSaturatedUint8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint16x8.AddSaturated", opLen2(ssa.OpAddSaturatedUint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x16.AddSaturated", opLen2(ssa.OpAddSaturatedUint16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x32.AddSaturated", opLen2(ssa.OpAddSaturatedUint16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int8x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int8x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int8x64.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int16x8.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint8x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint8x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint8x64.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint16x8.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint16x32, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float32x4.AddSub", opLen2(ssa.OpAddSubFloat32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float32x8.AddSub", opLen2(ssa.OpAddSubFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x2.AddSub", opLen2(ssa.OpAddSubFloat64x2, types.TypeVec128), sys.AMD64)
@ -217,18 +255,30 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Float32x8.Ceil", opLen1(ssa.OpCeilFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x2.Ceil", opLen1(ssa.OpCeilFloat64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float64x4.Ceil", opLen1(ssa.OpCeilFloat64x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float32x4.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float32x4.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.Compress", opLen2(ssa.OpCompressFloat32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float32x8.Compress", opLen2(ssa.OpCompressFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float32x16.Compress", opLen2(ssa.OpCompressFloat32x16, types.TypeVec512), sys.AMD64)
@ -271,54 +321,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Float32x4.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float32x8.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float32x16.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Float32x4.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float32x4.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float32x4.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float32x4.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float32x4.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float32x4.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float32x4.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float32x4.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.Div", opLen2(ssa.OpDivFloat32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float32x8.Div", opLen2(ssa.OpDivFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float32x16.Div", opLen2(ssa.OpDivFloat32x16, types.TypeVec512), sys.AMD64)
@ -398,18 +400,30 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Float32x8.Floor", opLen1(ssa.OpFloorFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x2.Floor", opLen1(ssa.OpFloorFloat64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float64x4.Floor", opLen1(ssa.OpFloorFloat64x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float32x4.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float32x4.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Float32x4.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float32x8.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float32x16.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x16, types.TypeVec512), sys.AMD64)
@ -860,18 +874,15 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Float64x2.Mul", opLen2(ssa.OpMulFloat64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float64x4.Mul", opLen2(ssa.OpMulFloat64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x8.Mul", opLen2(ssa.OpMulFloat64x8, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Float32x4.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float32x8.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float32x16.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float32x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Float64x2.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float64x2, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float64x4.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float64x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float64x8.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float64x8, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Float32x4.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float32x8.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float32x16.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat32x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Float64x2.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat64x2, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float64x4.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat64x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float64x8.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int16x8.Mul", opLen2(ssa.OpMulInt16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.Mul", opLen2(ssa.OpMulInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x32.Mul", opLen2(ssa.OpMulInt16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int32x4.Mul", opLen2(ssa.OpMulInt32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x8.Mul", opLen2(ssa.OpMulInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x16.Mul", opLen2(ssa.OpMulInt32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int64x2.Mul", opLen2(ssa.OpMulInt64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int64x4.Mul", opLen2(ssa.OpMulInt64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int64x8.Mul", opLen2(ssa.OpMulInt64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int32x4.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int32x8.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int64x2.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt64x2, types.TypeVec128), sys.AMD64)
@ -900,30 +911,21 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint16x8.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint16x16.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint16x32.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x32, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int16x8.MulLow", opLen2(ssa.OpMulLowInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.MulLow", opLen2(ssa.OpMulLowInt16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int16x32.MulLow", opLen2(ssa.OpMulLowInt16x32, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int32x4.MulLow", opLen2(ssa.OpMulLowInt32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int32x8.MulLow", opLen2(ssa.OpMulLowInt32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int32x16.MulLow", opLen2(ssa.OpMulLowInt32x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int64x2.MulLow", opLen2(ssa.OpMulLowInt64x2, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int64x4.MulLow", opLen2(ssa.OpMulLowInt64x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int64x8.MulLow", opLen2(ssa.OpMulLowInt64x8, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int16x8.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int16x32.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt16x32, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int32x4.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int32x8.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int32x16.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt32x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int64x2.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt64x2, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int64x4.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt64x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int64x8.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float32x4.MulMasked", opLen3(ssa.OpMulMaskedFloat32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float32x8.MulMasked", opLen3(ssa.OpMulMaskedFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float32x16.MulMasked", opLen3(ssa.OpMulMaskedFloat32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float64x2.MulMasked", opLen3(ssa.OpMulMaskedFloat64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float64x4.MulMasked", opLen3(ssa.OpMulMaskedFloat64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x8.MulMasked", opLen3(ssa.OpMulMaskedFloat64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int16x8.MulMasked", opLen3(ssa.OpMulMaskedInt16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.MulMasked", opLen3(ssa.OpMulMaskedInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x32.MulMasked", opLen3(ssa.OpMulMaskedInt16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int32x4.MulMasked", opLen3(ssa.OpMulMaskedInt32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x8.MulMasked", opLen3(ssa.OpMulMaskedInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x16.MulMasked", opLen3(ssa.OpMulMaskedInt32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int64x2.MulMasked", opLen3(ssa.OpMulMaskedInt64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int64x4.MulMasked", opLen3(ssa.OpMulMaskedInt64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int64x8.MulMasked", opLen3(ssa.OpMulMaskedInt64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float32x4.NotEqual", opLen2(ssa.OpNotEqualFloat32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float32x8.NotEqual", opLen2(ssa.OpNotEqualFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float32x16.NotEqual", opLen2(ssa.OpNotEqualFloat32x16, types.TypeVec512), sys.AMD64)
@ -1026,30 +1028,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Int16x8.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x8, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int16x16.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x16, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int16x32.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x32, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Float32x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float32x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float64x2.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat64x2, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float64x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat64x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int16x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int32x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int32x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint16x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint16x16.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint32x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint32x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float64x2.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat64x2, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Float64x4.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat64x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int16x8.PairwiseSub", opLen2(ssa.OpPairwiseSubInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.PairwiseSub", opLen2(ssa.OpPairwiseSubInt16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubInt32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubInt32x8, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint16x8.PairwiseSub", opLen2(ssa.OpPairwiseSubUint16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint16x16.PairwiseSub", opLen2(ssa.OpPairwiseSubUint16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x4, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
@ -1306,76 +1284,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Float32x8.Round", opLen1(ssa.OpRoundFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x2.Round", opLen1(ssa.OpRoundFloat64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float64x4.Round", opLen1(ssa.OpRoundFloat64x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float32x4.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float32x4.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Int8x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int8x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt8x32, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int8x64.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt8x64, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int16x8.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int16x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt16x32, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint8x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint8x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint8x32, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint8x64.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint8x64, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint16x8.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint16x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint16x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float32x4.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Int32x4.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int32x8.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int32x16.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x16, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Int32x4.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int32x8.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int32x16.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x16, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int8x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int8x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x32, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int8x64.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x64, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int16x8.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int16x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt16x32, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint8x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint8x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint8x32, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint8x64.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint8x64, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint16x8.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint16x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint16x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x32, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int16x8.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int16x8.SaturatedPairwiseSub", opLen2(ssa.OpSaturatedPairwiseSubInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.SaturatedPairwiseSub", opLen2(ssa.OpSaturatedPairwiseSubInt16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int8x16.SaturatedSub", opLen2(ssa.OpSaturatedSubInt8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int8x32.SaturatedSub", opLen2(ssa.OpSaturatedSubInt8x32, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int8x64.SaturatedSub", opLen2(ssa.OpSaturatedSubInt8x64, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int16x8.SaturatedSub", opLen2(ssa.OpSaturatedSubInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.SaturatedSub", opLen2(ssa.OpSaturatedSubInt16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int16x32.SaturatedSub", opLen2(ssa.OpSaturatedSubInt16x32, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint8x16.SaturatedSub", opLen2(ssa.OpSaturatedSubUint8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint8x32.SaturatedSub", opLen2(ssa.OpSaturatedSubUint8x32, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint8x64.SaturatedSub", opLen2(ssa.OpSaturatedSubUint8x64, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint16x8.SaturatedSub", opLen2(ssa.OpSaturatedSubUint16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint16x16.SaturatedSub", opLen2(ssa.OpSaturatedSubUint16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint16x32.SaturatedSub", opLen2(ssa.OpSaturatedSubUint16x32, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int8x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int8x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt8x32, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int8x64.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt8x64, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Int16x8.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Int16x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Int16x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt16x32, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint8x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint8x16, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint8x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint8x32, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint8x64.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint8x64, types.TypeVec512), sys.AMD64)
-	addF(simdPackage, "Uint16x8.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint16x8, types.TypeVec128), sys.AMD64)
-	addF(simdPackage, "Uint16x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint16x16, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Uint16x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint16x32, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Uint8x16.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint8x32.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint8x64.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x64, types.TypeVec512), sys.AMD64)
@@ -1388,6 +1326,18 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Int8x16.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x64.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float32x4.Scale", opLen2(ssa.OpScaleFloat32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float32x8.Scale", opLen2(ssa.OpScaleFloat32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float32x16.Scale", opLen2(ssa.OpScaleFloat32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float64x2.Scale", opLen2(ssa.OpScaleFloat64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float64x4.Scale", opLen2(ssa.OpScaleFloat64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float64x8.Scale", opLen2(ssa.OpScaleFloat64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float32x4.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float32x8.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float32x16.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float64x2.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float64x4.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float64x8.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x8, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float32x8.Set128", opLen2Imm8(ssa.OpSet128Float32x8, types.TypeVec256, 0), sys.AMD64)
 	addF(simdPackage, "Float64x4.Set128", opLen2Imm8(ssa.OpSet128Float64x4, types.TypeVec256, 0), sys.AMD64)
 	addF(simdPackage, "Int8x32.Set128", opLen2Imm8(ssa.OpSet128Int8x32, types.TypeVec256, 0), sys.AMD64)
@@ -1772,22 +1722,72 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint64x2.SubMasked", opLen3(ssa.OpSubMaskedUint64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint64x4.SubMasked", opLen3(ssa.OpSubMaskedUint64x4, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint64x8.SubMasked", opLen3(ssa.OpSubMaskedUint64x8, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float32x4.SubPairs", opLen2(ssa.OpSubPairsFloat32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float32x8.SubPairs", opLen2(ssa.OpSubPairsFloat32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Float64x2.SubPairs", opLen2(ssa.OpSubPairsFloat64x2, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Float64x4.SubPairs", opLen2(ssa.OpSubPairsFloat64x4, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x8.SubPairs", opLen2(ssa.OpSubPairsInt16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.SubPairs", opLen2(ssa.OpSubPairsInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int32x4.SubPairs", opLen2(ssa.OpSubPairsInt32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x8.SubPairs", opLen2(ssa.OpSubPairsInt32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x8.SubPairs", opLen2(ssa.OpSubPairsUint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x16.SubPairs", opLen2(ssa.OpSubPairsUint16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint32x4.SubPairs", opLen2(ssa.OpSubPairsUint32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x8.SubPairs", opLen2(ssa.OpSubPairsUint32x8, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x8.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int8x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int8x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int8x64.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int16x8.SubSaturated", opLen2(ssa.OpSubSaturatedInt16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint8x16.SubSaturated", opLen2(ssa.OpSubSaturatedUint8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint8x32.SubSaturated", opLen2(ssa.OpSubSaturatedUint8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint8x64.SubSaturated", opLen2(ssa.OpSubSaturatedUint8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint16x8.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x16.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x32.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int8x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int8x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int8x64.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Int16x8.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int16x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Int16x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt16x32, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint8x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint8x16, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint8x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint8x32, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint8x64.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Uint16x8.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint16x8, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint16x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint16x16, types.TypeVec256), sys.AMD64)
+	addF(simdPackage, "Uint16x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint16x32, types.TypeVec512), sys.AMD64)
 	addF(simdPackage, "Float32x4.Trunc", opLen1(ssa.OpTruncFloat32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float32x8.Trunc", opLen1(ssa.OpTruncFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float64x2.Trunc", opLen1(ssa.OpTruncFloat64x2, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float64x4.Trunc", opLen1(ssa.OpTruncFloat64x4, types.TypeVec256), sys.AMD64)
-	addF(simdPackage, "Float32x4.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float32x4.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float32x8.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float32x16.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
-	addF(simdPackage, "Float64x2.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
-	addF(simdPackage, "Float64x4.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
-	addF(simdPackage, "Float64x8.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float32x4.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float32x8.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float32x16.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Float64x2.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64)
+	addF(simdPackage, "Float64x4.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64)
+	addF(simdPackage, "Float64x8.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64)
 	addF(simdPackage, "Int8x16.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Int8x32.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Int8x64.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
--- a/src/simd/binary_test.go
+++ b/src/simd/binary_test.go
@@ -309,42 +309,42 @@ func TestMul(t *testing.T) {
 	testFloat64x2Binary(t, simd.Float64x2.Mul, mulSlice[float64])
 	testFloat64x4Binary(t, simd.Float64x4.Mul, mulSlice[float64])

-	testInt16x16Binary(t, simd.Int16x16.MulLow, mulSlice[int16])
-	testInt16x8Binary(t, simd.Int16x8.MulLow, mulSlice[int16])
-	testInt32x4Binary(t, simd.Int32x4.MulLow, mulSlice[int32])
-	testInt32x8Binary(t, simd.Int32x8.MulLow, mulSlice[int32])
+	testInt16x16Binary(t, simd.Int16x16.Mul, mulSlice[int16])
+	testInt16x8Binary(t, simd.Int16x8.Mul, mulSlice[int16])
+	testInt32x4Binary(t, simd.Int32x4.Mul, mulSlice[int32])
+	testInt32x8Binary(t, simd.Int32x8.Mul, mulSlice[int32])

-	// testInt8x16Binary(t, simd.Int8x16.MulLow, mulSlice[int8]) // nope
-	// testInt8x32Binary(t, simd.Int8x32.MulLow, mulSlice[int8])
+	// testInt8x16Binary(t, simd.Int8x16.Mul, mulSlice[int8]) // nope
+	// testInt8x32Binary(t, simd.Int8x32.Mul, mulSlice[int8])

-	// TODO we should be able to do these, there's no difference between signed/unsigned mulLow
-	// testUint16x16Binary(t, simd.Uint16x16.MulLow, mulSlice[uint16])
-	// testUint16x8Binary(t, simd.Uint16x8.MulLow, mulSlice[uint16])
-	// testUint32x4Binary(t, simd.Uint32x4.MulLow, mulSlice[uint32])
-	// testUint32x8Binary(t, simd.Uint32x8.MulLow, mulSlice[uint32])
-	// testUint64x2Binary(t, simd.Uint64x2.MulLow, mulSlice[uint64])
-	// testUint64x4Binary(t, simd.Uint64x4.MulLow, mulSlice[uint64])
+	// TODO we should be able to do these, there's no difference between signed/unsigned Mul
+	// testUint16x16Binary(t, simd.Uint16x16.Mul, mulSlice[uint16])
+	// testUint16x8Binary(t, simd.Uint16x8.Mul, mulSlice[uint16])
+	// testUint32x4Binary(t, simd.Uint32x4.Mul, mulSlice[uint32])
+	// testUint32x8Binary(t, simd.Uint32x8.Mul, mulSlice[uint32])
+	// testUint64x2Binary(t, simd.Uint64x2.Mul, mulSlice[uint64])
+	// testUint64x4Binary(t, simd.Uint64x4.Mul, mulSlice[uint64])

-	// testUint8x16Binary(t, simd.Uint8x16.MulLow, mulSlice[uint8]) // nope
-	// testUint8x32Binary(t, simd.Uint8x32.MulLow, mulSlice[uint8])
+	// testUint8x16Binary(t, simd.Uint8x16.Mul, mulSlice[uint8]) // nope
+	// testUint8x32Binary(t, simd.Uint8x32.Mul, mulSlice[uint8])

 	if simd.HasAVX512() {
-		testInt64x2Binary(t, simd.Int64x2.MulLow, mulSlice[int64]) // avx512 only
-		testInt64x4Binary(t, simd.Int64x4.MulLow, mulSlice[int64])
+		testInt64x2Binary(t, simd.Int64x2.Mul, mulSlice[int64]) // avx512 only
+		testInt64x4Binary(t, simd.Int64x4.Mul, mulSlice[int64])

 		testFloat32x16Binary(t, simd.Float32x16.Mul, mulSlice[float32])
 		testFloat64x8Binary(t, simd.Float64x8.Mul, mulSlice[float64])

-		// testInt8x64Binary(t, simd.Int8x64.MulLow, mulSlice[int8]) // nope
-		testInt16x32Binary(t, simd.Int16x32.MulLow, mulSlice[int16])
-		testInt32x16Binary(t, simd.Int32x16.MulLow, mulSlice[int32])
-		testInt64x8Binary(t, simd.Int64x8.MulLow, mulSlice[int64])
-		// testUint8x64Binary(t, simd.Uint8x64.MulLow, mulSlice[uint8]) // nope
+		// testInt8x64Binary(t, simd.Int8x64.Mul, mulSlice[int8]) // nope
+		testInt16x32Binary(t, simd.Int16x32.Mul, mulSlice[int16])
+		testInt32x16Binary(t, simd.Int32x16.Mul, mulSlice[int32])
+		testInt64x8Binary(t, simd.Int64x8.Mul, mulSlice[int64])
+		// testUint8x64Binary(t, simd.Uint8x64.Mul, mulSlice[uint8]) // nope

 		// TODO signed should do the job
-		// testUint16x32Binary(t, simd.Uint16x32.MulLow, mulSlice[uint16])
-		// testUint32x16Binary(t, simd.Uint32x16.MulLow, mulSlice[uint32])
-		// testUint64x8Binary(t, simd.Uint64x8.MulLow, mulSlice[uint64])
+		// testUint16x32Binary(t, simd.Uint16x32.Mul, mulSlice[uint16])
+		// testUint32x16Binary(t, simd.Uint32x16.Mul, mulSlice[uint32])
+		// testUint64x8Binary(t, simd.Uint64x8.Mul, mulSlice[uint64])
 	}
 }

--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
--- a/src/simd/unary_test.go
+++ b/src/simd/unary_test.go
@@ -89,20 +89,20 @@ func TestToInt32(t *testing.T) {
 	testFloat32x8UnaryToInt32(t, simd.Float32x8.ConvertToInt32, toInt32Slice[float32])
 }

-func TestDiffWithCeilWithPrecision(t *testing.T) {
+func TestCeilScaledResidue(t *testing.T) {
 	if !simd.HasAVX512() {
 		t.Skip("Needs AVX512")
 	}
 	testFloat64x8UnaryFlaky(t,
-		func(x simd.Float64x8) simd.Float64x8 { return x.DiffWithCeilWithPrecision(0) },
+		func(x simd.Float64x8) simd.Float64x8 { return x.CeilScaledResidue(0) },
 		map1(ceilResidueForPrecision[float64](0)),
 		0.001)
 	testFloat64x8UnaryFlaky(t,
-		func(x simd.Float64x8) simd.Float64x8 { return x.DiffWithCeilWithPrecision(1) },
+		func(x simd.Float64x8) simd.Float64x8 { return x.CeilScaledResidue(1) },
 		map1(ceilResidueForPrecision[float64](1)),
 		0.001)
 	testFloat64x8Unary(t,
-		func(x simd.Float64x8) simd.Float64x8 { return x.Sub(x.CeilWithPrecision(0)) },
+		func(x simd.Float64x8) simd.Float64x8 { return x.Sub(x.CeilScaled(0)) },
 		map1[float64](func(x float64) float64 { return x - math.Ceil(x) }))
 }