diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index 15ffbf66fa7..76ef42576d3 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -80,6 +80,22 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPADDQ128,
 		ssa.OpAMD64VPADDQ256,
 		ssa.OpAMD64VPADDQ512,
+		ssa.OpAMD64VHADDPS128,
+		ssa.OpAMD64VHADDPS256,
+		ssa.OpAMD64VHADDPD128,
+		ssa.OpAMD64VHADDPD256,
+		ssa.OpAMD64VPHADDW128,
+		ssa.OpAMD64VPHADDW256,
+		ssa.OpAMD64VPHADDD128,
+		ssa.OpAMD64VPHADDD256,
+		ssa.OpAMD64VPHADDSW128,
+		ssa.OpAMD64VPHADDSW256,
+		ssa.OpAMD64VPADDSB128,
+		ssa.OpAMD64VPADDSB256,
+		ssa.OpAMD64VPADDSB512,
+		ssa.OpAMD64VPADDSW128,
+		ssa.OpAMD64VPADDSW256,
+		ssa.OpAMD64VPADDSW512,
 		ssa.OpAMD64VADDSUBPS128,
 		ssa.OpAMD64VADDSUBPS256,
 		ssa.OpAMD64VADDSUBPD128,
@@ -189,12 +205,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VMULPD128,
 		ssa.OpAMD64VMULPD256,
 		ssa.OpAMD64VMULPD512,
-		ssa.OpAMD64VSCALEFPS128,
-		ssa.OpAMD64VSCALEFPS256,
-		ssa.OpAMD64VSCALEFPS512,
-		ssa.OpAMD64VSCALEFPD128,
-		ssa.OpAMD64VSCALEFPD256,
-		ssa.OpAMD64VSCALEFPD512,
+		ssa.OpAMD64VPMULLW128,
+		ssa.OpAMD64VPMULLW256,
+		ssa.OpAMD64VPMULLW512,
+		ssa.OpAMD64VPMULLD128,
+		ssa.OpAMD64VPMULLD256,
+		ssa.OpAMD64VPMULLD512,
+		ssa.OpAMD64VPMULLQ128,
+		ssa.OpAMD64VPMULLQ256,
+		ssa.OpAMD64VPMULLQ512,
 		ssa.OpAMD64VPMULDQ128,
 		ssa.OpAMD64VPMULDQ256,
 		ssa.OpAMD64VPMULDQ512,
@@ -207,15 +226,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULHUW128,
 		ssa.OpAMD64VPMULHUW256,
 		ssa.OpAMD64VPMULHUW512,
-		ssa.OpAMD64VPMULLW128,
-		ssa.OpAMD64VPMULLW256,
-		ssa.OpAMD64VPMULLW512,
-		ssa.OpAMD64VPMULLD128,
-		ssa.OpAMD64VPMULLD256,
-		ssa.OpAMD64VPMULLD512,
-		ssa.OpAMD64VPMULLQ128,
-		ssa.OpAMD64VPMULLQ256,
-		ssa.OpAMD64VPMULLQ512,
 		ssa.OpAMD64VPOR128,
 		ssa.OpAMD64VPOR256,
 		ssa.OpAMD64VPORD512,
@@ -223,22 +233,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMADDWD128,
 		ssa.OpAMD64VPMADDWD256,
 		ssa.OpAMD64VPMADDWD512,
-		ssa.OpAMD64VHADDPS128,
-		ssa.OpAMD64VHADDPS256,
-		ssa.OpAMD64VHADDPD128,
-		ssa.OpAMD64VHADDPD256,
-		ssa.OpAMD64VPHADDW128,
-		ssa.OpAMD64VPHADDW256,
-		ssa.OpAMD64VPHADDD128,
-		ssa.OpAMD64VPHADDD256,
-		ssa.OpAMD64VHSUBPS128,
-		ssa.OpAMD64VHSUBPS256,
-		ssa.OpAMD64VHSUBPD128,
-		ssa.OpAMD64VHSUBPD256,
-		ssa.OpAMD64VPHSUBW128,
-		ssa.OpAMD64VPHSUBW256,
-		ssa.OpAMD64VPHSUBD128,
-		ssa.OpAMD64VPHSUBD256,
 		ssa.OpAMD64VPERMB128,
 		ssa.OpAMD64VPERMB256,
 		ssa.OpAMD64VPERMB512,
@@ -265,25 +259,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPRORVQ128,
 		ssa.OpAMD64VPRORVQ256,
 		ssa.OpAMD64VPRORVQ512,
-		ssa.OpAMD64VPADDSB128,
-		ssa.OpAMD64VPADDSB256,
-		ssa.OpAMD64VPADDSB512,
-		ssa.OpAMD64VPADDSW128,
-		ssa.OpAMD64VPADDSW256,
-		ssa.OpAMD64VPADDSW512,
-		ssa.OpAMD64VPHADDSW128,
-		ssa.OpAMD64VPHADDSW256,
-		ssa.OpAMD64VPHSUBSW128,
-		ssa.OpAMD64VPHSUBSW256,
-		ssa.OpAMD64VPSUBSB128,
-		ssa.OpAMD64VPSUBSB256,
-		ssa.OpAMD64VPSUBSB512,
-		ssa.OpAMD64VPSUBSW128,
-		ssa.OpAMD64VPSUBSW256,
-		ssa.OpAMD64VPSUBSW512,
 		ssa.OpAMD64VPMADDUBSW128,
 		ssa.OpAMD64VPMADDUBSW256,
 		ssa.OpAMD64VPMADDUBSW512,
+		ssa.OpAMD64VSCALEFPS128,
+		ssa.OpAMD64VSCALEFPS256,
+		ssa.OpAMD64VSCALEFPS512,
+		ssa.OpAMD64VSCALEFPD128,
+		ssa.OpAMD64VSCALEFPD256,
+		ssa.OpAMD64VSCALEFPD512,
 		ssa.OpAMD64VPSLLVW128,
 		ssa.OpAMD64VPSLLVW256,
 		ssa.OpAMD64VPSLLVW512,
@@ -335,6 +319,22 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPSUBQ128,
 		ssa.OpAMD64VPSUBQ256,
 		ssa.OpAMD64VPSUBQ512,
+		ssa.OpAMD64VHSUBPS128,
+		ssa.OpAMD64VHSUBPS256,
+		ssa.OpAMD64VHSUBPD128,
+		ssa.OpAMD64VHSUBPD256,
+		ssa.OpAMD64VPHSUBW128,
+		ssa.OpAMD64VPHSUBW256,
+		ssa.OpAMD64VPHSUBD128,
+		ssa.OpAMD64VPHSUBD256,
+		ssa.OpAMD64VPHSUBSW128,
+		ssa.OpAMD64VPHSUBSW256,
+		ssa.OpAMD64VPSUBSB128,
+		ssa.OpAMD64VPSUBSB256,
+		ssa.OpAMD64VPSUBSB512,
+		ssa.OpAMD64VPSUBSW128,
+		ssa.OpAMD64VPSUBSW256,
+		ssa.OpAMD64VPSUBSW512,
 		ssa.OpAMD64VPXOR128,
 		ssa.OpAMD64VPXOR256,
 		ssa.OpAMD64VPXORD512,
@@ -369,6 +369,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPADDQMasked128,
 		ssa.OpAMD64VPADDQMasked256,
 		ssa.OpAMD64VPADDQMasked512,
+		ssa.OpAMD64VPADDSBMasked128,
+		ssa.OpAMD64VPADDSBMasked256,
+		ssa.OpAMD64VPADDSBMasked512,
+		ssa.OpAMD64VPADDSWMasked128,
+		ssa.OpAMD64VPADDSWMasked256,
+		ssa.OpAMD64VPADDSWMasked512,
 		ssa.OpAMD64VPANDDMasked128,
 		ssa.OpAMD64VPANDDMasked256,
 		ssa.OpAMD64VPANDDMasked512,
@@ -456,12 +462,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMINUQMasked128,
 		ssa.OpAMD64VPMINUQMasked256,
 		ssa.OpAMD64VPMINUQMasked512,
-		ssa.OpAMD64VSCALEFPSMasked128,
-		ssa.OpAMD64VSCALEFPSMasked256,
-		ssa.OpAMD64VSCALEFPSMasked512,
-		ssa.OpAMD64VSCALEFPDMasked128,
-		ssa.OpAMD64VSCALEFPDMasked256,
-		ssa.OpAMD64VSCALEFPDMasked512,
 		ssa.OpAMD64VPMULDQMasked128,
 		ssa.OpAMD64VPMULDQMasked256,
 		ssa.OpAMD64VPMULDQMasked512,
@@ -474,6 +474,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULHUWMasked128,
 		ssa.OpAMD64VPMULHUWMasked256,
 		ssa.OpAMD64VPMULHUWMasked512,
+		ssa.OpAMD64VMULPSMasked128,
+		ssa.OpAMD64VMULPSMasked256,
+		ssa.OpAMD64VMULPSMasked512,
+		ssa.OpAMD64VMULPDMasked128,
+		ssa.OpAMD64VMULPDMasked256,
+		ssa.OpAMD64VMULPDMasked512,
 		ssa.OpAMD64VPMULLWMasked128,
 		ssa.OpAMD64VPMULLWMasked256,
 		ssa.OpAMD64VPMULLWMasked512,
@@ -483,12 +489,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULLQMasked128,
 		ssa.OpAMD64VPMULLQMasked256,
 		ssa.OpAMD64VPMULLQMasked512,
-		ssa.OpAMD64VMULPSMasked128,
-		ssa.OpAMD64VMULPSMasked256,
-		ssa.OpAMD64VMULPSMasked512,
-		ssa.OpAMD64VMULPDMasked128,
-		ssa.OpAMD64VMULPDMasked256,
-		ssa.OpAMD64VMULPDMasked512,
 		ssa.OpAMD64VPORDMasked128,
 		ssa.OpAMD64VPORDMasked256,
 		ssa.OpAMD64VPORDMasked512,
@@ -524,21 +524,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPRORVQMasked128,
 		ssa.OpAMD64VPRORVQMasked256,
 		ssa.OpAMD64VPRORVQMasked512,
-		ssa.OpAMD64VPADDSBMasked128,
-		ssa.OpAMD64VPADDSBMasked256,
-		ssa.OpAMD64VPADDSBMasked512,
-		ssa.OpAMD64VPADDSWMasked128,
-		ssa.OpAMD64VPADDSWMasked256,
-		ssa.OpAMD64VPADDSWMasked512,
-		ssa.OpAMD64VPSUBSBMasked128,
-		ssa.OpAMD64VPSUBSBMasked256,
-		ssa.OpAMD64VPSUBSBMasked512,
-		ssa.OpAMD64VPSUBSWMasked128,
-		ssa.OpAMD64VPSUBSWMasked256,
-		ssa.OpAMD64VPSUBSWMasked512,
 		ssa.OpAMD64VPMADDUBSWMasked128,
 		ssa.OpAMD64VPMADDUBSWMasked256,
 		ssa.OpAMD64VPMADDUBSWMasked512,
+		ssa.OpAMD64VSCALEFPSMasked128,
+		ssa.OpAMD64VSCALEFPSMasked256,
+		ssa.OpAMD64VSCALEFPSMasked512,
+		ssa.OpAMD64VSCALEFPDMasked128,
+		ssa.OpAMD64VSCALEFPDMasked256,
+		ssa.OpAMD64VSCALEFPDMasked512,
 		ssa.OpAMD64VPSLLVWMasked128,
 		ssa.OpAMD64VPSLLVWMasked256,
 		ssa.OpAMD64VPSLLVWMasked512,
@@ -584,6 +578,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPSUBQMasked128,
 		ssa.OpAMD64VPSUBQMasked256,
 		ssa.OpAMD64VPSUBQMasked512,
+		ssa.OpAMD64VPSUBSBMasked128,
+		ssa.OpAMD64VPSUBSBMasked256,
+		ssa.OpAMD64VPSUBSBMasked512,
+		ssa.OpAMD64VPSUBSWMasked128,
+		ssa.OpAMD64VPSUBSWMasked256,
+		ssa.OpAMD64VPSUBSWMasked512,
 		ssa.OpAMD64VPXORDMasked128,
 		ssa.OpAMD64VPXORDMasked256,
 		ssa.OpAMD64VPXORDMasked512,
@@ -1085,6 +1085,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPADDQMasked128,
 		ssa.OpAMD64VPADDQMasked256,
 		ssa.OpAMD64VPADDQMasked512,
+		ssa.OpAMD64VPADDSBMasked128,
+		ssa.OpAMD64VPADDSBMasked256,
+		ssa.OpAMD64VPADDSBMasked512,
+		ssa.OpAMD64VPADDSWMasked128,
+		ssa.OpAMD64VPADDSWMasked256,
+		ssa.OpAMD64VPADDSWMasked512,
 		ssa.OpAMD64VPANDDMasked128,
 		ssa.OpAMD64VPANDDMasked256,
 		ssa.OpAMD64VPANDDMasked512,
@@ -1121,6 +1127,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VRNDSCALEPDMasked128,
 		ssa.OpAMD64VRNDSCALEPDMasked256,
 		ssa.OpAMD64VRNDSCALEPDMasked512,
+		ssa.OpAMD64VREDUCEPSMasked128,
+		ssa.OpAMD64VREDUCEPSMasked256,
+		ssa.OpAMD64VREDUCEPSMasked512,
+		ssa.OpAMD64VREDUCEPDMasked128,
+		ssa.OpAMD64VREDUCEPDMasked256,
+		ssa.OpAMD64VREDUCEPDMasked512,
 		ssa.OpAMD64VCOMPRESSPSMasked128,
 		ssa.OpAMD64VCOMPRESSPSMasked256,
 		ssa.OpAMD64VCOMPRESSPSMasked512,
@@ -1145,12 +1157,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VCVTPS2UDQMasked128,
 		ssa.OpAMD64VCVTPS2UDQMasked256,
 		ssa.OpAMD64VCVTPS2UDQMasked512,
-		ssa.OpAMD64VREDUCEPSMasked128,
-		ssa.OpAMD64VREDUCEPSMasked256,
-		ssa.OpAMD64VREDUCEPSMasked512,
-		ssa.OpAMD64VREDUCEPDMasked128,
-		ssa.OpAMD64VREDUCEPDMasked256,
-		ssa.OpAMD64VREDUCEPDMasked512,
 		ssa.OpAMD64VDIVPSMasked128,
 		ssa.OpAMD64VDIVPSMasked256,
 		ssa.OpAMD64VDIVPSMasked512,
@@ -1244,12 +1250,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMINUQMasked128,
 		ssa.OpAMD64VPMINUQMasked256,
 		ssa.OpAMD64VPMINUQMasked512,
-		ssa.OpAMD64VSCALEFPSMasked128,
-		ssa.OpAMD64VSCALEFPSMasked256,
-		ssa.OpAMD64VSCALEFPSMasked512,
-		ssa.OpAMD64VSCALEFPDMasked128,
-		ssa.OpAMD64VSCALEFPDMasked256,
-		ssa.OpAMD64VSCALEFPDMasked512,
 		ssa.OpAMD64VPMULDQMasked128,
 		ssa.OpAMD64VPMULDQMasked256,
 		ssa.OpAMD64VPMULDQMasked512,
@@ -1262,6 +1262,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULHUWMasked128,
 		ssa.OpAMD64VPMULHUWMasked256,
 		ssa.OpAMD64VPMULHUWMasked512,
+		ssa.OpAMD64VMULPSMasked128,
+		ssa.OpAMD64VMULPSMasked256,
+		ssa.OpAMD64VMULPSMasked512,
+		ssa.OpAMD64VMULPDMasked128,
+		ssa.OpAMD64VMULPDMasked256,
+		ssa.OpAMD64VMULPDMasked512,
 		ssa.OpAMD64VPMULLWMasked128,
 		ssa.OpAMD64VPMULLWMasked256,
 		ssa.OpAMD64VPMULLWMasked512,
@@ -1271,12 +1277,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPMULLQMasked128,
 		ssa.OpAMD64VPMULLQMasked256,
 		ssa.OpAMD64VPMULLQMasked512,
-		ssa.OpAMD64VMULPSMasked128,
-		ssa.OpAMD64VMULPSMasked256,
-		ssa.OpAMD64VMULPSMasked512,
-		ssa.OpAMD64VMULPDMasked128,
-		ssa.OpAMD64VMULPDMasked256,
-		ssa.OpAMD64VMULPDMasked512,
 		ssa.OpAMD64VPORDMasked128,
 		ssa.OpAMD64VPORDMasked256,
 		ssa.OpAMD64VPORDMasked512,
@@ -1357,24 +1357,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPDPWSSDSMasked128,
 		ssa.OpAMD64VPDPWSSDSMasked256,
 		ssa.OpAMD64VPDPWSSDSMasked512,
-		ssa.OpAMD64VPADDSBMasked128,
-		ssa.OpAMD64VPADDSBMasked256,
-		ssa.OpAMD64VPADDSBMasked512,
-		ssa.OpAMD64VPADDSWMasked128,
-		ssa.OpAMD64VPADDSWMasked256,
-		ssa.OpAMD64VPADDSWMasked512,
-		ssa.OpAMD64VPSUBSBMasked128,
-		ssa.OpAMD64VPSUBSBMasked256,
-		ssa.OpAMD64VPSUBSBMasked512,
-		ssa.OpAMD64VPSUBSWMasked128,
-		ssa.OpAMD64VPSUBSWMasked256,
-		ssa.OpAMD64VPSUBSWMasked512,
 		ssa.OpAMD64VPMADDUBSWMasked128,
 		ssa.OpAMD64VPMADDUBSWMasked256,
 		ssa.OpAMD64VPMADDUBSWMasked512,
 		ssa.OpAMD64VPDPBUSDSMasked128,
 		ssa.OpAMD64VPDPBUSDSMasked256,
 		ssa.OpAMD64VPDPBUSDSMasked512,
+		ssa.OpAMD64VSCALEFPSMasked128,
+		ssa.OpAMD64VSCALEFPSMasked256,
+		ssa.OpAMD64VSCALEFPSMasked512,
+		ssa.OpAMD64VSCALEFPDMasked128,
+		ssa.OpAMD64VSCALEFPDMasked256,
+		ssa.OpAMD64VSCALEFPDMasked512,
 		ssa.OpAMD64VPSHLDWMasked128,
 		ssa.OpAMD64VPSHLDWMasked256,
 		ssa.OpAMD64VPSHLDWMasked512,
@@ -1489,6 +1483,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPSUBQMasked128,
 		ssa.OpAMD64VPSUBQMasked256,
 		ssa.OpAMD64VPSUBQMasked512,
+		ssa.OpAMD64VPSUBSBMasked128,
+		ssa.OpAMD64VPSUBSBMasked256,
+		ssa.OpAMD64VPSUBSBMasked512,
+		ssa.OpAMD64VPSUBSWMasked128,
+		ssa.OpAMD64VPSUBSWMasked256,
+		ssa.OpAMD64VPSUBSWMasked512,
 		ssa.OpAMD64VPDPBUSDMasked128,
 		ssa.OpAMD64VPDPBUSDMasked256,
 		ssa.OpAMD64VPDPBUSDMasked512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 1d54cfcdbdd..060f220c7de 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -90,6 +90,44 @@
 (AddMaskedUint64x2 x y mask) => (VPADDQMasked128 x y (VPMOVVec64x2ToM mask))
 (AddMaskedUint64x4 x y mask) => (VPADDQMasked256 x y (VPMOVVec64x4ToM mask))
 (AddMaskedUint64x8 x y mask) => (VPADDQMasked512 x y (VPMOVVec64x8ToM mask))
+(AddPairsFloat32x4 ...) => (VHADDPS128 ...)
+(AddPairsFloat32x8 ...) => (VHADDPS256 ...)
+(AddPairsFloat64x2 ...) => (VHADDPD128 ...)
+(AddPairsFloat64x4 ...) => (VHADDPD256 ...)
+(AddPairsInt16x8 ...) => (VPHADDW128 ...)
+(AddPairsInt16x16 ...) => (VPHADDW256 ...)
+(AddPairsInt32x4 ...) => (VPHADDD128 ...)
+(AddPairsInt32x8 ...) => (VPHADDD256 ...)
+(AddPairsUint16x8 ...) => (VPHADDW128 ...)
+(AddPairsUint16x16 ...) => (VPHADDW256 ...)
+(AddPairsUint32x4 ...) => (VPHADDD128 ...)
+(AddPairsUint32x8 ...) => (VPHADDD256 ...)
+(AddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...)
+(AddPairsSaturatedInt16x16 ...) => (VPHADDSW256 ...)
+(AddSaturatedInt8x16 ...) => (VPADDSB128 ...)
+(AddSaturatedInt8x32 ...) => (VPADDSB256 ...)
+(AddSaturatedInt8x64 ...) => (VPADDSB512 ...)
+(AddSaturatedInt16x8 ...) => (VPADDSW128 ...)
+(AddSaturatedInt16x16 ...) => (VPADDSW256 ...)
+(AddSaturatedInt16x32 ...) => (VPADDSW512 ...)
+(AddSaturatedUint8x16 ...) => (VPADDSB128 ...)
+(AddSaturatedUint8x32 ...) => (VPADDSB256 ...)
+(AddSaturatedUint8x64 ...) => (VPADDSB512 ...)
+(AddSaturatedUint16x8 ...) => (VPADDSW128 ...)
+(AddSaturatedUint16x16 ...) => (VPADDSW256 ...)
+(AddSaturatedUint16x32 ...) => (VPADDSW512 ...)
+(AddSaturatedMaskedInt8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask))
+(AddSaturatedMaskedInt8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask))
+(AddSaturatedMaskedInt8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask))
+(AddSaturatedMaskedInt16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask))
+(AddSaturatedMaskedInt16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask))
+(AddSaturatedMaskedInt16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask))
+(AddSaturatedMaskedUint8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask))
+(AddSaturatedMaskedUint8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask))
+(AddSaturatedMaskedUint8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask))
+(AddSaturatedMaskedUint16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask))
+(AddSaturatedMaskedUint16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask))
+(AddSaturatedMaskedUint16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask))
 (AddSubFloat32x4 ...) => (VADDSUBPS128 ...)
 (AddSubFloat32x8 ...) => (VADDSUBPS256 ...)
 (AddSubFloat64x2 ...) => (VADDSUBPD128 ...)
@@ -206,18 +244,30 @@
 (CeilFloat32x8 x) => (VROUNDPS256 [2] x)
 (CeilFloat64x2 x) => (VROUNDPD128 [2] x)
 (CeilFloat64x4 x) => (VROUNDPD256 [2] x)
-(CeilWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+2] x)
-(CeilWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+2] x)
-(CeilWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+2] x)
-(CeilWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+2] x)
-(CeilWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+2] x)
-(CeilWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+2] x)
-(CeilWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask))
-(CeilWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask))
-(CeilWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask))
-(CeilWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask))
-(CeilWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask))
-(CeilWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask))
+(CeilScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+2] x)
+(CeilScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+2] x)
+(CeilScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+2] x)
+(CeilScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+2] x)
+(CeilScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+2] x)
+(CeilScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+2] x)
+(CeilScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask))
+(CeilScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask))
+(CeilScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask))
+(CeilScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask))
+(CeilScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask))
+(CeilScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask))
+(CeilScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+2] x)
+(CeilScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+2] x)
+(CeilScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+2] x)
+(CeilScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+2] x)
+(CeilScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+2] x)
+(CeilScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+2] x)
+(CeilScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask))
+(CeilScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask))
+(CeilScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask))
+(CeilScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask))
+(CeilScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask))
+(CeilScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask))
 (CompressFloat32x4 x mask) => (VCOMPRESSPSMasked128 x (VPMOVVec32x4ToM mask))
 (CompressFloat32x8 x mask) => (VCOMPRESSPSMasked256 x (VPMOVVec32x8ToM mask))
 (CompressFloat32x16 x mask) => (VCOMPRESSPSMasked512 x (VPMOVVec32x16ToM mask))
@@ -260,54 +310,6 @@
 (ConvertToUint32MaskedFloat32x4 x mask) => (VCVTPS2UDQMasked128 x (VPMOVVec32x4ToM mask))
 (ConvertToUint32MaskedFloat32x8 x mask) => (VCVTPS2UDQMasked256 x (VPMOVVec32x8ToM mask))
 (ConvertToUint32MaskedFloat32x16 x mask) => (VCVTPS2UDQMasked512 x (VPMOVVec32x16ToM mask))
-(DiffWithCeilWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+2] x)
-(DiffWithCeilWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+2] x)
-(DiffWithCeilWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+2] x)
-(DiffWithCeilWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+2] x)
-(DiffWithCeilWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+2] x)
-(DiffWithCeilWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+2] x)
-(DiffWithCeilWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask))
-(DiffWithCeilWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask))
-(DiffWithCeilWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask))
-(DiffWithCeilWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask))
-(DiffWithCeilWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask))
-(DiffWithCeilWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask))
-(DiffWithFloorWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+1] x)
-(DiffWithFloorWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+1] x)
-(DiffWithFloorWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+1] x)
-(DiffWithFloorWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+1] x)
-(DiffWithFloorWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+1] x)
-(DiffWithFloorWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+1] x)
-(DiffWithFloorWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask))
-(DiffWithFloorWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask))
-(DiffWithFloorWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask))
-(DiffWithFloorWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask))
-(DiffWithFloorWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask))
-(DiffWithFloorWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask))
-(DiffWithRoundWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+0] x)
-(DiffWithRoundWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+0] x)
-(DiffWithRoundWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+0] x)
-(DiffWithRoundWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+0] x)
-(DiffWithRoundWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+0] x)
-(DiffWithRoundWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+0] x)
-(DiffWithRoundWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask))
-(DiffWithRoundWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask))
-(DiffWithRoundWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask))
-(DiffWithRoundWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask))
-(DiffWithRoundWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask))
-(DiffWithRoundWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask))
-(DiffWithTruncWithPrecisionFloat32x4 [a] x) => (VREDUCEPS128 [a+3] x)
-(DiffWithTruncWithPrecisionFloat32x8 [a] x) => (VREDUCEPS256 [a+3] x)
-(DiffWithTruncWithPrecisionFloat32x16 [a] x) => (VREDUCEPS512 [a+3] x)
-(DiffWithTruncWithPrecisionFloat64x2 [a] x) => (VREDUCEPD128 [a+3] x)
-(DiffWithTruncWithPrecisionFloat64x4 [a] x) => (VREDUCEPD256 [a+3] x)
-(DiffWithTruncWithPrecisionFloat64x8 [a] x) => (VREDUCEPD512 [a+3] x)
-(DiffWithTruncWithPrecisionMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask))
-(DiffWithTruncWithPrecisionMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask))
-(DiffWithTruncWithPrecisionMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask))
-(DiffWithTruncWithPrecisionMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask))
-(DiffWithTruncWithPrecisionMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask))
-(DiffWithTruncWithPrecisionMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask))
 (DivFloat32x4 ...) => (VDIVPS128 ...)
 (DivFloat32x8 ...) => (VDIVPS256 ...)
 (DivFloat32x16 ...) => (VDIVPS512 ...)
@@ -387,18 +389,30 @@
 (FloorFloat32x8 x) => (VROUNDPS256 [1] x)
 (FloorFloat64x2 x) => (VROUNDPD128 [1] x)
 (FloorFloat64x4 x) => (VROUNDPD256 [1] x)
-(FloorWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+1] x)
-(FloorWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+1] x)
-(FloorWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+1] x)
-(FloorWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+1] x)
-(FloorWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+1] x)
-(FloorWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+1] x)
-(FloorWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask))
-(FloorWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask))
-(FloorWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask))
-(FloorWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask))
-(FloorWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask))
-(FloorWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask))
+(FloorScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+1] x)
+(FloorScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+1] x)
+(FloorScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+1] x)
+(FloorScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+1] x)
+(FloorScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+1] x)
+(FloorScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+1] x)
+(FloorScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask))
+(FloorScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask))
+(FloorScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask))
+(FloorScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask))
+(FloorScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask))
+(FloorScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask))
+(FloorScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+1] x)
+(FloorScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+1] x)
+(FloorScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+1] x)
+(FloorScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+1] x)
+(FloorScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+1] x)
+(FloorScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+1] x)
+(FloorScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask))
+(FloorScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask))
+(FloorScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask))
+(FloorScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask))
+(FloorScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask))
+(FloorScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask))
 (FusedMultiplyAddFloat32x4 ...) => (VFMADD213PS128 ...)
 (FusedMultiplyAddFloat32x8 ...) => (VFMADD213PS256 ...)
 (FusedMultiplyAddFloat32x16 ...) => (VFMADD213PS512 ...)
@@ -849,18 +863,15 @@
 (MulFloat64x2 ...) => (VMULPD128 ...)
 (MulFloat64x4 ...) => (VMULPD256 ...)
 (MulFloat64x8 ...) => (VMULPD512 ...)
-(MulByPowOf2Float32x4 ...) => (VSCALEFPS128 ...)
-(MulByPowOf2Float32x8 ...) => (VSCALEFPS256 ...)
-(MulByPowOf2Float32x16 ...) => (VSCALEFPS512 ...)
-(MulByPowOf2Float64x2 ...) => (VSCALEFPD128 ...)
-(MulByPowOf2Float64x4 ...) => (VSCALEFPD256 ...)
-(MulByPowOf2Float64x8 ...) => (VSCALEFPD512 ...)
-(MulByPowOf2MaskedFloat32x4 x y mask) => (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM mask))
-(MulByPowOf2MaskedFloat32x8 x y mask) => (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM mask))
-(MulByPowOf2MaskedFloat32x16 x y mask) => (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM mask))
-(MulByPowOf2MaskedFloat64x2 x y mask) => (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM mask))
-(MulByPowOf2MaskedFloat64x4 x y mask) => (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM mask))
-(MulByPowOf2MaskedFloat64x8 x y mask) => (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM mask))
+(MulInt16x8 ...) => (VPMULLW128 ...)
+(MulInt16x16 ...) => (VPMULLW256 ...)
+(MulInt16x32 ...) => (VPMULLW512 ...)
+(MulInt32x4 ...) => (VPMULLD128 ...)
+(MulInt32x8 ...) => (VPMULLD256 ...)
+(MulInt32x16 ...) => (VPMULLD512 ...)
+(MulInt64x2 ...) => (VPMULLQ128 ...)
+(MulInt64x4 ...) => (VPMULLQ256 ...)
+(MulInt64x8 ...) => (VPMULLQ512 ...)
 (MulEvenWidenInt32x4 ...) => (VPMULDQ128 ...)
 (MulEvenWidenInt32x8 ...) => (VPMULDQ256 ...)
 (MulEvenWidenInt64x2 ...) => (VPMULDQ128 ...)
@@ -889,30 +900,21 @@
 (MulHighMaskedUint16x8 x y mask) => (VPMULHUWMasked128 x y (VPMOVVec16x8ToM mask))
 (MulHighMaskedUint16x16 x y mask) => (VPMULHUWMasked256 x y (VPMOVVec16x16ToM mask))
 (MulHighMaskedUint16x32 x y mask) => (VPMULHUWMasked512 x y (VPMOVVec16x32ToM mask))
-(MulLowInt16x8 ...) => (VPMULLW128 ...)
-(MulLowInt16x16 ...) => (VPMULLW256 ...)
-(MulLowInt16x32 ...) => (VPMULLW512 ...)
-(MulLowInt32x4 ...) => (VPMULLD128 ...)
-(MulLowInt32x8 ...) => (VPMULLD256 ...)
-(MulLowInt32x16 ...) => (VPMULLD512 ...)
-(MulLowInt64x2 ...) => (VPMULLQ128 ...)
-(MulLowInt64x4 ...) => (VPMULLQ256 ...)
-(MulLowInt64x8 ...) => (VPMULLQ512 ...)
-(MulLowMaskedInt16x8 x y mask) => (VPMULLWMasked128 x y (VPMOVVec16x8ToM mask))
-(MulLowMaskedInt16x16 x y mask) => (VPMULLWMasked256 x y (VPMOVVec16x16ToM mask))
-(MulLowMaskedInt16x32 x y mask) => (VPMULLWMasked512 x y (VPMOVVec16x32ToM mask))
-(MulLowMaskedInt32x4 x y mask) => (VPMULLDMasked128 x y (VPMOVVec32x4ToM mask))
-(MulLowMaskedInt32x8 x y mask) => (VPMULLDMasked256 x y (VPMOVVec32x8ToM mask))
-(MulLowMaskedInt32x16 x y mask) => (VPMULLDMasked512 x y (VPMOVVec32x16ToM mask))
-(MulLowMaskedInt64x2 x y mask) => (VPMULLQMasked128 x y (VPMOVVec64x2ToM mask))
-(MulLowMaskedInt64x4 x y mask) => (VPMULLQMasked256 x y (VPMOVVec64x4ToM mask))
-(MulLowMaskedInt64x8 x y mask) => (VPMULLQMasked512 x y (VPMOVVec64x8ToM mask))
 (MulMaskedFloat32x4 x y mask) => (VMULPSMasked128 x y (VPMOVVec32x4ToM mask))
 (MulMaskedFloat32x8 x y mask) => (VMULPSMasked256 x y (VPMOVVec32x8ToM mask))
 (MulMaskedFloat32x16 x y mask) => (VMULPSMasked512 x y (VPMOVVec32x16ToM mask))
 (MulMaskedFloat64x2 x y mask) => (VMULPDMasked128 x y (VPMOVVec64x2ToM mask))
 (MulMaskedFloat64x4 x y mask) => (VMULPDMasked256 x y (VPMOVVec64x4ToM mask))
 (MulMaskedFloat64x8 x y mask) => (VMULPDMasked512 x y (VPMOVVec64x8ToM mask))
+(MulMaskedInt16x8 x y mask) => (VPMULLWMasked128 x y (VPMOVVec16x8ToM mask))
+(MulMaskedInt16x16 x y mask) => (VPMULLWMasked256 x y (VPMOVVec16x16ToM mask))
+(MulMaskedInt16x32 x y mask) => (VPMULLWMasked512 x y (VPMOVVec16x32ToM mask))
+(MulMaskedInt32x4 x y mask) => (VPMULLDMasked128 x y (VPMOVVec32x4ToM mask))
+(MulMaskedInt32x8 x y mask) => (VPMULLDMasked256 x y (VPMOVVec32x8ToM mask))
+(MulMaskedInt32x16 x y mask) => (VPMULLDMasked512 x y (VPMOVVec32x16ToM mask))
+(MulMaskedInt64x2 x y mask) => (VPMULLQMasked128 x y (VPMOVVec64x2ToM mask))
+(MulMaskedInt64x4 x y mask) => (VPMULLQMasked256 x y (VPMOVVec64x4ToM mask))
+(MulMaskedInt64x8 x y mask) => (VPMULLQMasked512 x y (VPMOVVec64x8ToM mask))
 (NotEqualFloat32x4 x y) => (VCMPPS128 [4] x y)
 (NotEqualFloat32x8 x y) => (VCMPPS256 [4] x y)
 (NotEqualFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [4] x y))
@@ -1015,30 +1017,6 @@
 (PairDotProdMaskedInt16x8 x y mask) => (VPMADDWDMasked128 x y (VPMOVVec16x8ToM mask))
 (PairDotProdMaskedInt16x16 x y mask) => (VPMADDWDMasked256 x y (VPMOVVec16x16ToM mask))
 (PairDotProdMaskedInt16x32 x y mask) => (VPMADDWDMasked512 x y (VPMOVVec16x32ToM mask))
-(PairwiseAddFloat32x4 ...) => (VHADDPS128 ...)
-(PairwiseAddFloat32x8 ...) => (VHADDPS256 ...)
-(PairwiseAddFloat64x2 ...) => (VHADDPD128 ...)
-(PairwiseAddFloat64x4 ...) => (VHADDPD256 ...)
-(PairwiseAddInt16x8 ...) => (VPHADDW128 ...)
-(PairwiseAddInt16x16 ...) => (VPHADDW256 ...)
-(PairwiseAddInt32x4 ...) => (VPHADDD128 ...)
-(PairwiseAddInt32x8 ...) => (VPHADDD256 ...)
-(PairwiseAddUint16x8 ...) => (VPHADDW128 ...)
-(PairwiseAddUint16x16 ...) => (VPHADDW256 ...)
-(PairwiseAddUint32x4 ...) => (VPHADDD128 ...)
-(PairwiseAddUint32x8 ...) => (VPHADDD256 ...)
-(PairwiseSubFloat32x4 ...) => (VHSUBPS128 ...)
-(PairwiseSubFloat32x8 ...) => (VHSUBPS256 ...)
-(PairwiseSubFloat64x2 ...) => (VHSUBPD128 ...)
-(PairwiseSubFloat64x4 ...) => (VHSUBPD256 ...)
-(PairwiseSubInt16x8 ...) => (VPHSUBW128 ...)
-(PairwiseSubInt16x16 ...) => (VPHSUBW256 ...)
-(PairwiseSubInt32x4 ...) => (VPHSUBD128 ...)
-(PairwiseSubInt32x8 ...) => (VPHSUBD256 ...)
-(PairwiseSubUint16x8 ...) => (VPHSUBW128 ...)
-(PairwiseSubUint16x16 ...) => (VPHSUBW256 ...)
-(PairwiseSubUint32x4 ...) => (VPHSUBD128 ...)
-(PairwiseSubUint32x8 ...) => (VPHSUBD256 ...)
 (PermuteFloat32x8 ...) => (VPERMPS256 ...)
 (PermuteFloat32x16 ...) => (VPERMPS512 ...)
 (PermuteFloat64x4 ...) => (VPERMPD256 ...)
@@ -1295,76 +1273,36 @@
 (RoundFloat32x8 x) => (VROUNDPS256 [0] x)
 (RoundFloat64x2 x) => (VROUNDPD128 [0] x)
 (RoundFloat64x4 x) => (VROUNDPD256 [0] x)
-(RoundWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+0] x)
-(RoundWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+0] x)
-(RoundWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+0] x)
-(RoundWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+0] x)
-(RoundWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+0] x)
-(RoundWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+0] x)
-(RoundWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask))
-(RoundWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask))
-(RoundWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask))
-(RoundWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask))
-(RoundWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask))
-(RoundWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask))
-(SaturatedAddInt8x16 ...) => (VPADDSB128 ...)
-(SaturatedAddInt8x32 ...) => (VPADDSB256 ...)
-(SaturatedAddInt8x64 ...) => (VPADDSB512 ...)
-(SaturatedAddInt16x8 ...) => (VPADDSW128 ...)
-(SaturatedAddInt16x16 ...) => (VPADDSW256 ...)
-(SaturatedAddInt16x32 ...) => (VPADDSW512 ...)
-(SaturatedAddUint8x16 ...) => (VPADDSB128 ...)
-(SaturatedAddUint8x32 ...) => (VPADDSB256 ...)
-(SaturatedAddUint8x64 ...) => (VPADDSB512 ...)
-(SaturatedAddUint16x8 ...) => (VPADDSW128 ...)
-(SaturatedAddUint16x16 ...) => (VPADDSW256 ...)
-(SaturatedAddUint16x32 ...) => (VPADDSW512 ...)
+(RoundScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+0] x)
+(RoundScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+0] x)
+(RoundScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+0] x)
+(RoundScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+0] x)
+(RoundScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+0] x)
+(RoundScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+0] x)
+(RoundScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask))
+(RoundScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask))
+(RoundScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask))
+(RoundScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask))
+(RoundScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask))
+(RoundScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask))
+(RoundScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+0] x)
+(RoundScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+0] x)
+(RoundScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+0] x)
+(RoundScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+0] x)
+(RoundScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+0] x)
+(RoundScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+0] x)
+(RoundScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask))
+(RoundScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask))
+(RoundScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask))
+(RoundScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask))
+(RoundScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask))
+(RoundScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask))
 (SaturatedAddDotProdInt32x4 ...) => (VPDPWSSDS128 ...)
 (SaturatedAddDotProdInt32x8 ...) => (VPDPWSSDS256 ...)
 (SaturatedAddDotProdInt32x16 ...) => (VPDPWSSDS512 ...)
 (SaturatedAddDotProdMaskedInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM mask))
 (SaturatedAddDotProdMaskedInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM mask))
 (SaturatedAddDotProdMaskedInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM mask))
-(SaturatedAddMaskedInt8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask))
-(SaturatedAddMaskedInt8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask))
-(SaturatedAddMaskedInt8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask))
-(SaturatedAddMaskedInt16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask))
-(SaturatedAddMaskedInt16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask))
-(SaturatedAddMaskedInt16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask))
-(SaturatedAddMaskedUint8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask))
-(SaturatedAddMaskedUint8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask))
-(SaturatedAddMaskedUint8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask))
-(SaturatedAddMaskedUint16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask))
-(SaturatedAddMaskedUint16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask))
-(SaturatedAddMaskedUint16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask))
-(SaturatedPairwiseAddInt16x8 ...) => (VPHADDSW128 ...)
-(SaturatedPairwiseAddInt16x16 ...) => (VPHADDSW256 ...)
-(SaturatedPairwiseSubInt16x8 ...) => (VPHSUBSW128 ...)
-(SaturatedPairwiseSubInt16x16 ...) => (VPHSUBSW256 ...)
-(SaturatedSubInt8x16 ...) => (VPSUBSB128 ...)
-(SaturatedSubInt8x32 ...) => (VPSUBSB256 ...)
-(SaturatedSubInt8x64 ...) => (VPSUBSB512 ...)
-(SaturatedSubInt16x8 ...) => (VPSUBSW128 ...)
-(SaturatedSubInt16x16 ...) => (VPSUBSW256 ...)
-(SaturatedSubInt16x32 ...) => (VPSUBSW512 ...)
-(SaturatedSubUint8x16 ...) => (VPSUBSB128 ...)
-(SaturatedSubUint8x32 ...) => (VPSUBSB256 ...)
-(SaturatedSubUint8x64 ...) => (VPSUBSB512 ...)
-(SaturatedSubUint16x8 ...) => (VPSUBSW128 ...)
-(SaturatedSubUint16x16 ...) => (VPSUBSW256 ...)
-(SaturatedSubUint16x32 ...) => (VPSUBSW512 ...)
-(SaturatedSubMaskedInt8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask))
-(SaturatedSubMaskedInt8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask))
-(SaturatedSubMaskedInt8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask))
-(SaturatedSubMaskedInt16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask))
-(SaturatedSubMaskedInt16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask))
-(SaturatedSubMaskedInt16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask))
-(SaturatedSubMaskedUint8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask))
-(SaturatedSubMaskedUint8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask))
-(SaturatedSubMaskedUint8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask))
-(SaturatedSubMaskedUint16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask))
-(SaturatedSubMaskedUint16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask))
-(SaturatedSubMaskedUint16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask))
 (SaturatedUnsignedSignedPairDotProdUint8x16 ...) => (VPMADDUBSW128 ...)
 (SaturatedUnsignedSignedPairDotProdUint8x32 ...) => (VPMADDUBSW256 ...)
 (SaturatedUnsignedSignedPairDotProdUint8x64 ...) => (VPMADDUBSW512 ...)
@@ -1377,6 +1315,18 @@
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPBUSDSMasked128 x y z (VPMOVVec32x4ToM mask))
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPBUSDSMasked256 x y z (VPMOVVec32x8ToM mask))
 (SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPBUSDSMasked512 x y z (VPMOVVec32x16ToM mask))
+(ScaleFloat32x4 ...) => (VSCALEFPS128 ...)
+(ScaleFloat32x8 ...) => (VSCALEFPS256 ...)
+(ScaleFloat32x16 ...) => (VSCALEFPS512 ...)
+(ScaleFloat64x2 ...) => (VSCALEFPD128 ...)
+(ScaleFloat64x4 ...) => (VSCALEFPD256 ...)
+(ScaleFloat64x8 ...) => (VSCALEFPD512 ...)
+(ScaleMaskedFloat32x4 x y mask) => (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM mask))
+(ScaleMaskedFloat32x8 x y mask) => (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM mask))
+(ScaleMaskedFloat32x16 x y mask) => (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM mask))
+(ScaleMaskedFloat64x2 x y mask) => (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM mask))
+(ScaleMaskedFloat64x4 x y mask) => (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM mask))
+(ScaleMaskedFloat64x8 x y mask) => (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM mask))
 (Set128Float32x8 ...) => (VINSERTF128256 ...)
 (Set128Float64x4 ...) => (VINSERTF128256 ...)
 (Set128Int8x32 ...) => (VINSERTI128256 ...)
@@ -1761,22 +1711,72 @@
 (SubMaskedUint64x2 x y mask) => (VPSUBQMasked128 x y (VPMOVVec64x2ToM mask))
 (SubMaskedUint64x4 x y mask) => (VPSUBQMasked256 x y (VPMOVVec64x4ToM mask))
 (SubMaskedUint64x8 x y mask) => (VPSUBQMasked512 x y (VPMOVVec64x8ToM mask))
+(SubPairsFloat32x4 ...) => (VHSUBPS128 ...)
+(SubPairsFloat32x8 ...) => (VHSUBPS256 ...)
+(SubPairsFloat64x2 ...) => (VHSUBPD128 ...)
+(SubPairsFloat64x4 ...) => (VHSUBPD256 ...)
+(SubPairsInt16x8 ...) => (VPHSUBW128 ...)
+(SubPairsInt16x16 ...) => (VPHSUBW256 ...)
+(SubPairsInt32x4 ...) => (VPHSUBD128 ...)
+(SubPairsInt32x8 ...) => (VPHSUBD256 ...)
+(SubPairsUint16x8 ...) => (VPHSUBW128 ...)
+(SubPairsUint16x16 ...) => (VPHSUBW256 ...)
+(SubPairsUint32x4 ...) => (VPHSUBD128 ...)
+(SubPairsUint32x8 ...) => (VPHSUBD256 ...)
+(SubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...)
+(SubPairsSaturatedInt16x16 ...) => (VPHSUBSW256 ...)
+(SubSaturatedInt8x16 ...) => (VPSUBSB128 ...)
+(SubSaturatedInt8x32 ...) => (VPSUBSB256 ...)
+(SubSaturatedInt8x64 ...) => (VPSUBSB512 ...)
+(SubSaturatedInt16x8 ...) => (VPSUBSW128 ...)
+(SubSaturatedInt16x16 ...) => (VPSUBSW256 ...)
+(SubSaturatedInt16x32 ...) => (VPSUBSW512 ...)
+(SubSaturatedUint8x16 ...) => (VPSUBSB128 ...)
+(SubSaturatedUint8x32 ...) => (VPSUBSB256 ...)
+(SubSaturatedUint8x64 ...) => (VPSUBSB512 ...)
+(SubSaturatedUint16x8 ...) => (VPSUBSW128 ...)
+(SubSaturatedUint16x16 ...) => (VPSUBSW256 ...)
+(SubSaturatedUint16x32 ...) => (VPSUBSW512 ...)
+(SubSaturatedMaskedInt8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask))
+(SubSaturatedMaskedInt8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask))
+(SubSaturatedMaskedInt8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask))
+(SubSaturatedMaskedInt16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask))
+(SubSaturatedMaskedInt16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask))
+(SubSaturatedMaskedInt16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask))
+(SubSaturatedMaskedUint8x16 x y mask) => (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask))
+(SubSaturatedMaskedUint8x32 x y mask) => (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask))
+(SubSaturatedMaskedUint8x64 x y mask) => (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask))
+(SubSaturatedMaskedUint16x8 x y mask) => (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask))
+(SubSaturatedMaskedUint16x16 x y mask) => (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask))
+(SubSaturatedMaskedUint16x32 x y mask) => (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask))
 (TruncFloat32x4 x) => (VROUNDPS128 [3] x)
 (TruncFloat32x8 x) => (VROUNDPS256 [3] x)
 (TruncFloat64x2 x) => (VROUNDPD128 [3] x)
 (TruncFloat64x4 x) => (VROUNDPD256 [3] x)
-(TruncWithPrecisionFloat32x4 [a] x) => (VRNDSCALEPS128 [a+3] x)
-(TruncWithPrecisionFloat32x8 [a] x) => (VRNDSCALEPS256 [a+3] x)
-(TruncWithPrecisionFloat32x16 [a] x) => (VRNDSCALEPS512 [a+3] x)
-(TruncWithPrecisionFloat64x2 [a] x) => (VRNDSCALEPD128 [a+3] x)
-(TruncWithPrecisionFloat64x4 [a] x) => (VRNDSCALEPD256 [a+3] x)
-(TruncWithPrecisionFloat64x8 [a] x) => (VRNDSCALEPD512 [a+3] x)
-(TruncWithPrecisionMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask))
-(TruncWithPrecisionMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask))
-(TruncWithPrecisionMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask))
-(TruncWithPrecisionMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask))
-(TruncWithPrecisionMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask))
-(TruncWithPrecisionMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask))
+(TruncScaledFloat32x4 [a] x) => (VRNDSCALEPS128 [a+3] x)
+(TruncScaledFloat32x8 [a] x) => (VRNDSCALEPS256 [a+3] x)
+(TruncScaledFloat32x16 [a] x) => (VRNDSCALEPS512 [a+3] x)
+(TruncScaledFloat64x2 [a] x) => (VRNDSCALEPD128 [a+3] x)
+(TruncScaledFloat64x4 [a] x) => (VRNDSCALEPD256 [a+3] x)
+(TruncScaledFloat64x8 [a] x) => (VRNDSCALEPD512 [a+3] x)
+(TruncScaledMaskedFloat32x4 [a] x mask) => (VRNDSCALEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask))
+(TruncScaledMaskedFloat32x8 [a] x mask) => (VRNDSCALEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask))
+(TruncScaledMaskedFloat32x16 [a] x mask) => (VRNDSCALEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask))
+(TruncScaledMaskedFloat64x2 [a] x mask) => (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask))
+(TruncScaledMaskedFloat64x4 [a] x mask) => (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask))
+(TruncScaledMaskedFloat64x8 [a] x mask) => (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask))
+(TruncScaledResidueFloat32x4 [a] x) => (VREDUCEPS128 [a+3] x)
+(TruncScaledResidueFloat32x8 [a] x) => (VREDUCEPS256 [a+3] x)
+(TruncScaledResidueFloat32x16 [a] x) => (VREDUCEPS512 [a+3] x)
+(TruncScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+3] x)
+(TruncScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+3] x)
+(TruncScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+3] x)
+(TruncScaledResidueMaskedFloat32x4 [a] x mask) => (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask))
+(TruncScaledResidueMaskedFloat32x8 [a] x mask) => (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask))
+(TruncScaledResidueMaskedFloat32x16 [a] x mask) => (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask))
+(TruncScaledResidueMaskedFloat64x2 [a] x mask) => (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask))
+(TruncScaledResidueMaskedFloat64x4 [a] x mask) => (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask))
+(TruncScaledResidueMaskedFloat64x8 [a] x mask) => (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask))
 (UnsignedSignedQuadDotProdAccumulateInt32x4 ...) => (VPDPBUSD128 ...)
 (UnsignedSignedQuadDotProdAccumulateInt32x8 ...) => (VPDPBUSD256 ...)
 (UnsignedSignedQuadDotProdAccumulateInt32x16 ...) => (VPDPBUSD512 ...)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index 492a994e936..ea52254413f 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -81,6 +81,44 @@ func simdGenericOps() []opData {
 		{name: "AddMaskedUint64x2", argLength: 3, commutative: true},
 		{name: "AddMaskedUint64x4", argLength: 3, commutative: true},
 		{name: "AddMaskedUint64x8", argLength: 3, commutative: true},
+		{name: "AddPairsFloat32x4", argLength: 2, commutative: false},
+		{name: "AddPairsFloat32x8", argLength: 2, commutative: false},
+		{name: "AddPairsFloat64x2", argLength: 2, commutative: false},
+		{name: "AddPairsFloat64x4", argLength: 2, commutative: false},
+		{name: "AddPairsInt16x8", argLength: 2, commutative: false},
+		{name: "AddPairsInt16x16", argLength: 2, commutative: false},
+		{name: "AddPairsInt32x4", argLength: 2, commutative: false},
+		{name: "AddPairsInt32x8", argLength: 2, commutative: false},
+		{name: "AddPairsSaturatedInt16x8", argLength: 2, commutative: false},
+		{name: "AddPairsSaturatedInt16x16", argLength: 2, commutative: false},
+		{name: "AddPairsUint16x8", argLength: 2, commutative: false},
+		{name: "AddPairsUint16x16", argLength: 2, commutative: false},
+		{name: "AddPairsUint32x4", argLength: 2, commutative: false},
+		{name: "AddPairsUint32x8", argLength: 2, commutative: false},
+		{name: "AddSaturatedInt8x16", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt8x32", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt8x64", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt16x8", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt16x16", argLength: 2, commutative: true},
+		{name: "AddSaturatedInt16x32", argLength: 2, commutative: true},
+		{name: "AddSaturatedMaskedInt8x16", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt8x32", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt8x64", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt16x8", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt16x16", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedInt16x32", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint8x16", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint8x32", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint8x64", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint16x8", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint16x16", argLength: 3, commutative: true},
+		{name: "AddSaturatedMaskedUint16x32", argLength: 3, commutative: true},
+		{name: "AddSaturatedUint8x16", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint8x32", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint8x64", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint16x8", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint16x16", argLength: 2, commutative: true},
+		{name: "AddSaturatedUint16x32", argLength: 2, commutative: true},
 		{name: "AddSubFloat32x4", argLength: 2, commutative: false},
 		{name: "AddSubFloat32x8", argLength: 2, commutative: false},
 		{name: "AddSubFloat64x2", argLength: 2, commutative: false},
@@ -744,18 +782,6 @@ func simdGenericOps() []opData {
 		{name: "MinUint64x2", argLength: 2, commutative: true},
 		{name: "MinUint64x4", argLength: 2, commutative: true},
 		{name: "MinUint64x8", argLength: 2, commutative: true},
-		{name: "MulByPowOf2Float32x4", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float32x8", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float32x16", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float64x2", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float64x4", argLength: 2, commutative: false},
-		{name: "MulByPowOf2Float64x8", argLength: 2, commutative: false},
-		{name: "MulByPowOf2MaskedFloat32x4", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat32x8", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat32x16", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat64x2", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat64x4", argLength: 3, commutative: false},
-		{name: "MulByPowOf2MaskedFloat64x8", argLength: 3, commutative: false},
 		{name: "MulEvenWidenInt32x4", argLength: 2, commutative: true},
 		{name: "MulEvenWidenInt32x8", argLength: 2, commutative: true},
 		{name: "MulEvenWidenInt64x2", argLength: 2, commutative: true},
@@ -790,30 +816,30 @@ func simdGenericOps() []opData {
 		{name: "MulHighUint16x8", argLength: 2, commutative: true},
 		{name: "MulHighUint16x16", argLength: 2, commutative: true},
 		{name: "MulHighUint16x32", argLength: 2, commutative: true},
-		{name: "MulLowInt16x8", argLength: 2, commutative: true},
-		{name: "MulLowInt16x16", argLength: 2, commutative: true},
-		{name: "MulLowInt16x32", argLength: 2, commutative: true},
-		{name: "MulLowInt32x4", argLength: 2, commutative: true},
-		{name: "MulLowInt32x8", argLength: 2, commutative: true},
-		{name: "MulLowInt32x16", argLength: 2, commutative: true},
-		{name: "MulLowInt64x2", argLength: 2, commutative: true},
-		{name: "MulLowInt64x4", argLength: 2, commutative: true},
-		{name: "MulLowInt64x8", argLength: 2, commutative: true},
-		{name: "MulLowMaskedInt16x8", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt16x16", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt16x32", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt32x4", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt32x8", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt32x16", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt64x2", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt64x4", argLength: 3, commutative: true},
-		{name: "MulLowMaskedInt64x8", argLength: 3, commutative: true},
+		{name: "MulInt16x8", argLength: 2, commutative: true},
+		{name: "MulInt16x16", argLength: 2, commutative: true},
+		{name: "MulInt16x32", argLength: 2, commutative: true},
+		{name: "MulInt32x4", argLength: 2, commutative: true},
+		{name: "MulInt32x8", argLength: 2, commutative: true},
+		{name: "MulInt32x16", argLength: 2, commutative: true},
+		{name: "MulInt64x2", argLength: 2, commutative: true},
+		{name: "MulInt64x4", argLength: 2, commutative: true},
+		{name: "MulInt64x8", argLength: 2, commutative: true},
 		{name: "MulMaskedFloat32x4", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat32x8", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat32x16", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat64x2", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat64x4", argLength: 3, commutative: true},
 		{name: "MulMaskedFloat64x8", argLength: 3, commutative: true},
+		{name: "MulMaskedInt16x8", argLength: 3, commutative: true},
+		{name: "MulMaskedInt16x16", argLength: 3, commutative: true},
+		{name: "MulMaskedInt16x32", argLength: 3, commutative: true},
+		{name: "MulMaskedInt32x4", argLength: 3, commutative: true},
+		{name: "MulMaskedInt32x8", argLength: 3, commutative: true},
+		{name: "MulMaskedInt32x16", argLength: 3, commutative: true},
+		{name: "MulMaskedInt64x2", argLength: 3, commutative: true},
+		{name: "MulMaskedInt64x4", argLength: 3, commutative: true},
+		{name: "MulMaskedInt64x8", argLength: 3, commutative: true},
 		{name: "NotEqualFloat32x4", argLength: 2, commutative: true},
 		{name: "NotEqualFloat32x8", argLength: 2, commutative: true},
 		{name: "NotEqualFloat32x16", argLength: 2, commutative: true},
@@ -916,30 +942,6 @@ func simdGenericOps() []opData {
 		{name: "PairDotProdMaskedInt16x8", argLength: 3, commutative: false},
 		{name: "PairDotProdMaskedInt16x16", argLength: 3, commutative: false},
 		{name: "PairDotProdMaskedInt16x32", argLength: 3, commutative: false},
-		{name: "PairwiseAddFloat32x4", argLength: 2, commutative: false},
-		{name: "PairwiseAddFloat32x8", argLength: 2, commutative: false},
-		{name: "PairwiseAddFloat64x2", argLength: 2, commutative: false},
-		{name: "PairwiseAddFloat64x4", argLength: 2, commutative: false},
-		{name: "PairwiseAddInt16x8", argLength: 2, commutative: false},
-		{name: "PairwiseAddInt16x16", argLength: 2, commutative: false},
-		{name: "PairwiseAddInt32x4", argLength: 2, commutative: false},
-		{name: "PairwiseAddInt32x8", argLength: 2, commutative: false},
-		{name: "PairwiseAddUint16x8", argLength: 2, commutative: false},
-		{name: "PairwiseAddUint16x16", argLength: 2, commutative: false},
-		{name: "PairwiseAddUint32x4", argLength: 2, commutative: false},
-		{name: "PairwiseAddUint32x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubFloat32x4", argLength: 2, commutative: false},
-		{name: "PairwiseSubFloat32x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubFloat64x2", argLength: 2, commutative: false},
-		{name: "PairwiseSubFloat64x4", argLength: 2, commutative: false},
-		{name: "PairwiseSubInt16x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubInt16x16", argLength: 2, commutative: false},
-		{name: "PairwiseSubInt32x4", argLength: 2, commutative: false},
-		{name: "PairwiseSubInt32x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubUint16x8", argLength: 2, commutative: false},
-		{name: "PairwiseSubUint16x16", argLength: 2, commutative: false},
-		{name: "PairwiseSubUint32x4", argLength: 2, commutative: false},
-		{name: "PairwiseSubUint32x8", argLength: 2, commutative: false},
 		{name: "Permute2Float32x4", argLength: 3, commutative: false},
 		{name: "Permute2Float32x8", argLength: 3, commutative: false},
 		{name: "Permute2Float32x16", argLength: 3, commutative: false},
@@ -1154,58 +1156,6 @@ func simdGenericOps() []opData {
 		{name: "SaturatedAddDotProdMaskedInt32x4", argLength: 4, commutative: false},
 		{name: "SaturatedAddDotProdMaskedInt32x8", argLength: 4, commutative: false},
 		{name: "SaturatedAddDotProdMaskedInt32x16", argLength: 4, commutative: false},
-		{name: "SaturatedAddInt8x16", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt8x32", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt8x64", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt16x8", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt16x16", argLength: 2, commutative: true},
-		{name: "SaturatedAddInt16x32", argLength: 2, commutative: true},
-		{name: "SaturatedAddMaskedInt8x16", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt8x32", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt8x64", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt16x8", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt16x16", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedInt16x32", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint8x16", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint8x32", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint8x64", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint16x8", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint16x16", argLength: 3, commutative: true},
-		{name: "SaturatedAddMaskedUint16x32", argLength: 3, commutative: true},
-		{name: "SaturatedAddUint8x16", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint8x32", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint8x64", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint16x8", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint16x16", argLength: 2, commutative: true},
-		{name: "SaturatedAddUint16x32", argLength: 2, commutative: true},
-		{name: "SaturatedPairwiseAddInt16x8", argLength: 2, commutative: false},
-		{name: "SaturatedPairwiseAddInt16x16", argLength: 2, commutative: false},
-		{name: "SaturatedPairwiseSubInt16x8", argLength: 2, commutative: false},
-		{name: "SaturatedPairwiseSubInt16x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt8x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt8x32", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt8x64", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt16x8", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt16x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubInt16x32", argLength: 2, commutative: false},
-		{name: "SaturatedSubMaskedInt8x16", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt8x32", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt8x64", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt16x8", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt16x16", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedInt16x32", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint8x16", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint8x32", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint8x64", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint16x8", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint16x16", argLength: 3, commutative: false},
-		{name: "SaturatedSubMaskedUint16x32", argLength: 3, commutative: false},
-		{name: "SaturatedSubUint8x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint8x32", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint8x64", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint16x8", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint16x16", argLength: 2, commutative: false},
-		{name: "SaturatedSubUint16x32", argLength: 2, commutative: false},
 		{name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x16", argLength: 3, commutative: false},
 		{name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x32", argLength: 3, commutative: false},
 		{name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x64", argLength: 3, commutative: false},
@@ -1218,6 +1168,18 @@ func simdGenericOps() []opData {
 		{name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4", argLength: 4, commutative: false},
 		{name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8", argLength: 4, commutative: false},
 		{name: "SaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16", argLength: 4, commutative: false},
+		{name: "ScaleFloat32x4", argLength: 2, commutative: false},
+		{name: "ScaleFloat32x8", argLength: 2, commutative: false},
+		{name: "ScaleFloat32x16", argLength: 2, commutative: false},
+		{name: "ScaleFloat64x2", argLength: 2, commutative: false},
+		{name: "ScaleFloat64x4", argLength: 2, commutative: false},
+		{name: "ScaleFloat64x8", argLength: 2, commutative: false},
+		{name: "ScaleMaskedFloat32x4", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat32x8", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat32x16", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat64x2", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat64x4", argLength: 3, commutative: false},
+		{name: "ScaleMaskedFloat64x8", argLength: 3, commutative: false},
 		{name: "ShiftAllLeftInt16x8", argLength: 2, commutative: false},
 		{name: "ShiftAllLeftInt16x16", argLength: 2, commutative: false},
 		{name: "ShiftAllLeftInt16x32", argLength: 2, commutative: false},
@@ -1500,6 +1462,44 @@ func simdGenericOps() []opData {
 		{name: "SubMaskedUint64x2", argLength: 3, commutative: false},
 		{name: "SubMaskedUint64x4", argLength: 3, commutative: false},
 		{name: "SubMaskedUint64x8", argLength: 3, commutative: false},
+		{name: "SubPairsFloat32x4", argLength: 2, commutative: false},
+		{name: "SubPairsFloat32x8", argLength: 2, commutative: false},
+		{name: "SubPairsFloat64x2", argLength: 2, commutative: false},
+		{name: "SubPairsFloat64x4", argLength: 2, commutative: false},
+		{name: "SubPairsInt16x8", argLength: 2, commutative: false},
+		{name: "SubPairsInt16x16", argLength: 2, commutative: false},
+		{name: "SubPairsInt32x4", argLength: 2, commutative: false},
+		{name: "SubPairsInt32x8", argLength: 2, commutative: false},
+		{name: "SubPairsSaturatedInt16x8", argLength: 2, commutative: false},
+		{name: "SubPairsSaturatedInt16x16", argLength: 2, commutative: false},
+		{name: "SubPairsUint16x8", argLength: 2, commutative: false},
+		{name: "SubPairsUint16x16", argLength: 2, commutative: false},
+		{name: "SubPairsUint32x4", argLength: 2, commutative: false},
+		{name: "SubPairsUint32x8", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt8x16", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt8x32", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt8x64", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt16x8", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt16x16", argLength: 2, commutative: false},
+		{name: "SubSaturatedInt16x32", argLength: 2, commutative: false},
+		{name: "SubSaturatedMaskedInt8x16", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt8x32", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt8x64", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt16x8", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt16x16", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedInt16x32", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint8x16", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint8x32", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint8x64", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint16x8", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint16x16", argLength: 3, commutative: false},
+		{name: "SubSaturatedMaskedUint16x32", argLength: 3, commutative: false},
+		{name: "SubSaturatedUint8x16", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint8x32", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint8x64", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint16x8", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint16x16", argLength: 2, commutative: false},
+		{name: "SubSaturatedUint16x32", argLength: 2, commutative: false},
 		{name: "SubUint8x16", argLength: 2, commutative: false},
 		{name: "SubUint8x32", argLength: 2, commutative: false},
 		{name: "SubUint8x64", argLength: 2, commutative: false},
@@ -1558,78 +1558,54 @@ func simdGenericOps() []opData {
 		{name: "XorUint64x2", argLength: 2, commutative: true},
 		{name: "XorUint64x4", argLength: 2, commutative: true},
 		{name: "XorUint64x8", argLength: 2, commutative: true},
-		{name: "CeilWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "CeilWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithCeilWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
-		{name: "DiffWithFloorWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
-		{name:
"DiffWithFloorWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithFloorWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithFloorWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithFloorWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithFloorWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithFloorWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithFloorWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithRoundWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "DiffWithTruncWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: 
"FloorWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "FloorWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "CeilScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: 
"Int8"}, + {name: "FloorScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "FloorScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, {name: "GaloisFieldAffineTransformInverseMaskedUint8x16", argLength: 3, commutative: false, aux: "Int8"}, {name: "GaloisFieldAffineTransformInverseMaskedUint8x32", argLength: 3, commutative: false, aux: "Int8"}, {name: "GaloisFieldAffineTransformInverseMaskedUint8x64", argLength: 3, commutative: false, aux: "Int8"}, @@ -1708,18 +1684,30 @@ func simdGenericOps() []opData { {name: "RotateAllRightUint64x2", argLength: 1, commutative: false, aux: "Int8"}, {name: "RotateAllRightUint64x4", argLength: 1, commutative: false, aux: "Int8"}, {name: "RotateAllRightUint64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "RoundWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: 
"Int8"}, + {name: "RoundScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "RoundScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, {name: "Set128Float32x8", argLength: 2, commutative: false, aux: "Int8"}, {name: "Set128Float64x4", argLength: 2, commutative: false, aux: "Int8"}, {name: "Set128Int8x32", argLength: 2, commutative: false, aux: "Int8"}, @@ -1810,17 +1798,29 @@ func simdGenericOps() []opData { {name: "ShiftAllRightConcatUint64x2", argLength: 2, commutative: false, aux: "Int8"}, {name: "ShiftAllRightConcatUint64x4", argLength: 2, commutative: false, aux: "Int8"}, {name: "ShiftAllRightConcatUint64x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, - {name: "TruncWithPrecisionMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledFloat64x4", argLength: 1, commutative: 
false, aux: "Int8"}, + {name: "TruncScaledFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueFloat32x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueFloat32x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueFloat32x16", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueMaskedFloat32x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueMaskedFloat32x8", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueMaskedFloat32x16", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueMaskedFloat64x2", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueMaskedFloat64x4", argLength: 2, commutative: false, aux: "Int8"}, + {name: "TruncScaledResidueMaskedFloat64x8", argLength: 2, commutative: false, aux: "Int8"}, } } diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index e8a5354c001..6dcbec2573b 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -4567,6 +4567,44 @@ const ( OpAddMaskedUint64x2 OpAddMaskedUint64x4 OpAddMaskedUint64x8 + OpAddPairsFloat32x4 + OpAddPairsFloat32x8 + OpAddPairsFloat64x2 + OpAddPairsFloat64x4 + OpAddPairsInt16x8 + OpAddPairsInt16x16 + OpAddPairsInt32x4 + OpAddPairsInt32x8 + OpAddPairsSaturatedInt16x8 + OpAddPairsSaturatedInt16x16 + OpAddPairsUint16x8 + OpAddPairsUint16x16 + OpAddPairsUint32x4 + OpAddPairsUint32x8 + OpAddSaturatedInt8x16 + OpAddSaturatedInt8x32 + OpAddSaturatedInt8x64 + OpAddSaturatedInt16x8 + OpAddSaturatedInt16x16 + OpAddSaturatedInt16x32 + OpAddSaturatedMaskedInt8x16 + OpAddSaturatedMaskedInt8x32 + OpAddSaturatedMaskedInt8x64 + OpAddSaturatedMaskedInt16x8 + OpAddSaturatedMaskedInt16x16 + OpAddSaturatedMaskedInt16x32 + OpAddSaturatedMaskedUint8x16 + OpAddSaturatedMaskedUint8x32 + OpAddSaturatedMaskedUint8x64 + OpAddSaturatedMaskedUint16x8 + OpAddSaturatedMaskedUint16x16 + OpAddSaturatedMaskedUint16x32 + OpAddSaturatedUint8x16 + OpAddSaturatedUint8x32 + OpAddSaturatedUint8x64 + OpAddSaturatedUint16x8 + OpAddSaturatedUint16x16 + OpAddSaturatedUint16x32 OpAddSubFloat32x4 OpAddSubFloat32x8 OpAddSubFloat64x2 @@ -5230,18 +5268,6 @@ const ( OpMinUint64x2 OpMinUint64x4 OpMinUint64x8 - OpMulByPowOf2Float32x4 - OpMulByPowOf2Float32x8 - OpMulByPowOf2Float32x16 - OpMulByPowOf2Float64x2 - OpMulByPowOf2Float64x4 - OpMulByPowOf2Float64x8 - OpMulByPowOf2MaskedFloat32x4 - OpMulByPowOf2MaskedFloat32x8 - OpMulByPowOf2MaskedFloat32x16 - OpMulByPowOf2MaskedFloat64x2 - OpMulByPowOf2MaskedFloat64x4 - OpMulByPowOf2MaskedFloat64x8 OpMulEvenWidenInt32x4 OpMulEvenWidenInt32x8 OpMulEvenWidenInt64x2 @@ -5276,30 
+5302,30 @@ const ( OpMulHighUint16x8 OpMulHighUint16x16 OpMulHighUint16x32 - OpMulLowInt16x8 - OpMulLowInt16x16 - OpMulLowInt16x32 - OpMulLowInt32x4 - OpMulLowInt32x8 - OpMulLowInt32x16 - OpMulLowInt64x2 - OpMulLowInt64x4 - OpMulLowInt64x8 - OpMulLowMaskedInt16x8 - OpMulLowMaskedInt16x16 - OpMulLowMaskedInt16x32 - OpMulLowMaskedInt32x4 - OpMulLowMaskedInt32x8 - OpMulLowMaskedInt32x16 - OpMulLowMaskedInt64x2 - OpMulLowMaskedInt64x4 - OpMulLowMaskedInt64x8 + OpMulInt16x8 + OpMulInt16x16 + OpMulInt16x32 + OpMulInt32x4 + OpMulInt32x8 + OpMulInt32x16 + OpMulInt64x2 + OpMulInt64x4 + OpMulInt64x8 OpMulMaskedFloat32x4 OpMulMaskedFloat32x8 OpMulMaskedFloat32x16 OpMulMaskedFloat64x2 OpMulMaskedFloat64x4 OpMulMaskedFloat64x8 + OpMulMaskedInt16x8 + OpMulMaskedInt16x16 + OpMulMaskedInt16x32 + OpMulMaskedInt32x4 + OpMulMaskedInt32x8 + OpMulMaskedInt32x16 + OpMulMaskedInt64x2 + OpMulMaskedInt64x4 + OpMulMaskedInt64x8 OpNotEqualFloat32x4 OpNotEqualFloat32x8 OpNotEqualFloat32x16 @@ -5402,30 +5428,6 @@ const ( OpPairDotProdMaskedInt16x8 OpPairDotProdMaskedInt16x16 OpPairDotProdMaskedInt16x32 - OpPairwiseAddFloat32x4 - OpPairwiseAddFloat32x8 - OpPairwiseAddFloat64x2 - OpPairwiseAddFloat64x4 - OpPairwiseAddInt16x8 - OpPairwiseAddInt16x16 - OpPairwiseAddInt32x4 - OpPairwiseAddInt32x8 - OpPairwiseAddUint16x8 - OpPairwiseAddUint16x16 - OpPairwiseAddUint32x4 - OpPairwiseAddUint32x8 - OpPairwiseSubFloat32x4 - OpPairwiseSubFloat32x8 - OpPairwiseSubFloat64x2 - OpPairwiseSubFloat64x4 - OpPairwiseSubInt16x8 - OpPairwiseSubInt16x16 - OpPairwiseSubInt32x4 - OpPairwiseSubInt32x8 - OpPairwiseSubUint16x8 - OpPairwiseSubUint16x16 - OpPairwiseSubUint32x4 - OpPairwiseSubUint32x8 OpPermute2Float32x4 OpPermute2Float32x8 OpPermute2Float32x16 @@ -5640,58 +5642,6 @@ const ( OpSaturatedAddDotProdMaskedInt32x4 OpSaturatedAddDotProdMaskedInt32x8 OpSaturatedAddDotProdMaskedInt32x16 - OpSaturatedAddInt8x16 - OpSaturatedAddInt8x32 - OpSaturatedAddInt8x64 - OpSaturatedAddInt16x8 - OpSaturatedAddInt16x16 - OpSaturatedAddInt16x32 - OpSaturatedAddMaskedInt8x16 - OpSaturatedAddMaskedInt8x32 - OpSaturatedAddMaskedInt8x64 - OpSaturatedAddMaskedInt16x8 - OpSaturatedAddMaskedInt16x16 - OpSaturatedAddMaskedInt16x32 - OpSaturatedAddMaskedUint8x16 - OpSaturatedAddMaskedUint8x32 - OpSaturatedAddMaskedUint8x64 - OpSaturatedAddMaskedUint16x8 - OpSaturatedAddMaskedUint16x16 - OpSaturatedAddMaskedUint16x32 - OpSaturatedAddUint8x16 - OpSaturatedAddUint8x32 - OpSaturatedAddUint8x64 - OpSaturatedAddUint16x8 - OpSaturatedAddUint16x16 - OpSaturatedAddUint16x32 - OpSaturatedPairwiseAddInt16x8 - OpSaturatedPairwiseAddInt16x16 - OpSaturatedPairwiseSubInt16x8 - OpSaturatedPairwiseSubInt16x16 - OpSaturatedSubInt8x16 - OpSaturatedSubInt8x32 - OpSaturatedSubInt8x64 - OpSaturatedSubInt16x8 - OpSaturatedSubInt16x16 - OpSaturatedSubInt16x32 - OpSaturatedSubMaskedInt8x16 - OpSaturatedSubMaskedInt8x32 - OpSaturatedSubMaskedInt8x64 - OpSaturatedSubMaskedInt16x8 - OpSaturatedSubMaskedInt16x16 - OpSaturatedSubMaskedInt16x32 - OpSaturatedSubMaskedUint8x16 - OpSaturatedSubMaskedUint8x32 - OpSaturatedSubMaskedUint8x64 - OpSaturatedSubMaskedUint16x8 - OpSaturatedSubMaskedUint16x16 - OpSaturatedSubMaskedUint16x32 - OpSaturatedSubUint8x16 - OpSaturatedSubUint8x32 - OpSaturatedSubUint8x64 - OpSaturatedSubUint16x8 - OpSaturatedSubUint16x16 - OpSaturatedSubUint16x32 OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16 OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32 OpSaturatedUnsignedSignedPairDotProdMaskedUint8x64 @@ -5704,6 +5654,18 @@ const ( 
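The renames in these hunks are mechanical, but the new names are easier to audit against the intended semantics. Below is a minimal scalar sketch of four of the op families touched here, illustrative only and not part of the patch; every helper name is hypothetical. AddSaturated clamps on overflow instead of wrapping, AddPairs is the horizontal add behind VHADDPS/VPHADD, Scale is VSCALEF's x * 2^floor(y), and the Scaled/ScaledResidue pairs model VRNDSCALE/VREDUCE rounding to the number of fractional bits given by the Int8 aux value.

```go
package main

import (
	"fmt"
	"math"
)

// addSaturatedInt8: AddSaturated* ops clamp to the element type's range on
// overflow instead of wrapping (VPADDSB and friends).
func addSaturatedInt8(a, b int8) int8 {
	s := int16(a) + int16(b)
	if s > math.MaxInt8 {
		return math.MaxInt8
	}
	if s < math.MinInt8 {
		return math.MinInt8
	}
	return int8(s)
}

// addPairsFloat32x4: AddPairs* ops sum horizontally adjacent lanes, with the
// first operand's pair sums landing in the low half (VHADDPS lane order).
func addPairsFloat32x4(a, b [4]float32) [4]float32 {
	return [4]float32{a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]}
}

// scale: Scale* ops compute x * 2**floor(y), per VSCALEFPS/VSCALEFPD.
func scale(x, y float64) float64 {
	return x * math.Exp2(math.Floor(y))
}

// ceilScaled rounds x up to m fractional bits (VRNDSCALE with the ceil
// rounding mode; m is the Int8 aux value carried by the *Scaled ops).
func ceilScaled(x float64, m uint8) float64 {
	p := math.Exp2(float64(m))
	return math.Ceil(x*p) / p
}

// ceilScaledResidue is what is left after that rounding (VREDUCE), i.e. what
// the *ScaledResidue ops (formerly DiffWith*WithPrecision) return.
func ceilScaledResidue(x float64, m uint8) float64 {
	return x - ceilScaled(x, m)
}

func main() {
	fmt.Println(addSaturatedInt8(100, 100)) // 127, not the wrapped -56
	fmt.Println(addPairsFloat32x4([4]float32{1, 2, 3, 4}, [4]float32{5, 6, 7, 8})) // [3 7 11 15]
	fmt.Println(scale(3, 2.5))              // 3 * 2^2 = 12
	fmt.Println(ceilScaled(1.3, 2))         // 1.5 (ceil of 1.3 to quarters)
	fmt.Println(ceilScaledResidue(1.3, 2))  // -0.2 (approximately)
}
```

The Masked* variant of each family takes the mask as one extra operand, which is why the masked entries in these tables carry argLen: 3 (argLength: 3 in the generator input) where the unmasked forms have 2.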
OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4 OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8 OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16 + OpScaleFloat32x4 + OpScaleFloat32x8 + OpScaleFloat32x16 + OpScaleFloat64x2 + OpScaleFloat64x4 + OpScaleFloat64x8 + OpScaleMaskedFloat32x4 + OpScaleMaskedFloat32x8 + OpScaleMaskedFloat32x16 + OpScaleMaskedFloat64x2 + OpScaleMaskedFloat64x4 + OpScaleMaskedFloat64x8 OpShiftAllLeftInt16x8 OpShiftAllLeftInt16x16 OpShiftAllLeftInt16x32 @@ -5986,6 +5948,44 @@ const ( OpSubMaskedUint64x2 OpSubMaskedUint64x4 OpSubMaskedUint64x8 + OpSubPairsFloat32x4 + OpSubPairsFloat32x8 + OpSubPairsFloat64x2 + OpSubPairsFloat64x4 + OpSubPairsInt16x8 + OpSubPairsInt16x16 + OpSubPairsInt32x4 + OpSubPairsInt32x8 + OpSubPairsSaturatedInt16x8 + OpSubPairsSaturatedInt16x16 + OpSubPairsUint16x8 + OpSubPairsUint16x16 + OpSubPairsUint32x4 + OpSubPairsUint32x8 + OpSubSaturatedInt8x16 + OpSubSaturatedInt8x32 + OpSubSaturatedInt8x64 + OpSubSaturatedInt16x8 + OpSubSaturatedInt16x16 + OpSubSaturatedInt16x32 + OpSubSaturatedMaskedInt8x16 + OpSubSaturatedMaskedInt8x32 + OpSubSaturatedMaskedInt8x64 + OpSubSaturatedMaskedInt16x8 + OpSubSaturatedMaskedInt16x16 + OpSubSaturatedMaskedInt16x32 + OpSubSaturatedMaskedUint8x16 + OpSubSaturatedMaskedUint8x32 + OpSubSaturatedMaskedUint8x64 + OpSubSaturatedMaskedUint16x8 + OpSubSaturatedMaskedUint16x16 + OpSubSaturatedMaskedUint16x32 + OpSubSaturatedUint8x16 + OpSubSaturatedUint8x32 + OpSubSaturatedUint8x64 + OpSubSaturatedUint16x8 + OpSubSaturatedUint16x16 + OpSubSaturatedUint16x32 OpSubUint8x16 OpSubUint8x32 OpSubUint8x64 @@ -6044,78 +6044,54 @@ const ( OpXorUint64x2 OpXorUint64x4 OpXorUint64x8 - OpCeilWithPrecisionFloat32x4 - OpCeilWithPrecisionFloat32x8 - OpCeilWithPrecisionFloat32x16 - OpCeilWithPrecisionFloat64x2 - OpCeilWithPrecisionFloat64x4 - OpCeilWithPrecisionFloat64x8 - OpCeilWithPrecisionMaskedFloat32x4 - OpCeilWithPrecisionMaskedFloat32x8 - OpCeilWithPrecisionMaskedFloat32x16 - OpCeilWithPrecisionMaskedFloat64x2 - OpCeilWithPrecisionMaskedFloat64x4 - OpCeilWithPrecisionMaskedFloat64x8 - OpDiffWithCeilWithPrecisionFloat32x4 - OpDiffWithCeilWithPrecisionFloat32x8 - OpDiffWithCeilWithPrecisionFloat32x16 - OpDiffWithCeilWithPrecisionFloat64x2 - OpDiffWithCeilWithPrecisionFloat64x4 - OpDiffWithCeilWithPrecisionFloat64x8 - OpDiffWithCeilWithPrecisionMaskedFloat32x4 - OpDiffWithCeilWithPrecisionMaskedFloat32x8 - OpDiffWithCeilWithPrecisionMaskedFloat32x16 - OpDiffWithCeilWithPrecisionMaskedFloat64x2 - OpDiffWithCeilWithPrecisionMaskedFloat64x4 - OpDiffWithCeilWithPrecisionMaskedFloat64x8 - OpDiffWithFloorWithPrecisionFloat32x4 - OpDiffWithFloorWithPrecisionFloat32x8 - OpDiffWithFloorWithPrecisionFloat32x16 - OpDiffWithFloorWithPrecisionFloat64x2 - OpDiffWithFloorWithPrecisionFloat64x4 - OpDiffWithFloorWithPrecisionFloat64x8 - OpDiffWithFloorWithPrecisionMaskedFloat32x4 - OpDiffWithFloorWithPrecisionMaskedFloat32x8 - OpDiffWithFloorWithPrecisionMaskedFloat32x16 - OpDiffWithFloorWithPrecisionMaskedFloat64x2 - OpDiffWithFloorWithPrecisionMaskedFloat64x4 - OpDiffWithFloorWithPrecisionMaskedFloat64x8 - OpDiffWithRoundWithPrecisionFloat32x4 - OpDiffWithRoundWithPrecisionFloat32x8 - OpDiffWithRoundWithPrecisionFloat32x16 - OpDiffWithRoundWithPrecisionFloat64x2 - OpDiffWithRoundWithPrecisionFloat64x4 - OpDiffWithRoundWithPrecisionFloat64x8 - OpDiffWithRoundWithPrecisionMaskedFloat32x4 - OpDiffWithRoundWithPrecisionMaskedFloat32x8 - OpDiffWithRoundWithPrecisionMaskedFloat32x16 - 
OpDiffWithRoundWithPrecisionMaskedFloat64x2 - OpDiffWithRoundWithPrecisionMaskedFloat64x4 - OpDiffWithRoundWithPrecisionMaskedFloat64x8 - OpDiffWithTruncWithPrecisionFloat32x4 - OpDiffWithTruncWithPrecisionFloat32x8 - OpDiffWithTruncWithPrecisionFloat32x16 - OpDiffWithTruncWithPrecisionFloat64x2 - OpDiffWithTruncWithPrecisionFloat64x4 - OpDiffWithTruncWithPrecisionFloat64x8 - OpDiffWithTruncWithPrecisionMaskedFloat32x4 - OpDiffWithTruncWithPrecisionMaskedFloat32x8 - OpDiffWithTruncWithPrecisionMaskedFloat32x16 - OpDiffWithTruncWithPrecisionMaskedFloat64x2 - OpDiffWithTruncWithPrecisionMaskedFloat64x4 - OpDiffWithTruncWithPrecisionMaskedFloat64x8 - OpFloorWithPrecisionFloat32x4 - OpFloorWithPrecisionFloat32x8 - OpFloorWithPrecisionFloat32x16 - OpFloorWithPrecisionFloat64x2 - OpFloorWithPrecisionFloat64x4 - OpFloorWithPrecisionFloat64x8 - OpFloorWithPrecisionMaskedFloat32x4 - OpFloorWithPrecisionMaskedFloat32x8 - OpFloorWithPrecisionMaskedFloat32x16 - OpFloorWithPrecisionMaskedFloat64x2 - OpFloorWithPrecisionMaskedFloat64x4 - OpFloorWithPrecisionMaskedFloat64x8 + OpCeilScaledFloat32x4 + OpCeilScaledFloat32x8 + OpCeilScaledFloat32x16 + OpCeilScaledFloat64x2 + OpCeilScaledFloat64x4 + OpCeilScaledFloat64x8 + OpCeilScaledMaskedFloat32x4 + OpCeilScaledMaskedFloat32x8 + OpCeilScaledMaskedFloat32x16 + OpCeilScaledMaskedFloat64x2 + OpCeilScaledMaskedFloat64x4 + OpCeilScaledMaskedFloat64x8 + OpCeilScaledResidueFloat32x4 + OpCeilScaledResidueFloat32x8 + OpCeilScaledResidueFloat32x16 + OpCeilScaledResidueFloat64x2 + OpCeilScaledResidueFloat64x4 + OpCeilScaledResidueFloat64x8 + OpCeilScaledResidueMaskedFloat32x4 + OpCeilScaledResidueMaskedFloat32x8 + OpCeilScaledResidueMaskedFloat32x16 + OpCeilScaledResidueMaskedFloat64x2 + OpCeilScaledResidueMaskedFloat64x4 + OpCeilScaledResidueMaskedFloat64x8 + OpFloorScaledFloat32x4 + OpFloorScaledFloat32x8 + OpFloorScaledFloat32x16 + OpFloorScaledFloat64x2 + OpFloorScaledFloat64x4 + OpFloorScaledFloat64x8 + OpFloorScaledMaskedFloat32x4 + OpFloorScaledMaskedFloat32x8 + OpFloorScaledMaskedFloat32x16 + OpFloorScaledMaskedFloat64x2 + OpFloorScaledMaskedFloat64x4 + OpFloorScaledMaskedFloat64x8 + OpFloorScaledResidueFloat32x4 + OpFloorScaledResidueFloat32x8 + OpFloorScaledResidueFloat32x16 + OpFloorScaledResidueFloat64x2 + OpFloorScaledResidueFloat64x4 + OpFloorScaledResidueFloat64x8 + OpFloorScaledResidueMaskedFloat32x4 + OpFloorScaledResidueMaskedFloat32x8 + OpFloorScaledResidueMaskedFloat32x16 + OpFloorScaledResidueMaskedFloat64x2 + OpFloorScaledResidueMaskedFloat64x4 + OpFloorScaledResidueMaskedFloat64x8 OpGaloisFieldAffineTransformInverseMaskedUint8x16 OpGaloisFieldAffineTransformInverseMaskedUint8x32 OpGaloisFieldAffineTransformInverseMaskedUint8x64 @@ -6194,18 +6170,30 @@ const ( OpRotateAllRightUint64x2 OpRotateAllRightUint64x4 OpRotateAllRightUint64x8 - OpRoundWithPrecisionFloat32x4 - OpRoundWithPrecisionFloat32x8 - OpRoundWithPrecisionFloat32x16 - OpRoundWithPrecisionFloat64x2 - OpRoundWithPrecisionFloat64x4 - OpRoundWithPrecisionFloat64x8 - OpRoundWithPrecisionMaskedFloat32x4 - OpRoundWithPrecisionMaskedFloat32x8 - OpRoundWithPrecisionMaskedFloat32x16 - OpRoundWithPrecisionMaskedFloat64x2 - OpRoundWithPrecisionMaskedFloat64x4 - OpRoundWithPrecisionMaskedFloat64x8 + OpRoundScaledFloat32x4 + OpRoundScaledFloat32x8 + OpRoundScaledFloat32x16 + OpRoundScaledFloat64x2 + OpRoundScaledFloat64x4 + OpRoundScaledFloat64x8 + OpRoundScaledMaskedFloat32x4 + OpRoundScaledMaskedFloat32x8 + OpRoundScaledMaskedFloat32x16 + OpRoundScaledMaskedFloat64x2 + 
OpRoundScaledMaskedFloat64x4 + OpRoundScaledMaskedFloat64x8 + OpRoundScaledResidueFloat32x4 + OpRoundScaledResidueFloat32x8 + OpRoundScaledResidueFloat32x16 + OpRoundScaledResidueFloat64x2 + OpRoundScaledResidueFloat64x4 + OpRoundScaledResidueFloat64x8 + OpRoundScaledResidueMaskedFloat32x4 + OpRoundScaledResidueMaskedFloat32x8 + OpRoundScaledResidueMaskedFloat32x16 + OpRoundScaledResidueMaskedFloat64x2 + OpRoundScaledResidueMaskedFloat64x4 + OpRoundScaledResidueMaskedFloat64x8 OpSet128Float32x8 OpSet128Float64x4 OpSet128Int8x32 @@ -6296,18 +6284,30 @@ const ( OpShiftAllRightConcatUint64x2 OpShiftAllRightConcatUint64x4 OpShiftAllRightConcatUint64x8 - OpTruncWithPrecisionFloat32x4 - OpTruncWithPrecisionFloat32x8 - OpTruncWithPrecisionFloat32x16 - OpTruncWithPrecisionFloat64x2 - OpTruncWithPrecisionFloat64x4 - OpTruncWithPrecisionFloat64x8 - OpTruncWithPrecisionMaskedFloat32x4 - OpTruncWithPrecisionMaskedFloat32x8 - OpTruncWithPrecisionMaskedFloat32x16 - OpTruncWithPrecisionMaskedFloat64x2 - OpTruncWithPrecisionMaskedFloat64x4 - OpTruncWithPrecisionMaskedFloat64x8 + OpTruncScaledFloat32x4 + OpTruncScaledFloat32x8 + OpTruncScaledFloat32x16 + OpTruncScaledFloat64x2 + OpTruncScaledFloat64x4 + OpTruncScaledFloat64x8 + OpTruncScaledMaskedFloat32x4 + OpTruncScaledMaskedFloat32x8 + OpTruncScaledMaskedFloat32x16 + OpTruncScaledMaskedFloat64x2 + OpTruncScaledMaskedFloat64x4 + OpTruncScaledMaskedFloat64x8 + OpTruncScaledResidueFloat32x4 + OpTruncScaledResidueFloat32x8 + OpTruncScaledResidueFloat32x16 + OpTruncScaledResidueFloat64x2 + OpTruncScaledResidueFloat64x4 + OpTruncScaledResidueFloat64x8 + OpTruncScaledResidueMaskedFloat32x4 + OpTruncScaledResidueMaskedFloat32x8 + OpTruncScaledResidueMaskedFloat32x16 + OpTruncScaledResidueMaskedFloat64x2 + OpTruncScaledResidueMaskedFloat64x4 + OpTruncScaledResidueMaskedFloat64x8 ) var opcodeTable = [...]opInfo{ @@ -62123,6 +62123,220 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "AddPairsFloat32x4", + argLen: 2, + generic: true, + }, + { + name: "AddPairsFloat32x8", + argLen: 2, + generic: true, + }, + { + name: "AddPairsFloat64x2", + argLen: 2, + generic: true, + }, + { + name: "AddPairsFloat64x4", + argLen: 2, + generic: true, + }, + { + name: "AddPairsInt16x8", + argLen: 2, + generic: true, + }, + { + name: "AddPairsInt16x16", + argLen: 2, + generic: true, + }, + { + name: "AddPairsInt32x4", + argLen: 2, + generic: true, + }, + { + name: "AddPairsInt32x8", + argLen: 2, + generic: true, + }, + { + name: "AddPairsSaturatedInt16x8", + argLen: 2, + generic: true, + }, + { + name: "AddPairsSaturatedInt16x16", + argLen: 2, + generic: true, + }, + { + name: "AddPairsUint16x8", + argLen: 2, + generic: true, + }, + { + name: "AddPairsUint16x16", + argLen: 2, + generic: true, + }, + { + name: "AddPairsUint32x4", + argLen: 2, + generic: true, + }, + { + name: "AddPairsUint32x8", + argLen: 2, + generic: true, + }, + { + name: "AddSaturatedInt8x16", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedInt8x32", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedInt8x64", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedInt16x8", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedInt16x16", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedInt16x32", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedInt8x16", + argLen: 3, + commutative: true, + 
generic: true, + }, + { + name: "AddSaturatedMaskedInt8x32", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedInt8x64", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedInt16x8", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedInt16x16", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedInt16x32", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedUint8x16", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedUint8x32", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedUint8x64", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedUint16x8", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedUint16x16", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedMaskedUint16x32", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedUint8x16", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedUint8x32", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedUint8x64", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedUint16x8", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedUint16x16", + argLen: 2, + commutative: true, + generic: true, + }, + { + name: "AddSaturatedUint16x32", + argLen: 2, + commutative: true, + generic: true, + }, { name: "AddSubFloat32x4", argLen: 2, @@ -65693,66 +65907,6 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, - { - name: "MulByPowOf2Float32x4", - argLen: 2, - generic: true, - }, - { - name: "MulByPowOf2Float32x8", - argLen: 2, - generic: true, - }, - { - name: "MulByPowOf2Float32x16", - argLen: 2, - generic: true, - }, - { - name: "MulByPowOf2Float64x2", - argLen: 2, - generic: true, - }, - { - name: "MulByPowOf2Float64x4", - argLen: 2, - generic: true, - }, - { - name: "MulByPowOf2Float64x8", - argLen: 2, - generic: true, - }, - { - name: "MulByPowOf2MaskedFloat32x4", - argLen: 3, - generic: true, - }, - { - name: "MulByPowOf2MaskedFloat32x8", - argLen: 3, - generic: true, - }, - { - name: "MulByPowOf2MaskedFloat32x16", - argLen: 3, - generic: true, - }, - { - name: "MulByPowOf2MaskedFloat64x2", - argLen: 3, - generic: true, - }, - { - name: "MulByPowOf2MaskedFloat64x4", - argLen: 3, - generic: true, - }, - { - name: "MulByPowOf2MaskedFloat64x8", - argLen: 3, - generic: true, - }, { name: "MulEvenWidenInt32x4", argLen: 2, @@ -65958,113 +66112,59 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "MulLowInt16x8", + name: "MulInt16x8", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt16x16", + name: "MulInt16x16", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt16x32", + name: "MulInt16x32", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt32x4", + name: "MulInt32x4", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt32x8", + name: "MulInt32x8", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt32x16", + name: "MulInt32x16", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt64x2", + name: "MulInt64x2", argLen: 2, commutative: true, generic: true, }, { - name: "MulLowInt64x4", + name: "MulInt64x4", argLen: 2, commutative: true, 
generic: true, }, { - name: "MulLowInt64x8", + name: "MulInt64x8", argLen: 2, commutative: true, generic: true, }, - { - name: "MulLowMaskedInt16x8", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt16x16", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt16x32", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt32x4", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt32x8", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt32x16", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt64x2", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt64x4", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "MulLowMaskedInt64x8", - argLen: 3, - commutative: true, - generic: true, - }, { name: "MulMaskedFloat32x4", argLen: 3, @@ -66101,6 +66201,60 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, + { + name: "MulMaskedInt16x8", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt16x16", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt16x32", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt32x4", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt32x8", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt32x16", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt64x2", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt64x4", + argLen: 3, + commutative: true, + generic: true, + }, + { + name: "MulMaskedInt64x8", + argLen: 3, + commutative: true, + generic: true, + }, { name: "NotEqualFloat32x4", argLen: 2, @@ -66707,126 +66861,6 @@ var opcodeTable = [...]opInfo{ argLen: 3, generic: true, }, - { - name: "PairwiseAddFloat32x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddFloat32x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddFloat64x2", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddFloat64x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddInt16x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddInt16x16", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddInt32x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddInt32x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddUint16x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddUint16x16", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddUint32x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseAddUint32x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubFloat32x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubFloat32x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubFloat64x2", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubFloat64x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubInt16x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubInt16x16", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubInt32x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubInt32x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubUint16x8", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubUint16x16", - argLen: 2, - generic: true, - }, - { - name: 
"PairwiseSubUint32x4", - argLen: 2, - generic: true, - }, - { - name: "PairwiseSubUint32x8", - argLen: 2, - generic: true, - }, { name: "Permute2Float32x4", argLen: 3, @@ -67897,290 +67931,6 @@ var opcodeTable = [...]opInfo{ argLen: 4, generic: true, }, - { - name: "SaturatedAddInt8x16", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddInt8x32", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddInt8x64", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddInt16x8", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddInt16x16", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddInt16x32", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedInt8x16", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedInt8x32", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedInt8x64", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedInt16x8", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedInt16x16", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedInt16x32", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedUint8x16", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedUint8x32", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedUint8x64", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedUint16x8", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedUint16x16", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddMaskedUint16x32", - argLen: 3, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddUint8x16", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddUint8x32", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddUint8x64", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddUint16x8", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddUint16x16", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedAddUint16x32", - argLen: 2, - commutative: true, - generic: true, - }, - { - name: "SaturatedPairwiseAddInt16x8", - argLen: 2, - generic: true, - }, - { - name: "SaturatedPairwiseAddInt16x16", - argLen: 2, - generic: true, - }, - { - name: "SaturatedPairwiseSubInt16x8", - argLen: 2, - generic: true, - }, - { - name: "SaturatedPairwiseSubInt16x16", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubInt8x16", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubInt8x32", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubInt8x64", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubInt16x8", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubInt16x16", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubInt16x32", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubMaskedInt8x16", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedInt8x32", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedInt8x64", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedInt16x8", - argLen: 3, - 
generic: true, - }, - { - name: "SaturatedSubMaskedInt16x16", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedInt16x32", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedUint8x16", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedUint8x32", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedUint8x64", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedUint16x8", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedUint16x16", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubMaskedUint16x32", - argLen: 3, - generic: true, - }, - { - name: "SaturatedSubUint8x16", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubUint8x32", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubUint8x64", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubUint16x8", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubUint16x16", - argLen: 2, - generic: true, - }, - { - name: "SaturatedSubUint16x32", - argLen: 2, - generic: true, - }, { name: "SaturatedUnsignedSignedPairDotProdMaskedUint8x16", argLen: 3, @@ -68241,6 +67991,66 @@ var opcodeTable = [...]opInfo{ argLen: 4, generic: true, }, + { + name: "ScaleFloat32x4", + argLen: 2, + generic: true, + }, + { + name: "ScaleFloat32x8", + argLen: 2, + generic: true, + }, + { + name: "ScaleFloat32x16", + argLen: 2, + generic: true, + }, + { + name: "ScaleFloat64x2", + argLen: 2, + generic: true, + }, + { + name: "ScaleFloat64x4", + argLen: 2, + generic: true, + }, + { + name: "ScaleFloat64x8", + argLen: 2, + generic: true, + }, + { + name: "ScaleMaskedFloat32x4", + argLen: 3, + generic: true, + }, + { + name: "ScaleMaskedFloat32x8", + argLen: 3, + generic: true, + }, + { + name: "ScaleMaskedFloat32x16", + argLen: 3, + generic: true, + }, + { + name: "ScaleMaskedFloat64x2", + argLen: 3, + generic: true, + }, + { + name: "ScaleMaskedFloat64x4", + argLen: 3, + generic: true, + }, + { + name: "ScaleMaskedFloat64x8", + argLen: 3, + generic: true, + }, { name: "ShiftAllLeftInt16x8", argLen: 2, @@ -69651,6 +69461,196 @@ var opcodeTable = [...]opInfo{ argLen: 3, generic: true, }, + { + name: "SubPairsFloat32x4", + argLen: 2, + generic: true, + }, + { + name: "SubPairsFloat32x8", + argLen: 2, + generic: true, + }, + { + name: "SubPairsFloat64x2", + argLen: 2, + generic: true, + }, + { + name: "SubPairsFloat64x4", + argLen: 2, + generic: true, + }, + { + name: "SubPairsInt16x8", + argLen: 2, + generic: true, + }, + { + name: "SubPairsInt16x16", + argLen: 2, + generic: true, + }, + { + name: "SubPairsInt32x4", + argLen: 2, + generic: true, + }, + { + name: "SubPairsInt32x8", + argLen: 2, + generic: true, + }, + { + name: "SubPairsSaturatedInt16x8", + argLen: 2, + generic: true, + }, + { + name: "SubPairsSaturatedInt16x16", + argLen: 2, + generic: true, + }, + { + name: "SubPairsUint16x8", + argLen: 2, + generic: true, + }, + { + name: "SubPairsUint16x16", + argLen: 2, + generic: true, + }, + { + name: "SubPairsUint32x4", + argLen: 2, + generic: true, + }, + { + name: "SubPairsUint32x8", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedInt8x16", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedInt8x32", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedInt8x64", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedInt16x8", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedInt16x16", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedInt16x32", + argLen: 2, + generic: true, + }, + { + 
name: "SubSaturatedMaskedInt8x16", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedInt8x32", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedInt8x64", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedInt16x8", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedInt16x16", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedInt16x32", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedUint8x16", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedUint8x32", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedUint8x64", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedUint16x8", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedUint16x16", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedMaskedUint16x32", + argLen: 3, + generic: true, + }, + { + name: "SubSaturatedUint8x16", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedUint8x32", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedUint8x64", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedUint16x8", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedUint16x16", + argLen: 2, + generic: true, + }, + { + name: "SubSaturatedUint16x32", + argLen: 2, + generic: true, + }, { name: "SubUint8x16", argLen: 2, @@ -69978,433 +69978,289 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "CeilWithPrecisionFloat32x4", + name: "CeilScaledFloat32x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "CeilWithPrecisionFloat32x8", + name: "CeilScaledFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "CeilWithPrecisionFloat32x16", + name: "CeilScaledFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "CeilWithPrecisionFloat64x2", + name: "CeilScaledFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "CeilWithPrecisionFloat64x4", + name: "CeilScaledFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "CeilWithPrecisionFloat64x8", + name: "CeilScaledFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "CeilWithPrecisionMaskedFloat32x4", + name: "CeilScaledMaskedFloat32x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "CeilWithPrecisionMaskedFloat32x8", + name: "CeilScaledMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "CeilWithPrecisionMaskedFloat32x16", + name: "CeilScaledMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "CeilWithPrecisionMaskedFloat64x2", + name: "CeilScaledMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "CeilWithPrecisionMaskedFloat64x4", + name: "CeilScaledMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "CeilWithPrecisionMaskedFloat64x8", + name: "CeilScaledMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithCeilWithPrecisionFloat32x4", + name: "CeilScaledResidueFloat32x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithCeilWithPrecisionFloat32x8", + name: "CeilScaledResidueFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithCeilWithPrecisionFloat32x16", + name: "CeilScaledResidueFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithCeilWithPrecisionFloat64x2", + name: "CeilScaledResidueFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithCeilWithPrecisionFloat64x4", + name: 
"CeilScaledResidueFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithCeilWithPrecisionFloat64x8", + name: "CeilScaledResidueFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithCeilWithPrecisionMaskedFloat32x4", + name: "CeilScaledResidueMaskedFloat32x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithCeilWithPrecisionMaskedFloat32x8", + name: "CeilScaledResidueMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithCeilWithPrecisionMaskedFloat32x16", + name: "CeilScaledResidueMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithCeilWithPrecisionMaskedFloat64x2", + name: "CeilScaledResidueMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithCeilWithPrecisionMaskedFloat64x4", + name: "CeilScaledResidueMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithCeilWithPrecisionMaskedFloat64x8", + name: "CeilScaledResidueMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithFloorWithPrecisionFloat32x4", + name: "FloorScaledFloat32x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithFloorWithPrecisionFloat32x8", + name: "FloorScaledFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithFloorWithPrecisionFloat32x16", + name: "FloorScaledFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithFloorWithPrecisionFloat64x2", + name: "FloorScaledFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithFloorWithPrecisionFloat64x4", + name: "FloorScaledFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithFloorWithPrecisionFloat64x8", + name: "FloorScaledFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithFloorWithPrecisionMaskedFloat32x4", + name: "FloorScaledMaskedFloat32x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithFloorWithPrecisionMaskedFloat32x8", + name: "FloorScaledMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithFloorWithPrecisionMaskedFloat32x16", + name: "FloorScaledMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithFloorWithPrecisionMaskedFloat64x2", + name: "FloorScaledMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithFloorWithPrecisionMaskedFloat64x4", + name: "FloorScaledMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithFloorWithPrecisionMaskedFloat64x8", + name: "FloorScaledMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithRoundWithPrecisionFloat32x4", + name: "FloorScaledResidueFloat32x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithRoundWithPrecisionFloat32x8", + name: "FloorScaledResidueFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithRoundWithPrecisionFloat32x16", + name: "FloorScaledResidueFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithRoundWithPrecisionFloat64x2", + name: "FloorScaledResidueFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithRoundWithPrecisionFloat64x4", + name: "FloorScaledResidueFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "DiffWithRoundWithPrecisionFloat64x8", + name: "FloorScaledResidueFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: 
"DiffWithRoundWithPrecisionMaskedFloat32x4", + name: "FloorScaledResidueMaskedFloat32x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithRoundWithPrecisionMaskedFloat32x8", + name: "FloorScaledResidueMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithRoundWithPrecisionMaskedFloat32x16", + name: "FloorScaledResidueMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithRoundWithPrecisionMaskedFloat64x2", + name: "FloorScaledResidueMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithRoundWithPrecisionMaskedFloat64x4", + name: "FloorScaledResidueMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "DiffWithRoundWithPrecisionMaskedFloat64x8", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionFloat32x4", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionFloat32x8", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionFloat32x16", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionFloat64x2", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionFloat64x4", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionFloat64x8", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionMaskedFloat32x4", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionMaskedFloat32x8", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionMaskedFloat32x16", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionMaskedFloat64x2", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionMaskedFloat64x4", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "DiffWithTruncWithPrecisionMaskedFloat64x8", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "FloorWithPrecisionFloat32x4", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "FloorWithPrecisionFloat32x8", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "FloorWithPrecisionFloat32x16", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "FloorWithPrecisionFloat64x2", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "FloorWithPrecisionFloat64x4", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "FloorWithPrecisionFloat64x8", - auxType: auxInt8, - argLen: 1, - generic: true, - }, - { - name: "FloorWithPrecisionMaskedFloat32x4", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "FloorWithPrecisionMaskedFloat32x8", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "FloorWithPrecisionMaskedFloat32x16", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "FloorWithPrecisionMaskedFloat64x2", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "FloorWithPrecisionMaskedFloat64x4", - auxType: auxInt8, - argLen: 2, - generic: true, - }, - { - name: "FloorWithPrecisionMaskedFloat64x8", + name: "FloorScaledResidueMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, @@ -70878,73 +70734,145 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "RoundWithPrecisionFloat32x4", + name: "RoundScaledFloat32x4", auxType: 
auxInt8, argLen: 1, generic: true, }, { - name: "RoundWithPrecisionFloat32x8", + name: "RoundScaledFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundWithPrecisionFloat32x16", + name: "RoundScaledFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundWithPrecisionFloat64x2", + name: "RoundScaledFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundWithPrecisionFloat64x4", + name: "RoundScaledFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundWithPrecisionFloat64x8", + name: "RoundScaledFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "RoundWithPrecisionMaskedFloat32x4", + name: "RoundScaledMaskedFloat32x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundWithPrecisionMaskedFloat32x8", + name: "RoundScaledMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundWithPrecisionMaskedFloat32x16", + name: "RoundScaledMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundWithPrecisionMaskedFloat64x2", + name: "RoundScaledMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundWithPrecisionMaskedFloat64x4", + name: "RoundScaledMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "RoundWithPrecisionMaskedFloat64x8", + name: "RoundScaledMaskedFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundScaledResidueFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundScaledResidueFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundScaledResidueFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundScaledResidueFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundScaledResidueFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundScaledResidueFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "RoundScaledResidueMaskedFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundScaledResidueMaskedFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundScaledResidueMaskedFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundScaledResidueMaskedFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundScaledResidueMaskedFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "RoundScaledResidueMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, @@ -71490,73 +71418,145 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "TruncWithPrecisionFloat32x4", + name: "TruncScaledFloat32x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "TruncWithPrecisionFloat32x8", + name: "TruncScaledFloat32x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "TruncWithPrecisionFloat32x16", + name: "TruncScaledFloat32x16", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "TruncWithPrecisionFloat64x2", + name: "TruncScaledFloat64x2", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "TruncWithPrecisionFloat64x4", + name: "TruncScaledFloat64x4", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "TruncWithPrecisionFloat64x8", + name: "TruncScaledFloat64x8", auxType: auxInt8, argLen: 1, generic: true, }, { - name: "TruncWithPrecisionMaskedFloat32x4", + name: "TruncScaledMaskedFloat32x4", auxType: auxInt8, argLen: 
2, generic: true, }, { - name: "TruncWithPrecisionMaskedFloat32x8", + name: "TruncScaledMaskedFloat32x8", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "TruncWithPrecisionMaskedFloat32x16", + name: "TruncScaledMaskedFloat32x16", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "TruncWithPrecisionMaskedFloat64x2", + name: "TruncScaledMaskedFloat64x2", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "TruncWithPrecisionMaskedFloat64x4", + name: "TruncScaledMaskedFloat64x4", auxType: auxInt8, argLen: 2, generic: true, }, { - name: "TruncWithPrecisionMaskedFloat64x8", + name: "TruncScaledMaskedFloat64x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "TruncScaledResidueFloat32x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncScaledResidueFloat32x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncScaledResidueFloat32x16", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncScaledResidueFloat64x2", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncScaledResidueFloat64x4", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncScaledResidueFloat64x8", + auxType: auxInt8, + argLen: 1, + generic: true, + }, + { + name: "TruncScaledResidueMaskedFloat32x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "TruncScaledResidueMaskedFloat32x8", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "TruncScaledResidueMaskedFloat32x16", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "TruncScaledResidueMaskedFloat64x2", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "TruncScaledResidueMaskedFloat64x4", + auxType: auxInt8, + argLen: 2, + generic: true, + }, + { + name: "TruncScaledResidueMaskedFloat64x8", auxType: auxInt8, argLen: 2, generic: true, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 82f13b43c6e..a3a7ba7ed65 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -760,9 +760,111 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAddMaskedUint8x32(v) case OpAddMaskedUint8x64: return rewriteValueAMD64_OpAddMaskedUint8x64(v) + case OpAddPairsFloat32x4: + v.Op = OpAMD64VHADDPS128 + return true + case OpAddPairsFloat32x8: + v.Op = OpAMD64VHADDPS256 + return true + case OpAddPairsFloat64x2: + v.Op = OpAMD64VHADDPD128 + return true + case OpAddPairsFloat64x4: + v.Op = OpAMD64VHADDPD256 + return true + case OpAddPairsInt16x16: + v.Op = OpAMD64VPHADDW256 + return true + case OpAddPairsInt16x8: + v.Op = OpAMD64VPHADDW128 + return true + case OpAddPairsInt32x4: + v.Op = OpAMD64VPHADDD128 + return true + case OpAddPairsInt32x8: + v.Op = OpAMD64VPHADDD256 + return true + case OpAddPairsSaturatedInt16x16: + v.Op = OpAMD64VPHADDSW256 + return true + case OpAddPairsSaturatedInt16x8: + v.Op = OpAMD64VPHADDSW128 + return true + case OpAddPairsUint16x16: + v.Op = OpAMD64VPHADDW256 + return true + case OpAddPairsUint16x8: + v.Op = OpAMD64VPHADDW128 + return true + case OpAddPairsUint32x4: + v.Op = OpAMD64VPHADDD128 + return true + case OpAddPairsUint32x8: + v.Op = OpAMD64VPHADDD256 + return true case OpAddPtr: v.Op = OpAMD64ADDQ return true + case OpAddSaturatedInt16x16: + v.Op = OpAMD64VPADDSW256 + return true + case OpAddSaturatedInt16x32: + v.Op = OpAMD64VPADDSW512 + return true + case OpAddSaturatedInt16x8: + v.Op = OpAMD64VPADDSW128 + return 
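// --- Editorial sketch (Go, illustration only; not generated code): the
// AddPairs* cases above are the renamed PairwiseAdd* ops, lowered to
// VHADDPS/VHADDPD and VPHADDW/VPHADDD. For the 128-bit integer form the
// per-lane behavior is as follows; the 256-bit forms apply the same
// pattern independently within each 128-bit half.
func addPairsInt32x4(x, y [4]int32) [4]int32 {
	// VPHADDD xmm: low half from adjacent pairs of x, high half from y.
	return [4]int32{x[0] + x[1], x[2] + x[3], y[0] + y[1], y[2] + y[3]}
}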
true + case OpAddSaturatedInt8x16: + v.Op = OpAMD64VPADDSB128 + return true + case OpAddSaturatedInt8x32: + v.Op = OpAMD64VPADDSB256 + return true + case OpAddSaturatedInt8x64: + v.Op = OpAMD64VPADDSB512 + return true + case OpAddSaturatedMaskedInt16x16: + return rewriteValueAMD64_OpAddSaturatedMaskedInt16x16(v) + case OpAddSaturatedMaskedInt16x32: + return rewriteValueAMD64_OpAddSaturatedMaskedInt16x32(v) + case OpAddSaturatedMaskedInt16x8: + return rewriteValueAMD64_OpAddSaturatedMaskedInt16x8(v) + case OpAddSaturatedMaskedInt8x16: + return rewriteValueAMD64_OpAddSaturatedMaskedInt8x16(v) + case OpAddSaturatedMaskedInt8x32: + return rewriteValueAMD64_OpAddSaturatedMaskedInt8x32(v) + case OpAddSaturatedMaskedInt8x64: + return rewriteValueAMD64_OpAddSaturatedMaskedInt8x64(v) + case OpAddSaturatedMaskedUint16x16: + return rewriteValueAMD64_OpAddSaturatedMaskedUint16x16(v) + case OpAddSaturatedMaskedUint16x32: + return rewriteValueAMD64_OpAddSaturatedMaskedUint16x32(v) + case OpAddSaturatedMaskedUint16x8: + return rewriteValueAMD64_OpAddSaturatedMaskedUint16x8(v) + case OpAddSaturatedMaskedUint8x16: + return rewriteValueAMD64_OpAddSaturatedMaskedUint8x16(v) + case OpAddSaturatedMaskedUint8x32: + return rewriteValueAMD64_OpAddSaturatedMaskedUint8x32(v) + case OpAddSaturatedMaskedUint8x64: + return rewriteValueAMD64_OpAddSaturatedMaskedUint8x64(v) + case OpAddSaturatedUint16x16: + v.Op = OpAMD64VPADDSW256 + return true + case OpAddSaturatedUint16x32: + v.Op = OpAMD64VPADDSW512 + return true + case OpAddSaturatedUint16x8: + v.Op = OpAMD64VPADDSW128 + return true + case OpAddSaturatedUint8x16: + v.Op = OpAMD64VPADDSB128 + return true + case OpAddSaturatedUint8x32: + v.Op = OpAMD64VPADDSB256 + return true + case OpAddSaturatedUint8x64: + v.Op = OpAMD64VPADDSB512 + return true case OpAddSubFloat32x4: v.Op = OpAMD64VADDSUBPS128 return true @@ -1185,30 +1287,54 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpCeilFloat64x2(v) case OpCeilFloat64x4: return rewriteValueAMD64_OpCeilFloat64x4(v) - case OpCeilWithPrecisionFloat32x16: - return rewriteValueAMD64_OpCeilWithPrecisionFloat32x16(v) - case OpCeilWithPrecisionFloat32x4: - return rewriteValueAMD64_OpCeilWithPrecisionFloat32x4(v) - case OpCeilWithPrecisionFloat32x8: - return rewriteValueAMD64_OpCeilWithPrecisionFloat32x8(v) - case OpCeilWithPrecisionFloat64x2: - return rewriteValueAMD64_OpCeilWithPrecisionFloat64x2(v) - case OpCeilWithPrecisionFloat64x4: - return rewriteValueAMD64_OpCeilWithPrecisionFloat64x4(v) - case OpCeilWithPrecisionFloat64x8: - return rewriteValueAMD64_OpCeilWithPrecisionFloat64x8(v) - case OpCeilWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x16(v) - case OpCeilWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x4(v) - case OpCeilWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x8(v) - case OpCeilWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x2(v) - case OpCeilWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x4(v) - case OpCeilWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x8(v) + case OpCeilScaledFloat32x16: + return rewriteValueAMD64_OpCeilScaledFloat32x16(v) + case OpCeilScaledFloat32x4: + return rewriteValueAMD64_OpCeilScaledFloat32x4(v) + case OpCeilScaledFloat32x8: + return rewriteValueAMD64_OpCeilScaledFloat32x8(v) + case OpCeilScaledFloat64x2: + return 
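// --- Editorial sketch (Go, illustration only): the AddSaturated* ops
// (formerly SaturatedAdd*) lower to VPADDSB/VPADDSW, whose per-lane
// behavior is signed saturating addition: the wide sum is clamped to the
// element range instead of wrapping.
func addSaturatedInt16(x, y int16) int16 {
	s := int32(x) + int32(y) // widen first so the sum itself cannot wrap
	if s > 32767 {
		return 32767 // clamp to math.MaxInt16
	}
	if s < -32768 {
		return -32768 // clamp to math.MinInt16
	}
	return int16(s)
}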
rewriteValueAMD64_OpCeilScaledFloat64x2(v) + case OpCeilScaledFloat64x4: + return rewriteValueAMD64_OpCeilScaledFloat64x4(v) + case OpCeilScaledFloat64x8: + return rewriteValueAMD64_OpCeilScaledFloat64x8(v) + case OpCeilScaledMaskedFloat32x16: + return rewriteValueAMD64_OpCeilScaledMaskedFloat32x16(v) + case OpCeilScaledMaskedFloat32x4: + return rewriteValueAMD64_OpCeilScaledMaskedFloat32x4(v) + case OpCeilScaledMaskedFloat32x8: + return rewriteValueAMD64_OpCeilScaledMaskedFloat32x8(v) + case OpCeilScaledMaskedFloat64x2: + return rewriteValueAMD64_OpCeilScaledMaskedFloat64x2(v) + case OpCeilScaledMaskedFloat64x4: + return rewriteValueAMD64_OpCeilScaledMaskedFloat64x4(v) + case OpCeilScaledMaskedFloat64x8: + return rewriteValueAMD64_OpCeilScaledMaskedFloat64x8(v) + case OpCeilScaledResidueFloat32x16: + return rewriteValueAMD64_OpCeilScaledResidueFloat32x16(v) + case OpCeilScaledResidueFloat32x4: + return rewriteValueAMD64_OpCeilScaledResidueFloat32x4(v) + case OpCeilScaledResidueFloat32x8: + return rewriteValueAMD64_OpCeilScaledResidueFloat32x8(v) + case OpCeilScaledResidueFloat64x2: + return rewriteValueAMD64_OpCeilScaledResidueFloat64x2(v) + case OpCeilScaledResidueFloat64x4: + return rewriteValueAMD64_OpCeilScaledResidueFloat64x4(v) + case OpCeilScaledResidueFloat64x8: + return rewriteValueAMD64_OpCeilScaledResidueFloat64x8(v) + case OpCeilScaledResidueMaskedFloat32x16: + return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x16(v) + case OpCeilScaledResidueMaskedFloat32x4: + return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x4(v) + case OpCeilScaledResidueMaskedFloat32x8: + return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x8(v) + case OpCeilScaledResidueMaskedFloat64x2: + return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x2(v) + case OpCeilScaledResidueMaskedFloat64x4: + return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x4(v) + case OpCeilScaledResidueMaskedFloat64x8: + return rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x8(v) case OpClosureCall: v.Op = OpAMD64CALLclosure return true @@ -1409,102 +1535,6 @@ func rewriteValueAMD64(v *Value) bool { case OpCvtBoolToUint8: v.Op = OpCopy return true - case OpDiffWithCeilWithPrecisionFloat32x16: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x16(v) - case OpDiffWithCeilWithPrecisionFloat32x4: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x4(v) - case OpDiffWithCeilWithPrecisionFloat32x8: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x8(v) - case OpDiffWithCeilWithPrecisionFloat64x2: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x2(v) - case OpDiffWithCeilWithPrecisionFloat64x4: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x4(v) - case OpDiffWithCeilWithPrecisionFloat64x8: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x8(v) - case OpDiffWithCeilWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x16(v) - case OpDiffWithCeilWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x4(v) - case OpDiffWithCeilWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x8(v) - case OpDiffWithCeilWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x2(v) - case OpDiffWithCeilWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x4(v) - case OpDiffWithCeilWithPrecisionMaskedFloat64x8: - return 
rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x8(v) - case OpDiffWithFloorWithPrecisionFloat32x16: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x16(v) - case OpDiffWithFloorWithPrecisionFloat32x4: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x4(v) - case OpDiffWithFloorWithPrecisionFloat32x8: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x8(v) - case OpDiffWithFloorWithPrecisionFloat64x2: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x2(v) - case OpDiffWithFloorWithPrecisionFloat64x4: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x4(v) - case OpDiffWithFloorWithPrecisionFloat64x8: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x8(v) - case OpDiffWithFloorWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x16(v) - case OpDiffWithFloorWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x4(v) - case OpDiffWithFloorWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x8(v) - case OpDiffWithFloorWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x2(v) - case OpDiffWithFloorWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x4(v) - case OpDiffWithFloorWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x8(v) - case OpDiffWithRoundWithPrecisionFloat32x16: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x16(v) - case OpDiffWithRoundWithPrecisionFloat32x4: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x4(v) - case OpDiffWithRoundWithPrecisionFloat32x8: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x8(v) - case OpDiffWithRoundWithPrecisionFloat64x2: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x2(v) - case OpDiffWithRoundWithPrecisionFloat64x4: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x4(v) - case OpDiffWithRoundWithPrecisionFloat64x8: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x8(v) - case OpDiffWithRoundWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x16(v) - case OpDiffWithRoundWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x4(v) - case OpDiffWithRoundWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x8(v) - case OpDiffWithRoundWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x2(v) - case OpDiffWithRoundWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x4(v) - case OpDiffWithRoundWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x8(v) - case OpDiffWithTruncWithPrecisionFloat32x16: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x16(v) - case OpDiffWithTruncWithPrecisionFloat32x4: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x4(v) - case OpDiffWithTruncWithPrecisionFloat32x8: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x8(v) - case OpDiffWithTruncWithPrecisionFloat64x2: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x2(v) - case OpDiffWithTruncWithPrecisionFloat64x4: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x4(v) - case 
OpDiffWithTruncWithPrecisionFloat64x8: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x8(v) - case OpDiffWithTruncWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x16(v) - case OpDiffWithTruncWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x4(v) - case OpDiffWithTruncWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x8(v) - case OpDiffWithTruncWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x2(v) - case OpDiffWithTruncWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x4(v) - case OpDiffWithTruncWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x8(v) case OpDiv128u: v.Op = OpAMD64DIVQU2 return true @@ -1730,30 +1760,54 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpFloorFloat64x2(v) case OpFloorFloat64x4: return rewriteValueAMD64_OpFloorFloat64x4(v) - case OpFloorWithPrecisionFloat32x16: - return rewriteValueAMD64_OpFloorWithPrecisionFloat32x16(v) - case OpFloorWithPrecisionFloat32x4: - return rewriteValueAMD64_OpFloorWithPrecisionFloat32x4(v) - case OpFloorWithPrecisionFloat32x8: - return rewriteValueAMD64_OpFloorWithPrecisionFloat32x8(v) - case OpFloorWithPrecisionFloat64x2: - return rewriteValueAMD64_OpFloorWithPrecisionFloat64x2(v) - case OpFloorWithPrecisionFloat64x4: - return rewriteValueAMD64_OpFloorWithPrecisionFloat64x4(v) - case OpFloorWithPrecisionFloat64x8: - return rewriteValueAMD64_OpFloorWithPrecisionFloat64x8(v) - case OpFloorWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x16(v) - case OpFloorWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x4(v) - case OpFloorWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x8(v) - case OpFloorWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x2(v) - case OpFloorWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x4(v) - case OpFloorWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x8(v) + case OpFloorScaledFloat32x16: + return rewriteValueAMD64_OpFloorScaledFloat32x16(v) + case OpFloorScaledFloat32x4: + return rewriteValueAMD64_OpFloorScaledFloat32x4(v) + case OpFloorScaledFloat32x8: + return rewriteValueAMD64_OpFloorScaledFloat32x8(v) + case OpFloorScaledFloat64x2: + return rewriteValueAMD64_OpFloorScaledFloat64x2(v) + case OpFloorScaledFloat64x4: + return rewriteValueAMD64_OpFloorScaledFloat64x4(v) + case OpFloorScaledFloat64x8: + return rewriteValueAMD64_OpFloorScaledFloat64x8(v) + case OpFloorScaledMaskedFloat32x16: + return rewriteValueAMD64_OpFloorScaledMaskedFloat32x16(v) + case OpFloorScaledMaskedFloat32x4: + return rewriteValueAMD64_OpFloorScaledMaskedFloat32x4(v) + case OpFloorScaledMaskedFloat32x8: + return rewriteValueAMD64_OpFloorScaledMaskedFloat32x8(v) + case OpFloorScaledMaskedFloat64x2: + return rewriteValueAMD64_OpFloorScaledMaskedFloat64x2(v) + case OpFloorScaledMaskedFloat64x4: + return rewriteValueAMD64_OpFloorScaledMaskedFloat64x4(v) + case OpFloorScaledMaskedFloat64x8: + return rewriteValueAMD64_OpFloorScaledMaskedFloat64x8(v) + case OpFloorScaledResidueFloat32x16: + return rewriteValueAMD64_OpFloorScaledResidueFloat32x16(v) + case 
OpFloorScaledResidueFloat32x4: + return rewriteValueAMD64_OpFloorScaledResidueFloat32x4(v) + case OpFloorScaledResidueFloat32x8: + return rewriteValueAMD64_OpFloorScaledResidueFloat32x8(v) + case OpFloorScaledResidueFloat64x2: + return rewriteValueAMD64_OpFloorScaledResidueFloat64x2(v) + case OpFloorScaledResidueFloat64x4: + return rewriteValueAMD64_OpFloorScaledResidueFloat64x4(v) + case OpFloorScaledResidueFloat64x8: + return rewriteValueAMD64_OpFloorScaledResidueFloat64x8(v) + case OpFloorScaledResidueMaskedFloat32x16: + return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x16(v) + case OpFloorScaledResidueMaskedFloat32x4: + return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x4(v) + case OpFloorScaledResidueMaskedFloat32x8: + return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x8(v) + case OpFloorScaledResidueMaskedFloat64x2: + return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x2(v) + case OpFloorScaledResidueMaskedFloat64x4: + return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x4(v) + case OpFloorScaledResidueMaskedFloat64x8: + return rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x8(v) case OpFusedMultiplyAddFloat32x16: v.Op = OpAMD64VFMADD213PS512 return true @@ -2944,36 +2998,6 @@ func rewriteValueAMD64(v *Value) bool { case OpMul8: v.Op = OpAMD64MULL return true - case OpMulByPowOf2Float32x16: - v.Op = OpAMD64VSCALEFPS512 - return true - case OpMulByPowOf2Float32x4: - v.Op = OpAMD64VSCALEFPS128 - return true - case OpMulByPowOf2Float32x8: - v.Op = OpAMD64VSCALEFPS256 - return true - case OpMulByPowOf2Float64x2: - v.Op = OpAMD64VSCALEFPD128 - return true - case OpMulByPowOf2Float64x4: - v.Op = OpAMD64VSCALEFPD256 - return true - case OpMulByPowOf2Float64x8: - v.Op = OpAMD64VSCALEFPD512 - return true - case OpMulByPowOf2MaskedFloat32x16: - return rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x16(v) - case OpMulByPowOf2MaskedFloat32x4: - return rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x4(v) - case OpMulByPowOf2MaskedFloat32x8: - return rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x8(v) - case OpMulByPowOf2MaskedFloat64x2: - return rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x2(v) - case OpMulByPowOf2MaskedFloat64x4: - return rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x4(v) - case OpMulByPowOf2MaskedFloat64x8: - return rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x8(v) case OpMulEvenWidenInt32x4: v.Op = OpAMD64VPMULDQ128 return true @@ -3064,51 +3088,33 @@ func rewriteValueAMD64(v *Value) bool { case OpMulHighUint16x8: v.Op = OpAMD64VPMULHUW128 return true - case OpMulLowInt16x16: + case OpMulInt16x16: v.Op = OpAMD64VPMULLW256 return true - case OpMulLowInt16x32: + case OpMulInt16x32: v.Op = OpAMD64VPMULLW512 return true - case OpMulLowInt16x8: + case OpMulInt16x8: v.Op = OpAMD64VPMULLW128 return true - case OpMulLowInt32x16: + case OpMulInt32x16: v.Op = OpAMD64VPMULLD512 return true - case OpMulLowInt32x4: + case OpMulInt32x4: v.Op = OpAMD64VPMULLD128 return true - case OpMulLowInt32x8: + case OpMulInt32x8: v.Op = OpAMD64VPMULLD256 return true - case OpMulLowInt64x2: + case OpMulInt64x2: v.Op = OpAMD64VPMULLQ128 return true - case OpMulLowInt64x4: + case OpMulInt64x4: v.Op = OpAMD64VPMULLQ256 return true - case OpMulLowInt64x8: + case OpMulInt64x8: v.Op = OpAMD64VPMULLQ512 return true - case OpMulLowMaskedInt16x16: - return rewriteValueAMD64_OpMulLowMaskedInt16x16(v) - case OpMulLowMaskedInt16x32: - return rewriteValueAMD64_OpMulLowMaskedInt16x32(v) - case OpMulLowMaskedInt16x8: - return rewriteValueAMD64_OpMulLowMaskedInt16x8(v) - case 
OpMulLowMaskedInt32x16: - return rewriteValueAMD64_OpMulLowMaskedInt32x16(v) - case OpMulLowMaskedInt32x4: - return rewriteValueAMD64_OpMulLowMaskedInt32x4(v) - case OpMulLowMaskedInt32x8: - return rewriteValueAMD64_OpMulLowMaskedInt32x8(v) - case OpMulLowMaskedInt64x2: - return rewriteValueAMD64_OpMulLowMaskedInt64x2(v) - case OpMulLowMaskedInt64x4: - return rewriteValueAMD64_OpMulLowMaskedInt64x4(v) - case OpMulLowMaskedInt64x8: - return rewriteValueAMD64_OpMulLowMaskedInt64x8(v) case OpMulMaskedFloat32x16: return rewriteValueAMD64_OpMulMaskedFloat32x16(v) case OpMulMaskedFloat32x4: @@ -3121,6 +3127,24 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpMulMaskedFloat64x4(v) case OpMulMaskedFloat64x8: return rewriteValueAMD64_OpMulMaskedFloat64x8(v) + case OpMulMaskedInt16x16: + return rewriteValueAMD64_OpMulMaskedInt16x16(v) + case OpMulMaskedInt16x32: + return rewriteValueAMD64_OpMulMaskedInt16x32(v) + case OpMulMaskedInt16x8: + return rewriteValueAMD64_OpMulMaskedInt16x8(v) + case OpMulMaskedInt32x16: + return rewriteValueAMD64_OpMulMaskedInt32x16(v) + case OpMulMaskedInt32x4: + return rewriteValueAMD64_OpMulMaskedInt32x4(v) + case OpMulMaskedInt32x8: + return rewriteValueAMD64_OpMulMaskedInt32x8(v) + case OpMulMaskedInt64x2: + return rewriteValueAMD64_OpMulMaskedInt64x2(v) + case OpMulMaskedInt64x4: + return rewriteValueAMD64_OpMulMaskedInt64x4(v) + case OpMulMaskedInt64x8: + return rewriteValueAMD64_OpMulMaskedInt64x8(v) case OpNeg16: v.Op = OpAMD64NEGL return true @@ -3406,78 +3430,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpPairDotProdMaskedInt16x32(v) case OpPairDotProdMaskedInt16x8: return rewriteValueAMD64_OpPairDotProdMaskedInt16x8(v) - case OpPairwiseAddFloat32x4: - v.Op = OpAMD64VHADDPS128 - return true - case OpPairwiseAddFloat32x8: - v.Op = OpAMD64VHADDPS256 - return true - case OpPairwiseAddFloat64x2: - v.Op = OpAMD64VHADDPD128 - return true - case OpPairwiseAddFloat64x4: - v.Op = OpAMD64VHADDPD256 - return true - case OpPairwiseAddInt16x16: - v.Op = OpAMD64VPHADDW256 - return true - case OpPairwiseAddInt16x8: - v.Op = OpAMD64VPHADDW128 - return true - case OpPairwiseAddInt32x4: - v.Op = OpAMD64VPHADDD128 - return true - case OpPairwiseAddInt32x8: - v.Op = OpAMD64VPHADDD256 - return true - case OpPairwiseAddUint16x16: - v.Op = OpAMD64VPHADDW256 - return true - case OpPairwiseAddUint16x8: - v.Op = OpAMD64VPHADDW128 - return true - case OpPairwiseAddUint32x4: - v.Op = OpAMD64VPHADDD128 - return true - case OpPairwiseAddUint32x8: - v.Op = OpAMD64VPHADDD256 - return true - case OpPairwiseSubFloat32x4: - v.Op = OpAMD64VHSUBPS128 - return true - case OpPairwiseSubFloat32x8: - v.Op = OpAMD64VHSUBPS256 - return true - case OpPairwiseSubFloat64x2: - v.Op = OpAMD64VHSUBPD128 - return true - case OpPairwiseSubFloat64x4: - v.Op = OpAMD64VHSUBPD256 - return true - case OpPairwiseSubInt16x16: - v.Op = OpAMD64VPHSUBW256 - return true - case OpPairwiseSubInt16x8: - v.Op = OpAMD64VPHSUBW128 - return true - case OpPairwiseSubInt32x4: - v.Op = OpAMD64VPHSUBD128 - return true - case OpPairwiseSubInt32x8: - v.Op = OpAMD64VPHSUBD256 - return true - case OpPairwiseSubUint16x16: - v.Op = OpAMD64VPHSUBW256 - return true - case OpPairwiseSubUint16x8: - v.Op = OpAMD64VPHSUBW128 - return true - case OpPairwiseSubUint32x4: - v.Op = OpAMD64VPHSUBD128 - return true - case OpPairwiseSubUint32x8: - v.Op = OpAMD64VPHSUBD256 - return true case OpPanicBounds: return rewriteValueAMD64_OpPanicBounds(v) case OpPermute2Float32x16: @@ -4152,32 +4104,56 @@ func 
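// --- Editorial sketch (Go, illustration only): the MulLow* ops are renamed
// to plain Mul* here; the VPMULL{W,D,Q} family keeps only the low half of
// the double-width product, which is ordinary wrap-around multiplication at
// the element width.
func mulInt16(x, y int16) int16 {
	return int16(int32(x) * int32(y)) // truncate the 32-bit product to 16 bits
}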
rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpRoundFloat64x2(v) case OpRoundFloat64x4: return rewriteValueAMD64_OpRoundFloat64x4(v) + case OpRoundScaledFloat32x16: + return rewriteValueAMD64_OpRoundScaledFloat32x16(v) + case OpRoundScaledFloat32x4: + return rewriteValueAMD64_OpRoundScaledFloat32x4(v) + case OpRoundScaledFloat32x8: + return rewriteValueAMD64_OpRoundScaledFloat32x8(v) + case OpRoundScaledFloat64x2: + return rewriteValueAMD64_OpRoundScaledFloat64x2(v) + case OpRoundScaledFloat64x4: + return rewriteValueAMD64_OpRoundScaledFloat64x4(v) + case OpRoundScaledFloat64x8: + return rewriteValueAMD64_OpRoundScaledFloat64x8(v) + case OpRoundScaledMaskedFloat32x16: + return rewriteValueAMD64_OpRoundScaledMaskedFloat32x16(v) + case OpRoundScaledMaskedFloat32x4: + return rewriteValueAMD64_OpRoundScaledMaskedFloat32x4(v) + case OpRoundScaledMaskedFloat32x8: + return rewriteValueAMD64_OpRoundScaledMaskedFloat32x8(v) + case OpRoundScaledMaskedFloat64x2: + return rewriteValueAMD64_OpRoundScaledMaskedFloat64x2(v) + case OpRoundScaledMaskedFloat64x4: + return rewriteValueAMD64_OpRoundScaledMaskedFloat64x4(v) + case OpRoundScaledMaskedFloat64x8: + return rewriteValueAMD64_OpRoundScaledMaskedFloat64x8(v) + case OpRoundScaledResidueFloat32x16: + return rewriteValueAMD64_OpRoundScaledResidueFloat32x16(v) + case OpRoundScaledResidueFloat32x4: + return rewriteValueAMD64_OpRoundScaledResidueFloat32x4(v) + case OpRoundScaledResidueFloat32x8: + return rewriteValueAMD64_OpRoundScaledResidueFloat32x8(v) + case OpRoundScaledResidueFloat64x2: + return rewriteValueAMD64_OpRoundScaledResidueFloat64x2(v) + case OpRoundScaledResidueFloat64x4: + return rewriteValueAMD64_OpRoundScaledResidueFloat64x4(v) + case OpRoundScaledResidueFloat64x8: + return rewriteValueAMD64_OpRoundScaledResidueFloat64x8(v) + case OpRoundScaledResidueMaskedFloat32x16: + return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x16(v) + case OpRoundScaledResidueMaskedFloat32x4: + return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x4(v) + case OpRoundScaledResidueMaskedFloat32x8: + return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x8(v) + case OpRoundScaledResidueMaskedFloat64x2: + return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x2(v) + case OpRoundScaledResidueMaskedFloat64x4: + return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x4(v) + case OpRoundScaledResidueMaskedFloat64x8: + return rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x8(v) case OpRoundToEven: return rewriteValueAMD64_OpRoundToEven(v) - case OpRoundWithPrecisionFloat32x16: - return rewriteValueAMD64_OpRoundWithPrecisionFloat32x16(v) - case OpRoundWithPrecisionFloat32x4: - return rewriteValueAMD64_OpRoundWithPrecisionFloat32x4(v) - case OpRoundWithPrecisionFloat32x8: - return rewriteValueAMD64_OpRoundWithPrecisionFloat32x8(v) - case OpRoundWithPrecisionFloat64x2: - return rewriteValueAMD64_OpRoundWithPrecisionFloat64x2(v) - case OpRoundWithPrecisionFloat64x4: - return rewriteValueAMD64_OpRoundWithPrecisionFloat64x4(v) - case OpRoundWithPrecisionFloat64x8: - return rewriteValueAMD64_OpRoundWithPrecisionFloat64x8(v) - case OpRoundWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x16(v) - case OpRoundWithPrecisionMaskedFloat32x4: - return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x4(v) - case OpRoundWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x8(v) - case OpRoundWithPrecisionMaskedFloat64x2: - return 
rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x2(v) - case OpRoundWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x4(v) - case OpRoundWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x8(v) case OpRsh16Ux16: return rewriteValueAMD64_OpRsh16Ux16(v) case OpRsh16Ux32: @@ -4257,138 +4233,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x4(v) case OpSaturatedAddDotProdMaskedInt32x8: return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v) - case OpSaturatedAddInt16x16: - v.Op = OpAMD64VPADDSW256 - return true - case OpSaturatedAddInt16x32: - v.Op = OpAMD64VPADDSW512 - return true - case OpSaturatedAddInt16x8: - v.Op = OpAMD64VPADDSW128 - return true - case OpSaturatedAddInt8x16: - v.Op = OpAMD64VPADDSB128 - return true - case OpSaturatedAddInt8x32: - v.Op = OpAMD64VPADDSB256 - return true - case OpSaturatedAddInt8x64: - v.Op = OpAMD64VPADDSB512 - return true - case OpSaturatedAddMaskedInt16x16: - return rewriteValueAMD64_OpSaturatedAddMaskedInt16x16(v) - case OpSaturatedAddMaskedInt16x32: - return rewriteValueAMD64_OpSaturatedAddMaskedInt16x32(v) - case OpSaturatedAddMaskedInt16x8: - return rewriteValueAMD64_OpSaturatedAddMaskedInt16x8(v) - case OpSaturatedAddMaskedInt8x16: - return rewriteValueAMD64_OpSaturatedAddMaskedInt8x16(v) - case OpSaturatedAddMaskedInt8x32: - return rewriteValueAMD64_OpSaturatedAddMaskedInt8x32(v) - case OpSaturatedAddMaskedInt8x64: - return rewriteValueAMD64_OpSaturatedAddMaskedInt8x64(v) - case OpSaturatedAddMaskedUint16x16: - return rewriteValueAMD64_OpSaturatedAddMaskedUint16x16(v) - case OpSaturatedAddMaskedUint16x32: - return rewriteValueAMD64_OpSaturatedAddMaskedUint16x32(v) - case OpSaturatedAddMaskedUint16x8: - return rewriteValueAMD64_OpSaturatedAddMaskedUint16x8(v) - case OpSaturatedAddMaskedUint8x16: - return rewriteValueAMD64_OpSaturatedAddMaskedUint8x16(v) - case OpSaturatedAddMaskedUint8x32: - return rewriteValueAMD64_OpSaturatedAddMaskedUint8x32(v) - case OpSaturatedAddMaskedUint8x64: - return rewriteValueAMD64_OpSaturatedAddMaskedUint8x64(v) - case OpSaturatedAddUint16x16: - v.Op = OpAMD64VPADDSW256 - return true - case OpSaturatedAddUint16x32: - v.Op = OpAMD64VPADDSW512 - return true - case OpSaturatedAddUint16x8: - v.Op = OpAMD64VPADDSW128 - return true - case OpSaturatedAddUint8x16: - v.Op = OpAMD64VPADDSB128 - return true - case OpSaturatedAddUint8x32: - v.Op = OpAMD64VPADDSB256 - return true - case OpSaturatedAddUint8x64: - v.Op = OpAMD64VPADDSB512 - return true - case OpSaturatedPairwiseAddInt16x16: - v.Op = OpAMD64VPHADDSW256 - return true - case OpSaturatedPairwiseAddInt16x8: - v.Op = OpAMD64VPHADDSW128 - return true - case OpSaturatedPairwiseSubInt16x16: - v.Op = OpAMD64VPHSUBSW256 - return true - case OpSaturatedPairwiseSubInt16x8: - v.Op = OpAMD64VPHSUBSW128 - return true - case OpSaturatedSubInt16x16: - v.Op = OpAMD64VPSUBSW256 - return true - case OpSaturatedSubInt16x32: - v.Op = OpAMD64VPSUBSW512 - return true - case OpSaturatedSubInt16x8: - v.Op = OpAMD64VPSUBSW128 - return true - case OpSaturatedSubInt8x16: - v.Op = OpAMD64VPSUBSB128 - return true - case OpSaturatedSubInt8x32: - v.Op = OpAMD64VPSUBSB256 - return true - case OpSaturatedSubInt8x64: - v.Op = OpAMD64VPSUBSB512 - return true - case OpSaturatedSubMaskedInt16x16: - return rewriteValueAMD64_OpSaturatedSubMaskedInt16x16(v) - case OpSaturatedSubMaskedInt16x32: - return rewriteValueAMD64_OpSaturatedSubMaskedInt16x32(v) - case 
OpSaturatedSubMaskedInt16x8: - return rewriteValueAMD64_OpSaturatedSubMaskedInt16x8(v) - case OpSaturatedSubMaskedInt8x16: - return rewriteValueAMD64_OpSaturatedSubMaskedInt8x16(v) - case OpSaturatedSubMaskedInt8x32: - return rewriteValueAMD64_OpSaturatedSubMaskedInt8x32(v) - case OpSaturatedSubMaskedInt8x64: - return rewriteValueAMD64_OpSaturatedSubMaskedInt8x64(v) - case OpSaturatedSubMaskedUint16x16: - return rewriteValueAMD64_OpSaturatedSubMaskedUint16x16(v) - case OpSaturatedSubMaskedUint16x32: - return rewriteValueAMD64_OpSaturatedSubMaskedUint16x32(v) - case OpSaturatedSubMaskedUint16x8: - return rewriteValueAMD64_OpSaturatedSubMaskedUint16x8(v) - case OpSaturatedSubMaskedUint8x16: - return rewriteValueAMD64_OpSaturatedSubMaskedUint8x16(v) - case OpSaturatedSubMaskedUint8x32: - return rewriteValueAMD64_OpSaturatedSubMaskedUint8x32(v) - case OpSaturatedSubMaskedUint8x64: - return rewriteValueAMD64_OpSaturatedSubMaskedUint8x64(v) - case OpSaturatedSubUint16x16: - v.Op = OpAMD64VPSUBSW256 - return true - case OpSaturatedSubUint16x32: - v.Op = OpAMD64VPSUBSW512 - return true - case OpSaturatedSubUint16x8: - v.Op = OpAMD64VPSUBSW128 - return true - case OpSaturatedSubUint8x16: - v.Op = OpAMD64VPSUBSB128 - return true - case OpSaturatedSubUint8x32: - v.Op = OpAMD64VPSUBSB256 - return true - case OpSaturatedSubUint8x64: - v.Op = OpAMD64VPSUBSB512 - return true case OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16: return rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16(v) case OpSaturatedUnsignedSignedPairDotProdMaskedUint8x32: @@ -4419,6 +4263,36 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4(v) case OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8: return rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8(v) + case OpScaleFloat32x16: + v.Op = OpAMD64VSCALEFPS512 + return true + case OpScaleFloat32x4: + v.Op = OpAMD64VSCALEFPS128 + return true + case OpScaleFloat32x8: + v.Op = OpAMD64VSCALEFPS256 + return true + case OpScaleFloat64x2: + v.Op = OpAMD64VSCALEFPD128 + return true + case OpScaleFloat64x4: + v.Op = OpAMD64VSCALEFPD256 + return true + case OpScaleFloat64x8: + v.Op = OpAMD64VSCALEFPD512 + return true + case OpScaleMaskedFloat32x16: + return rewriteValueAMD64_OpScaleMaskedFloat32x16(v) + case OpScaleMaskedFloat32x4: + return rewriteValueAMD64_OpScaleMaskedFloat32x4(v) + case OpScaleMaskedFloat32x8: + return rewriteValueAMD64_OpScaleMaskedFloat32x8(v) + case OpScaleMaskedFloat64x2: + return rewriteValueAMD64_OpScaleMaskedFloat64x2(v) + case OpScaleMaskedFloat64x4: + return rewriteValueAMD64_OpScaleMaskedFloat64x4(v) + case OpScaleMaskedFloat64x8: + return rewriteValueAMD64_OpScaleMaskedFloat64x8(v) case OpSelect0: return rewriteValueAMD64_OpSelect0(v) case OpSelect1: @@ -5446,9 +5320,111 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpSubMaskedUint8x32(v) case OpSubMaskedUint8x64: return rewriteValueAMD64_OpSubMaskedUint8x64(v) + case OpSubPairsFloat32x4: + v.Op = OpAMD64VHSUBPS128 + return true + case OpSubPairsFloat32x8: + v.Op = OpAMD64VHSUBPS256 + return true + case OpSubPairsFloat64x2: + v.Op = OpAMD64VHSUBPD128 + return true + case OpSubPairsFloat64x4: + v.Op = OpAMD64VHSUBPD256 + return true + case OpSubPairsInt16x16: + v.Op = OpAMD64VPHSUBW256 + return true + case OpSubPairsInt16x8: + v.Op = OpAMD64VPHSUBW128 + return true + case OpSubPairsInt32x4: + v.Op = OpAMD64VPHSUBD128 + return true + case 
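// --- Editorial sketch (Go, illustration only): MulByPowOf2* becomes Scale*
// above, matching VSCALEFPS/VSCALEFPD, which per Intel's SCALEF definition
// computes x * 2**floor(y) for finite inputs (the NaN/Inf/denormal cases
// are defined separately by the ISA and are not modeled here).
//
//	import "math"
//
func scale(x, y float64) float64 {
	return math.Ldexp(x, int(math.Floor(y))) // x * 2**floor(y)
}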
OpSubPairsInt32x8: + v.Op = OpAMD64VPHSUBD256 + return true + case OpSubPairsSaturatedInt16x16: + v.Op = OpAMD64VPHSUBSW256 + return true + case OpSubPairsSaturatedInt16x8: + v.Op = OpAMD64VPHSUBSW128 + return true + case OpSubPairsUint16x16: + v.Op = OpAMD64VPHSUBW256 + return true + case OpSubPairsUint16x8: + v.Op = OpAMD64VPHSUBW128 + return true + case OpSubPairsUint32x4: + v.Op = OpAMD64VPHSUBD128 + return true + case OpSubPairsUint32x8: + v.Op = OpAMD64VPHSUBD256 + return true case OpSubPtr: v.Op = OpAMD64SUBQ return true + case OpSubSaturatedInt16x16: + v.Op = OpAMD64VPSUBSW256 + return true + case OpSubSaturatedInt16x32: + v.Op = OpAMD64VPSUBSW512 + return true + case OpSubSaturatedInt16x8: + v.Op = OpAMD64VPSUBSW128 + return true + case OpSubSaturatedInt8x16: + v.Op = OpAMD64VPSUBSB128 + return true + case OpSubSaturatedInt8x32: + v.Op = OpAMD64VPSUBSB256 + return true + case OpSubSaturatedInt8x64: + v.Op = OpAMD64VPSUBSB512 + return true + case OpSubSaturatedMaskedInt16x16: + return rewriteValueAMD64_OpSubSaturatedMaskedInt16x16(v) + case OpSubSaturatedMaskedInt16x32: + return rewriteValueAMD64_OpSubSaturatedMaskedInt16x32(v) + case OpSubSaturatedMaskedInt16x8: + return rewriteValueAMD64_OpSubSaturatedMaskedInt16x8(v) + case OpSubSaturatedMaskedInt8x16: + return rewriteValueAMD64_OpSubSaturatedMaskedInt8x16(v) + case OpSubSaturatedMaskedInt8x32: + return rewriteValueAMD64_OpSubSaturatedMaskedInt8x32(v) + case OpSubSaturatedMaskedInt8x64: + return rewriteValueAMD64_OpSubSaturatedMaskedInt8x64(v) + case OpSubSaturatedMaskedUint16x16: + return rewriteValueAMD64_OpSubSaturatedMaskedUint16x16(v) + case OpSubSaturatedMaskedUint16x32: + return rewriteValueAMD64_OpSubSaturatedMaskedUint16x32(v) + case OpSubSaturatedMaskedUint16x8: + return rewriteValueAMD64_OpSubSaturatedMaskedUint16x8(v) + case OpSubSaturatedMaskedUint8x16: + return rewriteValueAMD64_OpSubSaturatedMaskedUint8x16(v) + case OpSubSaturatedMaskedUint8x32: + return rewriteValueAMD64_OpSubSaturatedMaskedUint8x32(v) + case OpSubSaturatedMaskedUint8x64: + return rewriteValueAMD64_OpSubSaturatedMaskedUint8x64(v) + case OpSubSaturatedUint16x16: + v.Op = OpAMD64VPSUBSW256 + return true + case OpSubSaturatedUint16x32: + v.Op = OpAMD64VPSUBSW512 + return true + case OpSubSaturatedUint16x8: + v.Op = OpAMD64VPSUBSW128 + return true + case OpSubSaturatedUint8x16: + v.Op = OpAMD64VPSUBSB128 + return true + case OpSubSaturatedUint8x32: + v.Op = OpAMD64VPSUBSB256 + return true + case OpSubSaturatedUint8x64: + v.Op = OpAMD64VPSUBSB512 + return true case OpSubUint16x16: v.Op = OpAMD64VPSUBW256 return true @@ -5516,30 +5492,54 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpTruncFloat64x2(v) case OpTruncFloat64x4: return rewriteValueAMD64_OpTruncFloat64x4(v) - case OpTruncWithPrecisionFloat32x16: - return rewriteValueAMD64_OpTruncWithPrecisionFloat32x16(v) - case OpTruncWithPrecisionFloat32x4: - return rewriteValueAMD64_OpTruncWithPrecisionFloat32x4(v) - case OpTruncWithPrecisionFloat32x8: - return rewriteValueAMD64_OpTruncWithPrecisionFloat32x8(v) - case OpTruncWithPrecisionFloat64x2: - return rewriteValueAMD64_OpTruncWithPrecisionFloat64x2(v) - case OpTruncWithPrecisionFloat64x4: - return rewriteValueAMD64_OpTruncWithPrecisionFloat64x4(v) - case OpTruncWithPrecisionFloat64x8: - return rewriteValueAMD64_OpTruncWithPrecisionFloat64x8(v) - case OpTruncWithPrecisionMaskedFloat32x16: - return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x16(v) - case OpTruncWithPrecisionMaskedFloat32x4: - return 
rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x4(v) - case OpTruncWithPrecisionMaskedFloat32x8: - return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x8(v) - case OpTruncWithPrecisionMaskedFloat64x2: - return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x2(v) - case OpTruncWithPrecisionMaskedFloat64x4: - return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x4(v) - case OpTruncWithPrecisionMaskedFloat64x8: - return rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x8(v) + case OpTruncScaledFloat32x16: + return rewriteValueAMD64_OpTruncScaledFloat32x16(v) + case OpTruncScaledFloat32x4: + return rewriteValueAMD64_OpTruncScaledFloat32x4(v) + case OpTruncScaledFloat32x8: + return rewriteValueAMD64_OpTruncScaledFloat32x8(v) + case OpTruncScaledFloat64x2: + return rewriteValueAMD64_OpTruncScaledFloat64x2(v) + case OpTruncScaledFloat64x4: + return rewriteValueAMD64_OpTruncScaledFloat64x4(v) + case OpTruncScaledFloat64x8: + return rewriteValueAMD64_OpTruncScaledFloat64x8(v) + case OpTruncScaledMaskedFloat32x16: + return rewriteValueAMD64_OpTruncScaledMaskedFloat32x16(v) + case OpTruncScaledMaskedFloat32x4: + return rewriteValueAMD64_OpTruncScaledMaskedFloat32x4(v) + case OpTruncScaledMaskedFloat32x8: + return rewriteValueAMD64_OpTruncScaledMaskedFloat32x8(v) + case OpTruncScaledMaskedFloat64x2: + return rewriteValueAMD64_OpTruncScaledMaskedFloat64x2(v) + case OpTruncScaledMaskedFloat64x4: + return rewriteValueAMD64_OpTruncScaledMaskedFloat64x4(v) + case OpTruncScaledMaskedFloat64x8: + return rewriteValueAMD64_OpTruncScaledMaskedFloat64x8(v) + case OpTruncScaledResidueFloat32x16: + return rewriteValueAMD64_OpTruncScaledResidueFloat32x16(v) + case OpTruncScaledResidueFloat32x4: + return rewriteValueAMD64_OpTruncScaledResidueFloat32x4(v) + case OpTruncScaledResidueFloat32x8: + return rewriteValueAMD64_OpTruncScaledResidueFloat32x8(v) + case OpTruncScaledResidueFloat64x2: + return rewriteValueAMD64_OpTruncScaledResidueFloat64x2(v) + case OpTruncScaledResidueFloat64x4: + return rewriteValueAMD64_OpTruncScaledResidueFloat64x4(v) + case OpTruncScaledResidueFloat64x8: + return rewriteValueAMD64_OpTruncScaledResidueFloat64x8(v) + case OpTruncScaledResidueMaskedFloat32x16: + return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x16(v) + case OpTruncScaledResidueMaskedFloat32x4: + return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x4(v) + case OpTruncScaledResidueMaskedFloat32x8: + return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x8(v) + case OpTruncScaledResidueMaskedFloat64x2: + return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x2(v) + case OpTruncScaledResidueMaskedFloat64x4: + return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x4(v) + case OpTruncScaledResidueMaskedFloat64x8: + return rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x8(v) case OpUnsignedSignedQuadDotProdAccumulateInt32x16: v.Op = OpAMD64VPDPBUSD512 return true @@ -29162,6 +29162,222 @@ func rewriteValueAMD64_OpAddMaskedUint8x64(v *Value) bool { return true } } +func rewriteValueAMD64_OpAddSaturatedMaskedInt16x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedInt16x16 x y mask) + // result: (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedInt16x32(v *Value) bool { 
+ v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedInt16x32 x y mask) + // result: (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedInt16x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedInt16x8 x y mask) + // result: (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedInt8x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedInt8x16 x y mask) + // result: (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedInt8x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedInt8x32 x y mask) + // result: (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedInt8x64(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedInt8x64 x y mask) + // result: (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedUint16x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedUint16x16 x y mask) + // result: (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedUint16x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedUint16x32 x y mask) + // result: (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedUint16x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedUint16x8 x y mask) + // result: (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + 
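// --- Editorial sketch (Go, illustration only): each *Masked rewrite above
// follows one template: reset v to the masked AVX-512 op and convert the
// generic vector mask into a K-register operand via a fresh
// VPMOVVec<W>x<N>ToM value built with b.NewValue0. Per lane this is
// equivalent to the model below, assuming the zeroing form of AVX-512
// masking (a merge-masking form would keep the old destination lane).
func addSaturatedMaskedInt16x8(x, y [8]int16, mask [8]bool) [8]int16 {
	var dst [8]int16
	for i := range dst {
		if mask[i] {
			dst[i] = addSaturatedInt16(x[i], y[i]) // saturating add, as sketched earlier
		} // inactive lanes are left zero
	}
	return dst
}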
v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedUint8x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedUint8x16 x y mask) + // result: (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedUint8x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedUint8x32 x y mask) + // result: (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpAddSaturatedMaskedUint8x64(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddSaturatedMaskedUint8x64 x y mask) + // result: (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPADDSBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} func rewriteValueAMD64_OpAddr(v *Value) bool { v_0 := v.Args[0] // match: (Addr {sym} base) @@ -30521,9 +30737,9 @@ func rewriteValueAMD64_OpCeilFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledFloat32x16(v *Value) bool { v_0 := v.Args[0] - // match: (CeilWithPrecisionFloat32x16 [a] x) + // match: (CeilScaledFloat32x16 [a] x) // result: (VRNDSCALEPS512 [a+2] x) for { a := auxIntToInt8(v.AuxInt) @@ -30534,9 +30750,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledFloat32x4(v *Value) bool { v_0 := v.Args[0] - // match: (CeilWithPrecisionFloat32x4 [a] x) + // match: (CeilScaledFloat32x4 [a] x) // result: (VRNDSCALEPS128 [a+2] x) for { a := auxIntToInt8(v.AuxInt) @@ -30547,9 +30763,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledFloat32x8(v *Value) bool { v_0 := v.Args[0] - // match: (CeilWithPrecisionFloat32x8 [a] x) + // match: (CeilScaledFloat32x8 [a] x) // result: (VRNDSCALEPS256 [a+2] x) for { a := auxIntToInt8(v.AuxInt) @@ -30560,9 +30776,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledFloat64x2(v *Value) bool { v_0 := v.Args[0] - // match: (CeilWithPrecisionFloat64x2 [a] x) + // match: (CeilScaledFloat64x2 [a] x) // result: (VRNDSCALEPD128 [a+2] x) for { a := auxIntToInt8(v.AuxInt) @@ -30573,9 +30789,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledFloat64x4(v *Value) bool { v_0 := v.Args[0] - // match: (CeilWithPrecisionFloat64x4 [a] x) + // match: (CeilScaledFloat64x4 [a] x) // result: (VRNDSCALEPD256 [a+2] x) for { a 
:= auxIntToInt8(v.AuxInt) @@ -30586,9 +30802,9 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledFloat64x8(v *Value) bool { v_0 := v.Args[0] - // match: (CeilWithPrecisionFloat64x8 [a] x) + // match: (CeilScaledFloat64x8 [a] x) // result: (VRNDSCALEPD512 [a+2] x) for { a := auxIntToInt8(v.AuxInt) @@ -30599,11 +30815,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledMaskedFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (CeilWithPrecisionMaskedFloat32x16 [a] x mask) + // match: (CeilScaledMaskedFloat32x16 [a] x mask) // result: (VRNDSCALEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -30617,11 +30833,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledMaskedFloat32x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (CeilWithPrecisionMaskedFloat32x4 [a] x mask) + // match: (CeilScaledMaskedFloat32x4 [a] x mask) // result: (VRNDSCALEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -30635,11 +30851,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledMaskedFloat32x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (CeilWithPrecisionMaskedFloat32x8 [a] x mask) + // match: (CeilScaledMaskedFloat32x8 [a] x mask) // result: (VRNDSCALEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -30653,11 +30869,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledMaskedFloat64x2(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (CeilWithPrecisionMaskedFloat64x2 [a] x mask) + // match: (CeilScaledMaskedFloat64x2 [a] x mask) // result: (VRNDSCALEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -30671,11 +30887,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledMaskedFloat64x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (CeilWithPrecisionMaskedFloat64x4 [a] x mask) + // match: (CeilScaledMaskedFloat64x4 [a] x mask) // result: (VRNDSCALEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -30689,11 +30905,11 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpCeilScaledMaskedFloat64x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (CeilWithPrecisionMaskedFloat64x8 [a] x mask) + // match: (CeilScaledMaskedFloat64x8 [a] x mask) // result: (VRNDSCALEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ 
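// --- Editorial sketch (Go, illustration only): VRNDSCALE's imm8 packs the
// scale M (bits 7:4) with a rounding mode in the low bits (0 nearest-even,
// 1 down, 2 up, 3 toward zero); the instruction returns 2**-M * round(2**M * x).
// The "+2" in the CeilScaled rules above selects round-up, which presumes
// the generic op's aux value already carries M in its upper bits (an
// assumption here; the front end that builds these aux values is outside
// this hunk).
//
//	import "math"
//
func roundScaled(x float64, m uint, round func(float64) float64) float64 {
	s := math.Ldexp(1, int(m)) // 2**M
	return round(x*s) / s      // e.g. roundScaled(1.23, 2, math.Ceil) == 1.25
}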
-30707,6 +30923,192 @@ func rewriteValueAMD64_OpCeilWithPrecisionMaskedFloat64x8(v *Value) bool { return true } } +func rewriteValueAMD64_OpCeilScaledResidueFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilScaledResidueFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilScaledResidueFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilScaledResidueFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilScaledResidueFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilScaledResidueFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (CeilScaledResidueFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+2] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 2) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (CeilScaledResidueMaskedFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (CeilScaledResidueMaskedFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (CeilScaledResidueMaskedFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + 
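
For reference on the VREDUCE rules being added here: VREDUCEPS/VREDUCEPD compute x minus x rounded to 2^-M precision, which is the "residue" the new op names refer to. In the instruction's imm8, bits 7:4 carry M and bits 1:0 the rounding control (0 nearest-even, 1 down, 2 up, 3 truncate), which is why these generated rules add 2 for Ceil*, 1 for Floor*, 0 for Round*, and 3 for the Trunc* variants to the op's AuxInt, whose scale component is presumably already shifted into the high bits by the intrinsic front end. A scalar Go model of one lane, as a sketch to pin down the semantics (not compiler code; it ignores NaN/overflow corner cases):

package main

import (
	"fmt"
	"math"
)

const (
	rcNearest = 0 // RoundScaledResidue*
	rcFloor   = 1 // FloorScaledResidue*
	rcCeil    = 2 // CeilScaledResidue*
	rcTrunc   = 3 // TruncScaledResidue* (assumed; that variant is not in this hunk)
)

// reduce models one float64 lane of VREDUCEPD: x minus x rounded
// to a multiple of 2^-m under rounding mode rc.
func reduce(x float64, m int, rc int) float64 {
	scale := math.Ldexp(1, m) // 2^m
	var r float64
	switch rc {
	case rcFloor:
		r = math.Floor(x*scale) / scale
	case rcCeil:
		r = math.Ceil(x*scale) / scale
	case rcTrunc:
		r = math.Trunc(x*scale) / scale
	default:
		r = math.RoundToEven(x*scale) / scale
	}
	return x - r
}

func main() {
	fmt.Println(reduce(2.71828, 2, rcFloor)) // 0.21828…, the residue below 1/4 granularity
}
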
return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (CeilScaledResidueMaskedFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (CeilScaledResidueMaskedFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpCeilScaledResidueMaskedFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (CeilScaledResidueMaskedFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 2) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} func rewriteValueAMD64_OpCompressFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -32596,750 +32998,6 @@ func rewriteValueAMD64_OpCvt8toMask64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x16(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithCeilWithPrecisionFloat32x16 [a] x) - // result: (VREDUCEPS512 [a+2] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS512) - v.AuxInt = int8ToAuxInt(a + 2) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithCeilWithPrecisionFloat32x4 [a] x) - // result: (VREDUCEPS128 [a+2] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS128) - v.AuxInt = int8ToAuxInt(a + 2) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat32x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithCeilWithPrecisionFloat32x8 [a] x) - // result: (VREDUCEPS256 [a+2] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS256) - v.AuxInt = int8ToAuxInt(a + 2) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x2(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithCeilWithPrecisionFloat64x2 [a] x) - // result: (VREDUCEPD128 [a+2] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD128) - v.AuxInt = int8ToAuxInt(a + 2) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithCeilWithPrecisionFloat64x4 [a] x) - // result: (VREDUCEPD256 [a+2] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD256) - v.AuxInt = int8ToAuxInt(a + 2) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionFloat64x8(v *Value) bool { - v_0 := v.Args[0] - // match: 
(DiffWithCeilWithPrecisionFloat64x8 [a] x) - // result: (VREDUCEPD512 [a+2] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD512) - v.AuxInt = int8ToAuxInt(a + 2) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithCeilWithPrecisionMaskedFloat32x16 [a] x mask) - // result: (VREDUCEPSMasked512 [a+2] x (VPMOVVec32x16ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked512) - v.AuxInt = int8ToAuxInt(a + 2) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithCeilWithPrecisionMaskedFloat32x4 [a] x mask) - // result: (VREDUCEPSMasked128 [a+2] x (VPMOVVec32x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked128) - v.AuxInt = int8ToAuxInt(a + 2) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat32x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithCeilWithPrecisionMaskedFloat32x8 [a] x mask) - // result: (VREDUCEPSMasked256 [a+2] x (VPMOVVec32x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked256) - v.AuxInt = int8ToAuxInt(a + 2) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x2(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithCeilWithPrecisionMaskedFloat64x2 [a] x mask) - // result: (VREDUCEPDMasked128 [a+2] x (VPMOVVec64x2ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked128) - v.AuxInt = int8ToAuxInt(a + 2) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithCeilWithPrecisionMaskedFloat64x4 [a] x mask) - // result: (VREDUCEPDMasked256 [a+2] x (VPMOVVec64x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked256) - v.AuxInt = int8ToAuxInt(a + 2) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithCeilWithPrecisionMaskedFloat64x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithCeilWithPrecisionMaskedFloat64x8 [a] x mask) - // result: (VREDUCEPDMasked512 [a+2] x (VPMOVVec64x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked512) - v.AuxInt = int8ToAuxInt(a + 2) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x16(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithFloorWithPrecisionFloat32x16 [a] x) - // 
result: (VREDUCEPS512 [a+1] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS512) - v.AuxInt = int8ToAuxInt(a + 1) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithFloorWithPrecisionFloat32x4 [a] x) - // result: (VREDUCEPS128 [a+1] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS128) - v.AuxInt = int8ToAuxInt(a + 1) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat32x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithFloorWithPrecisionFloat32x8 [a] x) - // result: (VREDUCEPS256 [a+1] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS256) - v.AuxInt = int8ToAuxInt(a + 1) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x2(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithFloorWithPrecisionFloat64x2 [a] x) - // result: (VREDUCEPD128 [a+1] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD128) - v.AuxInt = int8ToAuxInt(a + 1) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithFloorWithPrecisionFloat64x4 [a] x) - // result: (VREDUCEPD256 [a+1] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD256) - v.AuxInt = int8ToAuxInt(a + 1) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionFloat64x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithFloorWithPrecisionFloat64x8 [a] x) - // result: (VREDUCEPD512 [a+1] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD512) - v.AuxInt = int8ToAuxInt(a + 1) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithFloorWithPrecisionMaskedFloat32x16 [a] x mask) - // result: (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked512) - v.AuxInt = int8ToAuxInt(a + 1) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithFloorWithPrecisionMaskedFloat32x4 [a] x mask) - // result: (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked128) - v.AuxInt = int8ToAuxInt(a + 1) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat32x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithFloorWithPrecisionMaskedFloat32x8 [a] x mask) - // result: (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked256) - v.AuxInt = int8ToAuxInt(a + 1) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x2(v *Value) bool { - v_1 := 
v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithFloorWithPrecisionMaskedFloat64x2 [a] x mask) - // result: (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked128) - v.AuxInt = int8ToAuxInt(a + 1) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithFloorWithPrecisionMaskedFloat64x4 [a] x mask) - // result: (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked256) - v.AuxInt = int8ToAuxInt(a + 1) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithFloorWithPrecisionMaskedFloat64x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithFloorWithPrecisionMaskedFloat64x8 [a] x mask) - // result: (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked512) - v.AuxInt = int8ToAuxInt(a + 1) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x16(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithRoundWithPrecisionFloat32x16 [a] x) - // result: (VREDUCEPS512 [a+0] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS512) - v.AuxInt = int8ToAuxInt(a + 0) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithRoundWithPrecisionFloat32x4 [a] x) - // result: (VREDUCEPS128 [a+0] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS128) - v.AuxInt = int8ToAuxInt(a + 0) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat32x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithRoundWithPrecisionFloat32x8 [a] x) - // result: (VREDUCEPS256 [a+0] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS256) - v.AuxInt = int8ToAuxInt(a + 0) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x2(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithRoundWithPrecisionFloat64x2 [a] x) - // result: (VREDUCEPD128 [a+0] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD128) - v.AuxInt = int8ToAuxInt(a + 0) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithRoundWithPrecisionFloat64x4 [a] x) - // result: (VREDUCEPD256 [a+0] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD256) - v.AuxInt = int8ToAuxInt(a + 0) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionFloat64x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithRoundWithPrecisionFloat64x8 [a] x) - // result: (VREDUCEPD512 [a+0] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD512) - v.AuxInt = int8ToAuxInt(a + 0) - v.AddArg(x) - return true - } -} -func 
rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithRoundWithPrecisionMaskedFloat32x16 [a] x mask) - // result: (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked512) - v.AuxInt = int8ToAuxInt(a + 0) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithRoundWithPrecisionMaskedFloat32x4 [a] x mask) - // result: (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked128) - v.AuxInt = int8ToAuxInt(a + 0) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat32x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithRoundWithPrecisionMaskedFloat32x8 [a] x mask) - // result: (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked256) - v.AuxInt = int8ToAuxInt(a + 0) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x2(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithRoundWithPrecisionMaskedFloat64x2 [a] x mask) - // result: (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked128) - v.AuxInt = int8ToAuxInt(a + 0) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithRoundWithPrecisionMaskedFloat64x4 [a] x mask) - // result: (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked256) - v.AuxInt = int8ToAuxInt(a + 0) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithRoundWithPrecisionMaskedFloat64x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithRoundWithPrecisionMaskedFloat64x8 [a] x mask) - // result: (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked512) - v.AuxInt = int8ToAuxInt(a + 0) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x16(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithTruncWithPrecisionFloat32x16 [a] x) - // result: (VREDUCEPS512 [a+3] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS512) - v.AuxInt = int8ToAuxInt(a + 3) - v.AddArg(x) - return true - } -} -func 
rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithTruncWithPrecisionFloat32x4 [a] x) - // result: (VREDUCEPS128 [a+3] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS128) - v.AuxInt = int8ToAuxInt(a + 3) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat32x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithTruncWithPrecisionFloat32x8 [a] x) - // result: (VREDUCEPS256 [a+3] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPS256) - v.AuxInt = int8ToAuxInt(a + 3) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x2(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithTruncWithPrecisionFloat64x2 [a] x) - // result: (VREDUCEPD128 [a+3] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD128) - v.AuxInt = int8ToAuxInt(a + 3) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x4(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithTruncWithPrecisionFloat64x4 [a] x) - // result: (VREDUCEPD256 [a+3] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD256) - v.AuxInt = int8ToAuxInt(a + 3) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionFloat64x8(v *Value) bool { - v_0 := v.Args[0] - // match: (DiffWithTruncWithPrecisionFloat64x8 [a] x) - // result: (VREDUCEPD512 [a+3] x) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - v.reset(OpAMD64VREDUCEPD512) - v.AuxInt = int8ToAuxInt(a + 3) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x16(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithTruncWithPrecisionMaskedFloat32x16 [a] x mask) - // result: (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked512) - v.AuxInt = int8ToAuxInt(a + 3) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithTruncWithPrecisionMaskedFloat32x4 [a] x mask) - // result: (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked128) - v.AuxInt = int8ToAuxInt(a + 3) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat32x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithTruncWithPrecisionMaskedFloat32x8 [a] x mask) - // result: (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPSMasked256) - v.AuxInt = int8ToAuxInt(a + 3) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x2(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithTruncWithPrecisionMaskedFloat64x2 [a] x mask) - // result: (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask)) - for { - a 
:= auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked128) - v.AuxInt = int8ToAuxInt(a + 3) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x4(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithTruncWithPrecisionMaskedFloat64x4 [a] x mask) - // result: (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked256) - v.AuxInt = int8ToAuxInt(a + 3) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} -func rewriteValueAMD64_OpDiffWithTruncWithPrecisionMaskedFloat64x8(v *Value) bool { - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (DiffWithTruncWithPrecisionMaskedFloat64x8 [a] x mask) - // result: (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask)) - for { - a := auxIntToInt8(v.AuxInt) - x := v_0 - mask := v_1 - v.reset(OpAMD64VREDUCEPDMasked512) - v.AuxInt = int8ToAuxInt(a + 3) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg2(x, v0) - return true - } -} func rewriteValueAMD64_OpDiv16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -34731,9 +34389,9 @@ func rewriteValueAMD64_OpFloorFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledFloat32x16(v *Value) bool { v_0 := v.Args[0] - // match: (FloorWithPrecisionFloat32x16 [a] x) + // match: (FloorScaledFloat32x16 [a] x) // result: (VRNDSCALEPS512 [a+1] x) for { a := auxIntToInt8(v.AuxInt) @@ -34744,9 +34402,9 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledFloat32x4(v *Value) bool { v_0 := v.Args[0] - // match: (FloorWithPrecisionFloat32x4 [a] x) + // match: (FloorScaledFloat32x4 [a] x) // result: (VRNDSCALEPS128 [a+1] x) for { a := auxIntToInt8(v.AuxInt) @@ -34757,9 +34415,9 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledFloat32x8(v *Value) bool { v_0 := v.Args[0] - // match: (FloorWithPrecisionFloat32x8 [a] x) + // match: (FloorScaledFloat32x8 [a] x) // result: (VRNDSCALEPS256 [a+1] x) for { a := auxIntToInt8(v.AuxInt) @@ -34770,9 +34428,9 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledFloat64x2(v *Value) bool { v_0 := v.Args[0] - // match: (FloorWithPrecisionFloat64x2 [a] x) + // match: (FloorScaledFloat64x2 [a] x) // result: (VRNDSCALEPD128 [a+1] x) for { a := auxIntToInt8(v.AuxInt) @@ -34783,9 +34441,9 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledFloat64x4(v *Value) bool { v_0 := v.Args[0] - // match: (FloorWithPrecisionFloat64x4 [a] x) + // match: (FloorScaledFloat64x4 [a] x) // result: (VRNDSCALEPD256 [a+1] x) for { a := auxIntToInt8(v.AuxInt) @@ -34796,9 +34454,9 @@ func 
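
Taken together, the deletions in the hunks above are the other half of a mechanical rename: every DiffWith{Ceil,Floor,Round,Trunc}WithPrecision rule removed here reappears as the corresponding {Ceil,Floor,Round,Trunc}ScaledResidue rule (the Ceil and Floor variants are visible nearby in this diff, the Round variant further down; the Trunc variant presumably sits elsewhere in the patch), with the identical VREDUCE lowering and the same +2/+1/+0/+3 immediate. No generated rule changes behavior, only its name.
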
rewriteValueAMD64_OpFloorWithPrecisionFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledFloat64x8(v *Value) bool { v_0 := v.Args[0] - // match: (FloorWithPrecisionFloat64x8 [a] x) + // match: (FloorScaledFloat64x8 [a] x) // result: (VRNDSCALEPD512 [a+1] x) for { a := auxIntToInt8(v.AuxInt) @@ -34809,11 +34467,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledMaskedFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (FloorWithPrecisionMaskedFloat32x16 [a] x mask) + // match: (FloorScaledMaskedFloat32x16 [a] x mask) // result: (VRNDSCALEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -34827,11 +34485,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledMaskedFloat32x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (FloorWithPrecisionMaskedFloat32x4 [a] x mask) + // match: (FloorScaledMaskedFloat32x4 [a] x mask) // result: (VRNDSCALEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -34845,11 +34503,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledMaskedFloat32x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (FloorWithPrecisionMaskedFloat32x8 [a] x mask) + // match: (FloorScaledMaskedFloat32x8 [a] x mask) // result: (VRNDSCALEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -34863,11 +34521,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledMaskedFloat64x2(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (FloorWithPrecisionMaskedFloat64x2 [a] x mask) + // match: (FloorScaledMaskedFloat64x2 [a] x mask) // result: (VRNDSCALEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -34881,11 +34539,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledMaskedFloat64x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (FloorWithPrecisionMaskedFloat64x4 [a] x mask) + // match: (FloorScaledMaskedFloat64x4 [a] x mask) // result: (VRNDSCALEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -34899,11 +34557,11 @@ func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpFloorScaledMaskedFloat64x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (FloorWithPrecisionMaskedFloat64x8 [a] x mask) + // match: (FloorScaledMaskedFloat64x8 [a] x mask) // result: (VRNDSCALEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -34917,6 +34575,192 @@ func 
rewriteValueAMD64_OpFloorWithPrecisionMaskedFloat64x8(v *Value) bool { return true } } +func rewriteValueAMD64_OpFloorScaledResidueFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorScaledResidueFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorScaledResidueFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorScaledResidueFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorScaledResidueFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorScaledResidueFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (FloorScaledResidueFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+1] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 1) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (FloorScaledResidueMaskedFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+1] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (FloorScaledResidueMaskedFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+1] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (FloorScaledResidueMaskedFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+1] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true 
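
A note on a pattern repeated throughout this file: every masked rule wraps its mask operand in a VPMOVVec*ToM pseudo-op before handing it to the masked instruction. The generic ops carry masks as ordinary vector values, and these conversions materialize them as AVX-512 k-register operands. A toy scalar model of what that conversion means, with hypothetical names rather than the compiler's representation:

package main

import "fmt"

// vecToMask models VPMOVVec8x16ToM for a few lanes: a lane-wise
// vector mask (all-ones or all-zeros per lane in hardware) becomes
// a k-register-style bitmask with one bit per lane.
func vecToMask(lanes []int8) uint16 {
	var k uint16
	for i, l := range lanes {
		if l != 0 { // lane is "true"
			k |= 1 << uint(i)
		}
	}
	return k
}

func main() {
	fmt.Printf("%04b\n", vecToMask([]int8{-1, 0, -1, -1})) // 1101
}
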
+ } +} +func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (FloorScaledResidueMaskedFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+1] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (FloorScaledResidueMaskedFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+1] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpFloorScaledResidueMaskedFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (FloorScaledResidueMaskedFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+1] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 1) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} func rewriteValueAMD64_OpFusedMultiplyAddMaskedFloat32x16(v *Value) bool { v_3 := v.Args[3] v_2 := v.Args[2] @@ -43583,114 +43427,6 @@ func rewriteValueAMD64_OpMove(v *Value) bool { } return false } -func rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulByPowOf2MaskedFloat32x16 x y mask) - // result: (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VSCALEFPSMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x4(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulByPowOf2MaskedFloat32x4 x y mask) - // result: (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VSCALEFPSMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulByPowOf2MaskedFloat32x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulByPowOf2MaskedFloat32x8 x y mask) - // result: (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VSCALEFPSMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x2(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulByPowOf2MaskedFloat64x2 x y mask) - // result: (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VSCALEFPDMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - 
v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x4(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulByPowOf2MaskedFloat64x4 x y mask) - // result: (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VSCALEFPDMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulByPowOf2MaskedFloat64x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulByPowOf2MaskedFloat64x8 x y mask) - // result: (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VSCALEFPDMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} func rewriteValueAMD64_OpMulEvenWidenMaskedInt64x2(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -43907,168 +43643,6 @@ func rewriteValueAMD64_OpMulHighMaskedUint16x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpMulLowMaskedInt16x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt16x16 x y mask) - // result: (VPMULLWMasked256 x y (VPMOVVec16x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt16x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt16x32 x y mask) - // result: (VPMULLWMasked512 x y (VPMOVVec16x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt16x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt16x8 x y mask) - // result: (VPMULLWMasked128 x y (VPMOVVec16x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt32x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt32x16 x y mask) - // result: (VPMULLDMasked512 x y (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLDMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt32x4(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt32x4 x y mask) - // result: (VPMULLDMasked128 x y (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLDMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt32x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: 
(MulLowMaskedInt32x8 x y mask) - // result: (VPMULLDMasked256 x y (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLDMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt64x2(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt64x2 x y mask) - // result: (VPMULLQMasked128 x y (VPMOVVec64x2ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLQMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt64x4(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt64x4 x y mask) - // result: (VPMULLQMasked256 x y (VPMOVVec64x4ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLQMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpMulLowMaskedInt64x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (MulLowMaskedInt64x8 x y mask) - // result: (VPMULLQMasked512 x y (VPMOVVec64x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPMULLQMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} func rewriteValueAMD64_OpMulMaskedFloat32x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -44177,6 +43751,168 @@ func rewriteValueAMD64_OpMulMaskedFloat64x8(v *Value) bool { return true } } +func rewriteValueAMD64_OpMulMaskedInt16x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt16x16 x y mask) + // result: (VPMULLWMasked256 x y (VPMOVVec16x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt16x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt16x32 x y mask) + // result: (VPMULLWMasked512 x y (VPMOVVec16x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt16x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt16x8 x y mask) + // result: (VPMULLWMasked128 x y (VPMOVVec16x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt32x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt32x16 x y mask) + // result: (VPMULLDMasked512 x y (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + 
v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt32x4(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt32x4 x y mask) + // result: (VPMULLDMasked128 x y (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt32x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt32x8 x y mask) + // result: (VPMULLDMasked256 x y (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt64x2(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt64x2 x y mask) + // result: (VPMULLQMasked128 x y (VPMOVVec64x2ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLQMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt64x4(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt64x4 x y mask) + // result: (VPMULLQMasked256 x y (VPMOVVec64x4ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLQMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpMulMaskedInt64x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (MulMaskedInt64x8 x y mask) + // result: (VPMULLQMasked512 x y (VPMOVVec64x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPMULLQMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} func rewriteValueAMD64_OpNeg32F(v *Value) bool { v_0 := v.Args[0] b := v.Block @@ -48243,21 +47979,9 @@ func rewriteValueAMD64_OpRoundFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundToEven(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledFloat32x16(v *Value) bool { v_0 := v.Args[0] - // match: (RoundToEven x) - // result: (ROUNDSD [0] x) - for { - x := v_0 - v.reset(OpAMD64ROUNDSD) - v.AuxInt = int8ToAuxInt(0) - v.AddArg(x) - return true - } -} -func rewriteValueAMD64_OpRoundWithPrecisionFloat32x16(v *Value) bool { - v_0 := v.Args[0] - // match: (RoundWithPrecisionFloat32x16 [a] x) + // match: (RoundScaledFloat32x16 [a] x) // result: (VRNDSCALEPS512 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -48268,9 +47992,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledFloat32x4(v *Value) bool { v_0 := v.Args[0] - // match: (RoundWithPrecisionFloat32x4 [a] x) + // match: (RoundScaledFloat32x4 [a] x) // result: (VRNDSCALEPS128 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -48281,9 +48005,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionFloat32x8(v 
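
The MulLowMasked*→MulMasked* rename in the hunks above loses no information: VPMULL{W,D,Q} keeps the low half of each widening product, and for same-width operands that is exactly the ordinary wrapping multiply, so plain Mul is arguably the more accurate generic name. A short check of that identity, offered as an illustration rather than anything from the patch:

package main

import "fmt"

func main() {
	// VPMULLW keeps the low 16 bits of each 16x16→32 product —
	// the same result as Go's wrapping int16 multiply.
	a, b := int16(300), int16(500)
	wide := int32(a) * int32(b)
	fmt.Println(a*b == int16(wide)) // true
}
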
*Value) bool { +func rewriteValueAMD64_OpRoundScaledFloat32x8(v *Value) bool { v_0 := v.Args[0] - // match: (RoundWithPrecisionFloat32x8 [a] x) + // match: (RoundScaledFloat32x8 [a] x) // result: (VRNDSCALEPS256 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -48294,9 +48018,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledFloat64x2(v *Value) bool { v_0 := v.Args[0] - // match: (RoundWithPrecisionFloat64x2 [a] x) + // match: (RoundScaledFloat64x2 [a] x) // result: (VRNDSCALEPD128 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -48307,9 +48031,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledFloat64x4(v *Value) bool { v_0 := v.Args[0] - // match: (RoundWithPrecisionFloat64x4 [a] x) + // match: (RoundScaledFloat64x4 [a] x) // result: (VRNDSCALEPD256 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -48320,9 +48044,9 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledFloat64x8(v *Value) bool { v_0 := v.Args[0] - // match: (RoundWithPrecisionFloat64x8 [a] x) + // match: (RoundScaledFloat64x8 [a] x) // result: (VRNDSCALEPD512 [a+0] x) for { a := auxIntToInt8(v.AuxInt) @@ -48333,11 +48057,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledMaskedFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundWithPrecisionMaskedFloat32x16 [a] x mask) + // match: (RoundScaledMaskedFloat32x16 [a] x mask) // result: (VRNDSCALEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -48351,11 +48075,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledMaskedFloat32x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundWithPrecisionMaskedFloat32x4 [a] x mask) + // match: (RoundScaledMaskedFloat32x4 [a] x mask) // result: (VRNDSCALEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -48369,11 +48093,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledMaskedFloat32x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundWithPrecisionMaskedFloat32x8 [a] x mask) + // match: (RoundScaledMaskedFloat32x8 [a] x mask) // result: (VRNDSCALEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -48387,11 +48111,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledMaskedFloat64x2(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundWithPrecisionMaskedFloat64x2 [a] x mask) + // match: (RoundScaledMaskedFloat64x2 [a] x mask) // result: (VRNDSCALEPDMasked128 [a+0] x 
(VPMOVVec64x2ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -48405,11 +48129,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledMaskedFloat64x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundWithPrecisionMaskedFloat64x4 [a] x mask) + // match: (RoundScaledMaskedFloat64x4 [a] x mask) // result: (VRNDSCALEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -48423,11 +48147,11 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpRoundScaledMaskedFloat64x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (RoundWithPrecisionMaskedFloat64x8 [a] x mask) + // match: (RoundScaledMaskedFloat64x8 [a] x mask) // result: (VRNDSCALEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -48441,6 +48165,204 @@ func rewriteValueAMD64_OpRoundWithPrecisionMaskedFloat64x8(v *Value) bool { return true } } +func rewriteValueAMD64_OpRoundScaledResidueFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundScaledResidueFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundScaledResidueFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundScaledResidueFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundScaledResidueFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundScaledResidueFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundScaledResidueFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+0] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 0) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (RoundScaledResidueMaskedFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+0] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := 
b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (RoundScaledResidueMaskedFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+0] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (RoundScaledResidueMaskedFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+0] x (VPMOVVec32x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (RoundScaledResidueMaskedFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+0] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (RoundScaledResidueMaskedFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+0] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpRoundScaledResidueMaskedFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (RoundScaledResidueMaskedFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+0] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 0) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpRoundToEven(v *Value) bool { + v_0 := v.Args[0] + // match: (RoundToEven x) + // result: (ROUNDSD [0] x) + for { + x := v_0 + v.reset(OpAMD64ROUNDSD) + v.AuxInt = int8ToAuxInt(0) + v.AddArg(x) + return true + } +} func rewriteValueAMD64_OpRsh16Ux16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -49829,438 +49751,6 @@ func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpSaturatedAddMaskedInt16x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedInt16x16 x y mask) - // result: (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, 
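
OpRoundToEven itself is untouched by this patch: the rule deleted a few hunks up is re-added verbatim here, after the new RoundScaledResidue block, so only its position in the generated file changes. Its lowering, ROUNDSD with immediate 0, selects round-to-nearest-ties-to-even, the same contract math.RoundToEven documents:

package main

import (
	"fmt"
	"math"
)

func main() {
	// ROUNDSD [0]: round to nearest, ties to even.
	fmt.Println(math.RoundToEven(2.5), math.RoundToEven(3.5)) // 2 4
}
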
y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedInt16x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedInt16x32 x y mask) - // result: (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedInt16x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedInt16x8 x y mask) - // result: (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedInt8x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedInt8x16 x y mask) - // result: (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSBMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedInt8x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedInt8x32 x y mask) - // result: (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSBMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedInt8x64(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedInt8x64 x y mask) - // result: (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSBMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedUint16x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedUint16x16 x y mask) - // result: (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedUint16x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedUint16x32 x y mask) - // result: (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedUint16x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedUint16x8 x y mask) - // result: (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - 
v.reset(OpAMD64VPADDSWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedUint8x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedUint8x16 x y mask) - // result: (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSBMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedUint8x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedUint8x32 x y mask) - // result: (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSBMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedAddMaskedUint8x64(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedAddMaskedUint8x64 x y mask) - // result: (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPADDSBMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedInt16x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedInt16x16 x y mask) - // result: (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedInt16x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedInt16x32 x y mask) - // result: (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedInt16x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedInt16x8 x y mask) - // result: (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedInt8x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedInt8x16 x y mask) - // result: (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSBMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedInt8x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedInt8x32 x y 
mask) - // result: (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSBMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedInt8x64(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedInt8x64 x y mask) - // result: (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSBMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedUint16x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedUint16x16 x y mask) - // result: (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSWMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedUint16x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedUint16x32 x y mask) - // result: (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSWMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedUint16x8(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedUint16x8 x y mask) - // result: (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSWMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedUint8x16(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedUint8x16 x y mask) - // result: (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSBMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedUint8x32(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedUint8x32 x y mask) - // result: (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSBMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedSubMaskedUint8x64(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedSubMaskedUint8x64 x y mask) - // result: (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask)) - for { - x := v_0 - y := v_1 - mask := v_2 - v.reset(OpAMD64VPSUBSBMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg3(x, y, v0) - return true - } -} func 
rewriteValueAMD64_OpSaturatedUnsignedSignedPairDotProdMaskedUint8x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -50375,6 +49865,114 @@ func rewriteValueAMD64_OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32 return true } } +func rewriteValueAMD64_OpScaleMaskedFloat32x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ScaleMaskedFloat32x16 x y mask) + // result: (VSCALEFPSMasked512 x y (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VSCALEFPSMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpScaleMaskedFloat32x4(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ScaleMaskedFloat32x4 x y mask) + // result: (VSCALEFPSMasked128 x y (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VSCALEFPSMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpScaleMaskedFloat32x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ScaleMaskedFloat32x8 x y mask) + // result: (VSCALEFPSMasked256 x y (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VSCALEFPSMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpScaleMaskedFloat64x2(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ScaleMaskedFloat64x2 x y mask) + // result: (VSCALEFPDMasked128 x y (VPMOVVec64x2ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VSCALEFPDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpScaleMaskedFloat64x4(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ScaleMaskedFloat64x4 x y mask) + // result: (VSCALEFPDMasked256 x y (VPMOVVec64x4ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VSCALEFPDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpScaleMaskedFloat64x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (ScaleMaskedFloat64x8 x y mask) + // result: (VSCALEFPDMasked512 x y (VPMOVVec64x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VSCALEFPDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} func rewriteValueAMD64_OpSelect0(v *Value) bool { v_0 := v.Args[0] b := v.Block @@ -54763,6 +54361,222 @@ func rewriteValueAMD64_OpSubMaskedUint8x64(v *Value) bool { return true } } +func rewriteValueAMD64_OpSubSaturatedMaskedInt16x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedInt16x16 x y mask) + // result: (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + 
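
The new ScaleMasked rules map to VSCALEFPS/VSCALEFPD. Per the SDM, VSCALEF multiplies each lane of the first operand by two raised to the floor of the corresponding lane of the second operand; a one-lane sketch (illustrative only):

	import "math"

	// scale models one lane of VSCALEFPD: x * 2^floor(y).
	func scale(x, y float64) float64 {
		return x * math.Exp2(math.Floor(y))
	}
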
v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedInt16x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedInt16x32 x y mask) + // result: (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedInt16x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedInt16x8 x y mask) + // result: (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedInt8x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedInt8x16 x y mask) + // result: (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedInt8x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedInt8x32 x y mask) + // result: (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedInt8x64(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedInt8x64 x y mask) + // result: (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedUint16x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedUint16x16 x y mask) + // result: (VPSUBSWMasked256 x y (VPMOVVec16x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSWMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedUint16x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedUint16x32 x y mask) + // result: (VPSUBSWMasked512 x y (VPMOVVec16x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSWMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedUint16x8(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedUint16x8 x y mask) + // result: (VPSUBSWMasked128 x y (VPMOVVec16x8ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + 
v.reset(OpAMD64VPSUBSWMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedUint8x16(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedUint8x16 x y mask) + // result: (VPSUBSBMasked128 x y (VPMOVVec8x16ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSBMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedUint8x32(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedUint8x32 x y mask) + // result: (VPSUBSBMasked256 x y (VPMOVVec8x32ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSBMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} +func rewriteValueAMD64_OpSubSaturatedMaskedUint8x64(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SubSaturatedMaskedUint8x64 x y mask) + // result: (VPSUBSBMasked512 x y (VPMOVVec8x64ToM mask)) + for { + x := v_0 + y := v_1 + mask := v_2 + v.reset(OpAMD64VPSUBSBMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg3(x, y, v0) + return true + } +} func rewriteValueAMD64_OpTrunc(v *Value) bool { v_0 := v.Args[0] // match: (Trunc x) @@ -54823,9 +54637,9 @@ func rewriteValueAMD64_OpTruncFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledFloat32x16(v *Value) bool { v_0 := v.Args[0] - // match: (TruncWithPrecisionFloat32x16 [a] x) + // match: (TruncScaledFloat32x16 [a] x) // result: (VRNDSCALEPS512 [a+3] x) for { a := auxIntToInt8(v.AuxInt) @@ -54836,9 +54650,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledFloat32x4(v *Value) bool { v_0 := v.Args[0] - // match: (TruncWithPrecisionFloat32x4 [a] x) + // match: (TruncScaledFloat32x4 [a] x) // result: (VRNDSCALEPS128 [a+3] x) for { a := auxIntToInt8(v.AuxInt) @@ -54849,9 +54663,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledFloat32x8(v *Value) bool { v_0 := v.Args[0] - // match: (TruncWithPrecisionFloat32x8 [a] x) + // match: (TruncScaledFloat32x8 [a] x) // result: (VRNDSCALEPS256 [a+3] x) for { a := auxIntToInt8(v.AuxInt) @@ -54862,9 +54676,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledFloat64x2(v *Value) bool { v_0 := v.Args[0] - // match: (TruncWithPrecisionFloat64x2 [a] x) + // match: (TruncScaledFloat64x2 [a] x) // result: (VRNDSCALEPD128 [a+3] x) for { a := auxIntToInt8(v.AuxInt) @@ -54875,9 +54689,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledFloat64x4(v *Value) bool { v_0 := 
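
The SaturatedAddMasked*/SaturatedSubMasked* rules removed above reappear as AddSaturatedMasked*/SubSaturatedMasked*; the lowering to the masked VPADDS*/VPSUBS* instructions is unchanged. These clamp on overflow instead of wrapping. A scalar sketch of one signed-byte lane (SubSaturated is the same with a-b):

	// addSat8 models one lane of VPADDSB: saturating signed addition.
	func addSat8(a, b int8) int8 {
		s := int16(a) + int16(b) // widen so the true sum is representable
		if s > 127 {
			return 127 // clamp at MaxInt8 instead of wrapping
		}
		if s < -128 {
			return -128 // clamp at MinInt8
		}
		return int8(s)
	}
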
v.Args[0] - // match: (TruncWithPrecisionFloat64x4 [a] x) + // match: (TruncScaledFloat64x4 [a] x) // result: (VRNDSCALEPD256 [a+3] x) for { a := auxIntToInt8(v.AuxInt) @@ -54888,9 +54702,9 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledFloat64x8(v *Value) bool { v_0 := v.Args[0] - // match: (TruncWithPrecisionFloat64x8 [a] x) + // match: (TruncScaledFloat64x8 [a] x) // result: (VRNDSCALEPD512 [a+3] x) for { a := auxIntToInt8(v.AuxInt) @@ -54901,11 +54715,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionFloat64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x16(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledMaskedFloat32x16(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (TruncWithPrecisionMaskedFloat32x16 [a] x mask) + // match: (TruncScaledMaskedFloat32x16 [a] x mask) // result: (VRNDSCALEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -54919,11 +54733,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x16(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x4(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledMaskedFloat32x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (TruncWithPrecisionMaskedFloat32x4 [a] x mask) + // match: (TruncScaledMaskedFloat32x4 [a] x mask) // result: (VRNDSCALEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -54937,11 +54751,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x8(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledMaskedFloat32x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (TruncWithPrecisionMaskedFloat32x8 [a] x mask) + // match: (TruncScaledMaskedFloat32x8 [a] x mask) // result: (VRNDSCALEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -54955,11 +54769,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat32x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x2(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledMaskedFloat64x2(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (TruncWithPrecisionMaskedFloat64x2 [a] x mask) + // match: (TruncScaledMaskedFloat64x2 [a] x mask) // result: (VRNDSCALEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -54973,11 +54787,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x2(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x4(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledMaskedFloat64x4(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: (TruncWithPrecisionMaskedFloat64x4 [a] x mask) + // match: (TruncScaledMaskedFloat64x4 [a] x mask) // result: (VRNDSCALEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -54991,11 +54805,11 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x4(v *Value) bool { return true } } -func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x8(v *Value) bool { +func rewriteValueAMD64_OpTruncScaledMaskedFloat64x8(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] b := v.Block - // match: 
(TruncWithPrecisionMaskedFloat64x8 [a] x mask) + // match: (TruncScaledMaskedFloat64x8 [a] x mask) // result: (VRNDSCALEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask)) for { a := auxIntToInt8(v.AuxInt) @@ -55009,6 +54823,192 @@ func rewriteValueAMD64_OpTruncWithPrecisionMaskedFloat64x8(v *Value) bool { return true } } +func rewriteValueAMD64_OpTruncScaledResidueFloat32x16(v *Value) bool { + v_0 := v.Args[0] + // match: (TruncScaledResidueFloat32x16 [a] x) + // result: (VREDUCEPS512 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS512) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueFloat32x4(v *Value) bool { + v_0 := v.Args[0] + // match: (TruncScaledResidueFloat32x4 [a] x) + // result: (VREDUCEPS128 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS128) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueFloat32x8(v *Value) bool { + v_0 := v.Args[0] + // match: (TruncScaledResidueFloat32x8 [a] x) + // result: (VREDUCEPS256 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPS256) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueFloat64x2(v *Value) bool { + v_0 := v.Args[0] + // match: (TruncScaledResidueFloat64x2 [a] x) + // result: (VREDUCEPD128 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD128) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueFloat64x4(v *Value) bool { + v_0 := v.Args[0] + // match: (TruncScaledResidueFloat64x4 [a] x) + // result: (VREDUCEPD256 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD256) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueFloat64x8(v *Value) bool { + v_0 := v.Args[0] + // match: (TruncScaledResidueFloat64x8 [a] x) + // result: (VREDUCEPD512 [a+3] x) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + v.reset(OpAMD64VREDUCEPD512) + v.AuxInt = int8ToAuxInt(a + 3) + v.AddArg(x) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (TruncScaledResidueMaskedFloat32x16 [a] x mask) + // result: (VREDUCEPSMasked512 [a+3] x (VPMOVVec32x16ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked512) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (TruncScaledResidueMaskedFloat32x4 [a] x mask) + // result: (VREDUCEPSMasked128 [a+3] x (VPMOVVec32x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked128) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (TruncScaledResidueMaskedFloat32x8 [a] x mask) + // result: (VREDUCEPSMasked256 [a+3] x (VPMOVVec32x8ToM mask)) + for { + a := 
auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPSMasked256) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x2(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (TruncScaledResidueMaskedFloat64x2 [a] x mask) + // result: (VREDUCEPDMasked128 [a+3] x (VPMOVVec64x2ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked128) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x4(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (TruncScaledResidueMaskedFloat64x4 [a] x mask) + // result: (VREDUCEPDMasked256 [a+3] x (VPMOVVec64x4ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked256) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} +func rewriteValueAMD64_OpTruncScaledResidueMaskedFloat64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (TruncScaledResidueMaskedFloat64x8 [a] x mask) + // result: (VREDUCEPDMasked512 [a+3] x (VPMOVVec64x8ToM mask)) + for { + a := auxIntToInt8(v.AuxInt) + x := v_0 + mask := v_1 + v.reset(OpAMD64VREDUCEPDMasked512) + v.AuxInt = int8ToAuxInt(a + 3) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg2(x, v0) + return true + } +} func rewriteValueAMD64_OpUnsignedSignedQuadDotProdAccumulateMaskedInt32x16(v *Value) bool { v_3 := v.Args[3] v_2 := v.Args[2] diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 7a7367ee1e7..511974ffa1b 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -101,6 +101,44 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
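
The constant folded into AuxInt in these rules composes the full VRNDSCALE/VREDUCE immediate: the SDM places the scale (number of fraction bits to keep) in imm8[7:4] and the rounding mode in imm8[1:0]. RoundScaled* adds 0 (nearest-even) and TruncScaled* adds 3 (toward zero) above; presumably the Floor and Ceil variants, outside this excerpt, add 1 and 2 to match the mode encodings. A sketch of the composition, assuming the intrinsic builder has already shifted the user's scale into the high nibble:

	const (
		rcNearest = 0 // RoundScaled*:  [a+0]
		rcDown    = 1 // FloorScaled*:  [a+1] (assumed; not shown in this hunk)
		rcUp      = 2 // CeilScaled*:   [a+2] (assumed; not shown in this hunk)
		rcTrunc   = 3 // TruncScaled*:  [a+3]
	)

	// rndscaleImm composes imm8 = scale<<4 | mode; scale must fit in 4 bits.
	// e.g. TruncScaled with 2 fraction bits => 0x23.
	func rndscaleImm(scale, mode uint8) int8 {
		return int8(scale<<4 | mode)
	}
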
addF(simdPackage, "Uint64x2.AddMasked", opLen3(ssa.OpAddMaskedUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.AddMasked", opLen3(ssa.OpAddMaskedUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.AddMasked", opLen3(ssa.OpAddMaskedUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.AddPairs", opLen2(ssa.OpAddPairsFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.AddPairs", opLen2(ssa.OpAddPairsFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.AddPairs", opLen2(ssa.OpAddPairsInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.AddPairs", opLen2(ssa.OpAddPairsInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x4.AddPairs", opLen2(ssa.OpAddPairsInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.AddPairs", opLen2(ssa.OpAddPairsInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x8.AddPairs", opLen2(ssa.OpAddPairsUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.AddPairs", opLen2(ssa.OpAddPairsUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x4.AddPairs", opLen2(ssa.OpAddPairsUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.AddPairs", opLen2(ssa.OpAddPairsUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.AddSaturated", opLen2(ssa.OpAddSaturatedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.AddSaturated", opLen2(ssa.OpAddSaturatedUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.AddSaturated", opLen2(ssa.OpAddSaturatedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.AddSaturated", opLen2(ssa.OpAddSaturatedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.AddSaturated", opLen2(ssa.OpAddSaturatedUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.AddSaturated", opLen2(ssa.OpAddSaturatedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.AddSaturated", opLen2(ssa.OpAddSaturatedUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt16x8, 
types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.AddSaturatedMasked", opLen3(ssa.OpAddSaturatedMaskedUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.AddSub", opLen2(ssa.OpAddSubFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.AddSub", opLen2(ssa.OpAddSubFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.AddSub", opLen2(ssa.OpAddSubFloat64x2, types.TypeVec128), sys.AMD64) @@ -217,18 +255,30 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float32x8.Ceil", opLen1(ssa.OpCeilFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.Ceil", opLen1(ssa.OpCeilFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Ceil", opLen1(ssa.OpCeilFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x4.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.CeilWithPrecision", opLen1Imm8(ssa.OpCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.CeilWithPrecisionMasked", opLen2Imm8(ssa.OpCeilWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat32x8, types.TypeVec256, 4), sys.AMD64) + 
addF(simdPackage, "Float32x16.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.CeilScaled", opLen1Imm8(ssa.OpCeilScaledFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.CeilScaledMasked", opLen2Imm8(ssa.OpCeilScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.CeilScaledResidue", opLen1Imm8(ssa.OpCeilScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.CeilScaledResidueMasked", opLen2Imm8(ssa.OpCeilScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) addF(simdPackage, "Float32x4.Compress", opLen2(ssa.OpCompressFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Compress", opLen2(ssa.OpCompressFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Compress", opLen2(ssa.OpCompressFloat32x16, types.TypeVec512), sys.AMD64) @@ -271,54 +321,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Float32x4.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.ConvertToUint32Masked", opLen2(ssa.OpConvertToUint32MaskedFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithCeilWithPrecision", opLen1Imm8(ssa.OpDiffWithCeilWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithCeilWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithCeilWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithFloorWithPrecision", opLen1Imm8(ssa.OpDiffWithFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), 
sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithFloorWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithFloorWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithRoundWithPrecision", opLen1Imm8(ssa.OpDiffWithRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithRoundWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithRoundWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithTruncWithPrecision", opLen1Imm8(ssa.OpDiffWithTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, 
"Float32x16.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.DiffWithTruncWithPrecisionMasked", opLen2Imm8(ssa.OpDiffWithTruncWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) addF(simdPackage, "Float32x4.Div", opLen2(ssa.OpDivFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Div", opLen2(ssa.OpDivFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.Div", opLen2(ssa.OpDivFloat32x16, types.TypeVec512), sys.AMD64) @@ -398,18 +400,30 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float32x8.Floor", opLen1(ssa.OpFloorFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.Floor", opLen1(ssa.OpFloorFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Floor", opLen1(ssa.OpFloorFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x4.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.FloorWithPrecision", opLen1Imm8(ssa.OpFloorWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.FloorWithPrecisionMasked", opLen2Imm8(ssa.OpFloorWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, 
"Float64x8.FloorScaled", opLen1Imm8(ssa.OpFloorScaledFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.FloorScaledMasked", opLen2Imm8(ssa.OpFloorScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.FloorScaledResidue", opLen1Imm8(ssa.OpFloorScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.FloorScaledResidueMasked", opLen2Imm8(ssa.OpFloorScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) addF(simdPackage, "Float32x4.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.FusedMultiplyAdd", opLen3(ssa.OpFusedMultiplyAddFloat32x16, types.TypeVec512), sys.AMD64) @@ -860,18 +874,15 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Float64x2.Mul", opLen2(ssa.OpMulFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Mul", opLen2(ssa.OpMulFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.Mul", opLen2(ssa.OpMulFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.MulByPowOf2", opLen2(ssa.OpMulByPowOf2Float64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x16.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float64x2.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x8.MulByPowOf2Masked", opLen3(ssa.OpMulByPowOf2MaskedFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.Mul", opLen2(ssa.OpMulInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.Mul", opLen2(ssa.OpMulInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.Mul", opLen2(ssa.OpMulInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.Mul", opLen2(ssa.OpMulInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.Mul", opLen2(ssa.OpMulInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.Mul", opLen2(ssa.OpMulInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x2.Mul", opLen2(ssa.OpMulInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int64x4.Mul", opLen2(ssa.OpMulInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.Mul", opLen2(ssa.OpMulInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int64x2.MulEvenWiden", opLen2(ssa.OpMulEvenWidenInt64x2, types.TypeVec128), sys.AMD64) @@ -900,30 +911,21 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint16x8.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x16.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x32.MulHighMasked", opLen3(ssa.OpMulHighMaskedUint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.MulLow", opLen2(ssa.OpMulLowInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.MulLow", opLen2(ssa.OpMulLowInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.MulLow", opLen2(ssa.OpMulLowInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.MulLow", opLen2(ssa.OpMulLowInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.MulLow", opLen2(ssa.OpMulLowInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.MulLow", opLen2(ssa.OpMulLowInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x2.MulLow", opLen2(ssa.OpMulLowInt64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x4.MulLow", opLen2(ssa.OpMulLowInt64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x8.MulLow", opLen2(ssa.OpMulLowInt64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int64x2.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int64x4.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int64x8.MulLowMasked", opLen3(ssa.OpMulLowMaskedInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.MulMasked", opLen3(ssa.OpMulMaskedFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.MulMasked", opLen3(ssa.OpMulMaskedFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.MulMasked", opLen3(ssa.OpMulMaskedFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x2.MulMasked", opLen3(ssa.OpMulMaskedFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.MulMasked", opLen3(ssa.OpMulMaskedFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.MulMasked", opLen3(ssa.OpMulMaskedFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.MulMasked", opLen3(ssa.OpMulMaskedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.MulMasked", opLen3(ssa.OpMulMaskedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.MulMasked", opLen3(ssa.OpMulMaskedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.MulMasked", opLen3(ssa.OpMulMaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.MulMasked", opLen3(ssa.OpMulMaskedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.MulMasked", opLen3(ssa.OpMulMaskedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int64x2.MulMasked", opLen3(ssa.OpMulMaskedInt64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, 
"Int64x4.MulMasked", opLen3(ssa.OpMulMaskedInt64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int64x8.MulMasked", opLen3(ssa.OpMulMaskedInt64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.NotEqual", opLen2(ssa.OpNotEqualFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.NotEqual", opLen2(ssa.OpNotEqualFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.NotEqual", opLen2(ssa.OpNotEqualFloat32x16, types.TypeVec512), sys.AMD64) @@ -1026,30 +1028,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Int16x8.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x32.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Float32x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x2.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x4.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.PairwiseAdd", opLen2(ssa.OpPairwiseAddUint32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float64x2.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat64x2, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Float64x4.PairwiseSub", opLen2(ssa.OpPairwiseSubFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x8.PairwiseSub", opLen2(ssa.OpPairwiseSubInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.PairwiseSub", opLen2(ssa.OpPairwiseSubInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x8.PairwiseSub", opLen2(ssa.OpPairwiseSubUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.PairwiseSub", opLen2(ssa.OpPairwiseSubUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint32x4.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint32x8.PairwiseSub", opLen2(ssa.OpPairwiseSubUint32x8, types.TypeVec256), sys.AMD64) 
addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64) @@ -1306,76 +1284,36 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float32x8.Round", opLen1(ssa.OpRoundFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.Round", opLen1(ssa.OpRoundFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Round", opLen1(ssa.OpRoundFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x4.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.RoundWithPrecision", opLen1Imm8(ssa.OpRoundWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.RoundWithPrecisionMasked", opLen2Imm8(ssa.OpRoundWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Int8x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x16, types.TypeVec256), sys.AMD64) - 
addF(simdPackage, "Uint16x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundScaled", opLen1Imm8(ssa.OpRoundScaledFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundScaledMasked", opLen2Imm8(ssa.OpRoundScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundScaledResidue", opLen1Imm8(ssa.OpRoundScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.RoundScaledResidueMasked", opLen2Imm8(ssa.OpRoundScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) addF(simdPackage, "Int32x4.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x16.SaturatedAddDotProd", 
opLen3(ssa.OpSaturatedAddDotProdInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int32x4.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x8.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x16.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedPairwiseSub", opLen2(ssa.OpSaturatedPairwiseSubInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedPairwiseSub", opLen2(ssa.OpSaturatedPairwiseSubInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x16.SaturatedSub", opLen2(ssa.OpSaturatedSubInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.SaturatedSub", opLen2(ssa.OpSaturatedSubInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.SaturatedSub", opLen2(ssa.OpSaturatedSubInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedSub", opLen2(ssa.OpSaturatedSubInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedSub", opLen2(ssa.OpSaturatedSubInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.SaturatedSub", opLen2(ssa.OpSaturatedSubInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.SaturatedSub", opLen2(ssa.OpSaturatedSubUint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x32.SaturatedSub", opLen2(ssa.OpSaturatedSubUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.SaturatedSub", opLen2(ssa.OpSaturatedSubUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.SaturatedSub", opLen2(ssa.OpSaturatedSubUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, 
"Uint16x16.SaturatedSub", opLen2(ssa.OpSaturatedSubUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x32.SaturatedSub", opLen2(ssa.OpSaturatedSubUint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint8x64, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint16x8.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint16x16.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint16x32.SaturatedSubMasked", opLen3(ssa.OpSaturatedSubMaskedUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x16.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x32.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x64.SaturatedUnsignedSignedPairDotProd", opLen2(ssa.OpSaturatedUnsignedSignedPairDotProdUint8x64, types.TypeVec512), sys.AMD64) @@ -1388,6 +1326,18 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Int8x16.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.SaturatedUnsignedSignedQuadDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.Scale", opLen2(ssa.OpScaleFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.Scale", opLen2(ssa.OpScaleFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.Scale", opLen2(ssa.OpScaleFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.Scale", opLen2(ssa.OpScaleFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.Scale", opLen2(ssa.OpScaleFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.Scale", opLen2(ssa.OpScaleFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float32x16.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float64x2.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x8.ScaleMasked", opLen3(ssa.OpScaleMaskedFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x8.Set128", opLen2Imm8(ssa.OpSet128Float32x8, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Float64x4.Set128", opLen2Imm8(ssa.OpSet128Float64x4, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Int8x32.Set128", opLen2Imm8(ssa.OpSet128Int8x32, types.TypeVec256, 0), sys.AMD64) @@ -1772,22 +1722,72 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint64x2.SubMasked", opLen3(ssa.OpSubMaskedUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.SubMasked", opLen3(ssa.OpSubMaskedUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.SubMasked", opLen3(ssa.OpSubMaskedUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x4.SubPairs", opLen2(ssa.OpSubPairsFloat32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float32x8.SubPairs", opLen2(ssa.OpSubPairsFloat32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Float64x2.SubPairs", opLen2(ssa.OpSubPairsFloat64x2, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Float64x4.SubPairs", opLen2(ssa.OpSubPairsFloat64x4, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.SubPairs", opLen2(ssa.OpSubPairsInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.SubPairs", opLen2(ssa.OpSubPairsInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x4.SubPairs", opLen2(ssa.OpSubPairsInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.SubPairs", opLen2(ssa.OpSubPairsInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x8.SubPairs", opLen2(ssa.OpSubPairsUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.SubPairs", opLen2(ssa.OpSubPairsUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint32x4.SubPairs", opLen2(ssa.OpSubPairsUint32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint32x8.SubPairs", opLen2(ssa.OpSubPairsUint32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x8.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.SubSaturated", opLen2(ssa.OpSubSaturatedInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.SubSaturated", opLen2(ssa.OpSubSaturatedUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.SubSaturated", opLen2(ssa.OpSubSaturatedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.SubSaturated", opLen2(ssa.OpSubSaturatedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.SubSaturated", opLen2(ssa.OpSubSaturatedUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt16x8, 
types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int16x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint16x8.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint16x16.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint16x32.SubSaturatedMasked", opLen3(ssa.OpSubSaturatedMaskedUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.Trunc", opLen1(ssa.OpTruncFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.Trunc", opLen1(ssa.OpTruncFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.Trunc", opLen1(ssa.OpTruncFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Trunc", opLen1(ssa.OpTruncFloat64x4, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Float32x4.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.TruncWithPrecision", opLen1Imm8(ssa.OpTruncWithPrecisionFloat64x8, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float32x4.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float32x8.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float32x16.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) - addF(simdPackage, "Float64x2.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) - addF(simdPackage, "Float64x4.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) - addF(simdPackage, "Float64x8.TruncWithPrecisionMasked", opLen2Imm8(ssa.OpTruncWithPrecisionMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, 
"Float64x4.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.TruncScaled", opLen1Imm8(ssa.OpTruncScaledFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.TruncScaledMasked", opLen2Imm8(ssa.OpTruncScaledMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.TruncScaledResidue", opLen1Imm8(ssa.OpTruncScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float32x4.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat32x4, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float32x8.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat32x8, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float32x16.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat32x16, types.TypeVec512, 4), sys.AMD64) + addF(simdPackage, "Float64x2.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x2, types.TypeVec128, 4), sys.AMD64) + addF(simdPackage, "Float64x4.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x4, types.TypeVec256, 4), sys.AMD64) + addF(simdPackage, "Float64x8.TruncScaledResidueMasked", opLen2Imm8(ssa.OpTruncScaledResidueMaskedFloat64x8, types.TypeVec512, 4), sys.AMD64) addF(simdPackage, "Int8x16.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.UnsignedSignedQuadDotProdAccumulate", opLen3_31(ssa.OpUnsignedSignedQuadDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) diff --git a/src/simd/binary_test.go b/src/simd/binary_test.go index b7daf736f4e..c82bc070e12 100644 --- a/src/simd/binary_test.go +++ b/src/simd/binary_test.go @@ -309,42 +309,42 @@ func TestMul(t *testing.T) { testFloat64x2Binary(t, simd.Float64x2.Mul, mulSlice[float64]) testFloat64x4Binary(t, simd.Float64x4.Mul, mulSlice[float64]) - testInt16x16Binary(t, simd.Int16x16.MulLow, mulSlice[int16]) - testInt16x8Binary(t, simd.Int16x8.MulLow, 
mulSlice[int16]) - testInt32x4Binary(t, simd.Int32x4.MulLow, mulSlice[int32]) - testInt32x8Binary(t, simd.Int32x8.MulLow, mulSlice[int32]) + testInt16x16Binary(t, simd.Int16x16.Mul, mulSlice[int16]) + testInt16x8Binary(t, simd.Int16x8.Mul, mulSlice[int16]) + testInt32x4Binary(t, simd.Int32x4.Mul, mulSlice[int32]) + testInt32x8Binary(t, simd.Int32x8.Mul, mulSlice[int32]) - // testInt8x16Binary(t, simd.Int8x16.MulLow, mulSlice[int8]) // nope - // testInt8x32Binary(t, simd.Int8x32.MulLow, mulSlice[int8]) + // testInt8x16Binary(t, simd.Int8x16.Mul, mulSlice[int8]) // nope + // testInt8x32Binary(t, simd.Int8x32.Mul, mulSlice[int8]) - // TODO we should be able to do these, there's no difference between signed/unsigned mulLow - // testUint16x16Binary(t, simd.Uint16x16.MulLow, mulSlice[uint16]) - // testUint16x8Binary(t, simd.Uint16x8.MulLow, mulSlice[uint16]) - // testUint32x4Binary(t, simd.Uint32x4.MulLow, mulSlice[uint32]) - // testUint32x8Binary(t, simd.Uint32x8.MulLow, mulSlice[uint32]) - // testUint64x2Binary(t, simd.Uint64x2.MulLow, mulSlice[uint64]) - // testUint64x4Binary(t, simd.Uint64x4.MulLow, mulSlice[uint64]) + // TODO we should be able to do these, there's no difference between signed/unsigned Mul + // testUint16x16Binary(t, simd.Uint16x16.Mul, mulSlice[uint16]) + // testUint16x8Binary(t, simd.Uint16x8.Mul, mulSlice[uint16]) + // testUint32x4Binary(t, simd.Uint32x4.Mul, mulSlice[uint32]) + // testUint32x8Binary(t, simd.Uint32x8.Mul, mulSlice[uint32]) + // testUint64x2Binary(t, simd.Uint64x2.Mul, mulSlice[uint64]) + // testUint64x4Binary(t, simd.Uint64x4.Mul, mulSlice[uint64]) - // testUint8x16Binary(t, simd.Uint8x16.MulLow, mulSlice[uint8]) // nope - // testUint8x32Binary(t, simd.Uint8x32.MulLow, mulSlice[uint8]) + // testUint8x16Binary(t, simd.Uint8x16.Mul, mulSlice[uint8]) // nope + // testUint8x32Binary(t, simd.Uint8x32.Mul, mulSlice[uint8]) if simd.HasAVX512() { - testInt64x2Binary(t, simd.Int64x2.MulLow, mulSlice[int64]) // avx512 only - testInt64x4Binary(t, simd.Int64x4.MulLow, mulSlice[int64]) + testInt64x2Binary(t, simd.Int64x2.Mul, mulSlice[int64]) // avx512 only + testInt64x4Binary(t, simd.Int64x4.Mul, mulSlice[int64]) testFloat32x16Binary(t, simd.Float32x16.Mul, mulSlice[float32]) testFloat64x8Binary(t, simd.Float64x8.Mul, mulSlice[float64]) - // testInt8x64Binary(t, simd.Int8x64.MulLow, mulSlice[int8]) // nope - testInt16x32Binary(t, simd.Int16x32.MulLow, mulSlice[int16]) - testInt32x16Binary(t, simd.Int32x16.MulLow, mulSlice[int32]) - testInt64x8Binary(t, simd.Int64x8.MulLow, mulSlice[int64]) - // testUint8x64Binary(t, simd.Uint8x64.MulLow, mulSlice[uint8]) // nope + // testInt8x64Binary(t, simd.Int8x64.Mul, mulSlice[int8]) // nope + testInt16x32Binary(t, simd.Int16x32.Mul, mulSlice[int16]) + testInt32x16Binary(t, simd.Int32x16.Mul, mulSlice[int32]) + testInt64x8Binary(t, simd.Int64x8.Mul, mulSlice[int64]) + // testUint8x64Binary(t, simd.Uint8x64.Mul, mulSlice[uint8]) // nope // TODO signed should do the job - // testUint16x32Binary(t, simd.Uint16x32.MulLow, mulSlice[uint16]) - // testUint32x16Binary(t, simd.Uint32x16.MulLow, mulSlice[uint32]) - // testUint64x8Binary(t, simd.Uint64x8.MulLow, mulSlice[uint64]) + // testUint16x32Binary(t, simd.Uint16x32.Mul, mulSlice[uint16]) + // testUint32x16Binary(t, simd.Uint32x16.Mul, mulSlice[uint32]) + // testUint64x8Binary(t, simd.Uint64x8.Mul, mulSlice[uint64]) } } diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index 5776350fe9f..dc42e73a53a 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -556,6 
+556,242 @@ func (x Uint64x4) AddMasked(y Uint64x4, mask Mask64x4) Uint64x4 // Asm: VPADDQ, CPU Feature: AVX512F func (x Uint64x8) AddMasked(y Uint64x8, mask Mask64x8) Uint64x8 +/* AddPairs */ + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VHADDPS, CPU Feature: AVX +func (x Float32x4) AddPairs(y Float32x4) Float32x4 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VHADDPS, CPU Feature: AVX +func (x Float32x8) AddPairs(y Float32x8) Float32x8 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VHADDPD, CPU Feature: AVX +func (x Float64x2) AddPairs(y Float64x2) Float64x2 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VHADDPD, CPU Feature: AVX +func (x Float64x4) AddPairs(y Float64x4) Float64x4 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDW, CPU Feature: AVX +func (x Int16x8) AddPairs(y Int16x8) Int16x8 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDW, CPU Feature: AVX2 +func (x Int16x16) AddPairs(y Int16x16) Int16x16 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDD, CPU Feature: AVX +func (x Int32x4) AddPairs(y Int32x4) Int32x4 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDD, CPU Feature: AVX2 +func (x Int32x8) AddPairs(y Int32x8) Int32x8 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDW, CPU Feature: AVX +func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDW, CPU Feature: AVX2 +func (x Uint16x16) AddPairs(y Uint16x16) Uint16x16 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDD, CPU Feature: AVX +func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4 + +// AddPairs horizontally adds adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDD, CPU Feature: AVX2 +func (x Uint32x8) AddPairs(y Uint32x8) Uint32x8 + +/* AddPairsSaturated */ + +// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation. 
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDSW, CPU Feature: AVX +func (x Int16x8) AddPairsSaturated(y Int16x8) Int16x8 + +// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. +// +// Asm: VPHADDSW, CPU Feature: AVX2 +func (x Int16x16) AddPairsSaturated(y Int16x16) Int16x16 + +/* AddSaturated */ + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSB, CPU Feature: AVX +func (x Int8x16) AddSaturated(y Int8x16) Int8x16 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSB, CPU Feature: AVX2 +func (x Int8x32) AddSaturated(y Int8x32) Int8x32 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Int8x64) AddSaturated(y Int8x64) Int8x64 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSW, CPU Feature: AVX +func (x Int16x8) AddSaturated(y Int16x8) Int16x8 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSW, CPU Feature: AVX2 +func (x Int16x16) AddSaturated(y Int16x16) Int16x16 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Int16x32) AddSaturated(y Int16x32) Int16x32 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSB, CPU Feature: AVX +func (x Uint8x16) AddSaturated(y Uint8x16) Uint8x16 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSB, CPU Feature: AVX2 +func (x Uint8x32) AddSaturated(y Uint8x32) Uint8x32 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Uint8x64) AddSaturated(y Uint8x64) Uint8x64 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSW, CPU Feature: AVX +func (x Uint16x8) AddSaturated(y Uint16x8) Uint16x8 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSW, CPU Feature: AVX2 +func (x Uint16x16) AddSaturated(y Uint16x16) Uint16x16 + +// AddSaturated adds corresponding elements of two vectors with saturation. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Uint16x32) AddSaturated(y Uint16x32) Uint16x32 + +/* AddSaturatedMasked */ + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Int8x16) AddSaturatedMasked(y Int8x16, mask Mask8x16) Int8x16 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Int8x32) AddSaturatedMasked(y Int8x32, mask Mask8x32) Int8x32 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Int8x64) AddSaturatedMasked(y Int8x64, mask Mask8x64) Int8x64 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. 
+// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Int16x8) AddSaturatedMasked(y Int16x8, mask Mask16x8) Int16x8 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Int16x16) AddSaturatedMasked(y Int16x16, mask Mask16x16) Int16x16 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Int16x32) AddSaturatedMasked(y Int16x32, mask Mask16x32) Int16x32 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Uint8x16) AddSaturatedMasked(y Uint8x16, mask Mask8x16) Uint8x16 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Uint8x32) AddSaturatedMasked(y Uint8x32, mask Mask8x32) Uint8x32 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSB, CPU Feature: AVX512BW +func (x Uint8x64) AddSaturatedMasked(y Uint8x64, mask Mask8x64) Uint8x64 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Uint16x8) AddSaturatedMasked(y Uint16x8, mask Mask16x8) Uint16x8 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Uint16x16) AddSaturatedMasked(y Uint16x16, mask Mask16x16) Uint16x16 + +// AddSaturatedMasked adds corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPADDSW, CPU Feature: AVX512BW +func (x Uint16x32) AddSaturatedMasked(y Uint16x32, mask Mask16x32) Uint16x32 + /* AddSub */ // AddSub subtracts even elements and adds odd elements of two vectors. @@ -1244,105 +1480,205 @@ func (x Float64x2) Ceil() Float64x2 // Asm: VROUNDPD, CPU Feature: AVX func (x Float64x4) Ceil() Float64x4 -/* CeilWithPrecision */ +/* CeilScaled */ -// CeilWithPrecision rounds elements up with specified precision. +// CeilScaled rounds elements up with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) CeilWithPrecision(prec uint8) Float32x4 +func (x Float32x4) CeilScaled(prec uint8) Float32x4 -// CeilWithPrecision rounds elements up with specified precision. +// CeilScaled rounds elements up with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) CeilWithPrecision(prec uint8) Float32x8 +func (x Float32x8) CeilScaled(prec uint8) Float32x8 -// CeilWithPrecision rounds elements up with specified precision. +// CeilScaled rounds elements up with specified precision. 
// // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) CeilWithPrecision(prec uint8) Float32x16 +func (x Float32x16) CeilScaled(prec uint8) Float32x16 -// CeilWithPrecision rounds elements up with specified precision. +// CeilScaled rounds elements up with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) CeilWithPrecision(prec uint8) Float64x2 +func (x Float64x2) CeilScaled(prec uint8) Float64x2 -// CeilWithPrecision rounds elements up with specified precision. +// CeilScaled rounds elements up with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) CeilWithPrecision(prec uint8) Float64x4 +func (x Float64x4) CeilScaled(prec uint8) Float64x4 -// CeilWithPrecision rounds elements up with specified precision. +// CeilScaled rounds elements up with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) CeilWithPrecision(prec uint8) Float64x8 +func (x Float64x8) CeilScaled(prec uint8) Float64x8 -/* CeilWithPrecisionMasked */ +/* CeilScaledMasked */ -// CeilWithPrecisionMasked rounds elements up with specified precision. +// CeilScaledMasked rounds elements up with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) CeilWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 +func (x Float32x4) CeilScaledMasked(prec uint8, mask Mask32x4) Float32x4 -// CeilWithPrecisionMasked rounds elements up with specified precision. +// CeilScaledMasked rounds elements up with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) CeilWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 +func (x Float32x8) CeilScaledMasked(prec uint8, mask Mask32x8) Float32x8 -// CeilWithPrecisionMasked rounds elements up with specified precision. +// CeilScaledMasked rounds elements up with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) CeilWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 +func (x Float32x16) CeilScaledMasked(prec uint8, mask Mask32x16) Float32x16 -// CeilWithPrecisionMasked rounds elements up with specified precision. +// CeilScaledMasked rounds elements up with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) CeilWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 +func (x Float64x2) CeilScaledMasked(prec uint8, mask Mask64x2) Float64x2 -// CeilWithPrecisionMasked rounds elements up with specified precision. +// CeilScaledMasked rounds elements up with specified precision. 
// // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) CeilWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 +func (x Float64x4) CeilScaledMasked(prec uint8, mask Mask64x4) Float64x4 -// CeilWithPrecisionMasked rounds elements up with specified precision. +// CeilScaledMasked rounds elements up with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) CeilWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 +func (x Float64x8) CeilScaledMasked(prec uint8, mask Mask64x8) Float64x8 + +/* CeilScaledResidue */ + +// CeilScaledResidue computes the difference after ceiling with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) CeilScaledResidue(prec uint8) Float32x4 + +// CeilScaledResidue computes the difference after ceiling with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) CeilScaledResidue(prec uint8) Float32x8 + +// CeilScaledResidue computes the difference after ceiling with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) CeilScaledResidue(prec uint8) Float32x16 + +// CeilScaledResidue computes the difference after ceiling with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) CeilScaledResidue(prec uint8) Float64x2 + +// CeilScaledResidue computes the difference after ceiling with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) CeilScaledResidue(prec uint8) Float64x4 + +// CeilScaledResidue computes the difference after ceiling with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) CeilScaledResidue(prec uint8) Float64x8 + +/* CeilScaledResidueMasked */ + +// CeilScaledResidueMasked computes the difference after ceiling with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) CeilScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4 + +// CeilScaledResidueMasked computes the difference after ceiling with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) CeilScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8 + +// CeilScaledResidueMasked computes the difference after ceiling with specified precision. +// +// This operation is applied selectively under a write mask. 
+// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) CeilScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16 + +// CeilScaledResidueMasked computes the difference after ceiling with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) CeilScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2 + +// CeilScaledResidueMasked computes the difference after ceiling with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) CeilScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4 + +// CeilScaledResidueMasked computes the difference after ceiling with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) CeilScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8 /* Compress */ @@ -1606,406 +1942,6 @@ func (x Float32x8) ConvertToUint32Masked(mask Mask32x8) Uint32x8 // Asm: VCVTPS2UDQ, CPU Feature: AVX512F func (x Float32x16) ConvertToUint32Masked(mask Mask32x16) Uint32x16 -/* DiffWithCeilWithPrecision */ - -// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithCeilWithPrecision(prec uint8) Float32x4 - -// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithCeilWithPrecision(prec uint8) Float32x8 - -// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithCeilWithPrecision(prec uint8) Float32x16 - -// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithCeilWithPrecision(prec uint8) Float64x2 - -// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithCeilWithPrecision(prec uint8) Float64x4 - -// DiffWithCeilWithPrecision computes the difference after ceiling with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
-// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithCeilWithPrecision(prec uint8) Float64x8 - -/* DiffWithCeilWithPrecisionMasked */ - -// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 - -// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 - -// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 - -// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 - -// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 - -// DiffWithCeilWithPrecisionMasked computes the difference after ceiling with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithCeilWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 - -/* DiffWithFloorWithPrecision */ - -// DiffWithFloorWithPrecision computes the difference after flooring with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithFloorWithPrecision(prec uint8) Float32x4 - -// DiffWithFloorWithPrecision computes the difference after flooring with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithFloorWithPrecision(prec uint8) Float32x8 - -// DiffWithFloorWithPrecision computes the difference after flooring with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
-// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithFloorWithPrecision(prec uint8) Float32x16 - -// DiffWithFloorWithPrecision computes the difference after flooring with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithFloorWithPrecision(prec uint8) Float64x2 - -// DiffWithFloorWithPrecision computes the difference after flooring with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithFloorWithPrecision(prec uint8) Float64x4 - -// DiffWithFloorWithPrecision computes the difference after flooring with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithFloorWithPrecision(prec uint8) Float64x8 - -/* DiffWithFloorWithPrecisionMasked */ - -// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 - -// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 - -// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 - -// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 - -// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 - -// DiffWithFloorWithPrecisionMasked computes the difference after flooring with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
-// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithFloorWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 - -/* DiffWithRoundWithPrecision */ - -// DiffWithRoundWithPrecision computes the difference after rounding with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithRoundWithPrecision(prec uint8) Float32x4 - -// DiffWithRoundWithPrecision computes the difference after rounding with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithRoundWithPrecision(prec uint8) Float32x8 - -// DiffWithRoundWithPrecision computes the difference after rounding with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithRoundWithPrecision(prec uint8) Float32x16 - -// DiffWithRoundWithPrecision computes the difference after rounding with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithRoundWithPrecision(prec uint8) Float64x2 - -// DiffWithRoundWithPrecision computes the difference after rounding with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithRoundWithPrecision(prec uint8) Float64x4 - -// DiffWithRoundWithPrecision computes the difference after rounding with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithRoundWithPrecision(prec uint8) Float64x8 - -/* DiffWithRoundWithPrecisionMasked */ - -// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 - -// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 - -// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 - -// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
-// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 - -// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 - -// DiffWithRoundWithPrecisionMasked computes the difference after rounding with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithRoundWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 - -/* DiffWithTruncWithPrecision */ - -// DiffWithTruncWithPrecision computes the difference after truncating with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithTruncWithPrecision(prec uint8) Float32x4 - -// DiffWithTruncWithPrecision computes the difference after truncating with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithTruncWithPrecision(prec uint8) Float32x8 - -// DiffWithTruncWithPrecision computes the difference after truncating with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithTruncWithPrecision(prec uint8) Float32x16 - -// DiffWithTruncWithPrecision computes the difference after truncating with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithTruncWithPrecision(prec uint8) Float64x2 - -// DiffWithTruncWithPrecision computes the difference after truncating with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithTruncWithPrecision(prec uint8) Float64x4 - -// DiffWithTruncWithPrecision computes the difference after truncating with specified precision. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithTruncWithPrecision(prec uint8) Float64x8 - -/* DiffWithTruncWithPrecisionMasked */ - -// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x4) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 - -// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
-// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x8) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 - -// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPS, CPU Feature: AVX512DQ -func (x Float32x16) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 - -// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x2) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 - -// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x4) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 - -// DiffWithTruncWithPrecisionMasked computes the difference after truncating with specified precision. -// -// This operation is applied selectively under a write mask. -// -// prec is expected to be a constant, non-constant value will trigger a runtime panic. -// -// Asm: VREDUCEPD, CPU Feature: AVX512DQ -func (x Float64x8) DiffWithTruncWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 - /* Div */ // Div divides elements of two vectors. @@ -2485,105 +2421,205 @@ func (x Float64x2) Floor() Float64x2 // Asm: VROUNDPD, CPU Feature: AVX func (x Float64x4) Floor() Float64x4 -/* FloorWithPrecision */ +/* FloorScaled */ -// FloorWithPrecision rounds elements down with specified precision. +// FloorScaled rounds elements down with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) FloorWithPrecision(prec uint8) Float32x4 +func (x Float32x4) FloorScaled(prec uint8) Float32x4 -// FloorWithPrecision rounds elements down with specified precision. +// FloorScaled rounds elements down with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) FloorWithPrecision(prec uint8) Float32x8 +func (x Float32x8) FloorScaled(prec uint8) Float32x8 -// FloorWithPrecision rounds elements down with specified precision. +// FloorScaled rounds elements down with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) FloorWithPrecision(prec uint8) Float32x16 +func (x Float32x16) FloorScaled(prec uint8) Float32x16 -// FloorWithPrecision rounds elements down with specified precision. +// FloorScaled rounds elements down with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. 
// // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) FloorWithPrecision(prec uint8) Float64x2 +func (x Float64x2) FloorScaled(prec uint8) Float64x2 -// FloorWithPrecision rounds elements down with specified precision. +// FloorScaled rounds elements down with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) FloorWithPrecision(prec uint8) Float64x4 +func (x Float64x4) FloorScaled(prec uint8) Float64x4 -// FloorWithPrecision rounds elements down with specified precision. +// FloorScaled rounds elements down with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) FloorWithPrecision(prec uint8) Float64x8 +func (x Float64x8) FloorScaled(prec uint8) Float64x8 -/* FloorWithPrecisionMasked */ +/* FloorScaledMasked */ -// FloorWithPrecisionMasked rounds elements down with specified precision. +// FloorScaledMasked rounds elements down with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) FloorWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 +func (x Float32x4) FloorScaledMasked(prec uint8, mask Mask32x4) Float32x4 -// FloorWithPrecisionMasked rounds elements down with specified precision. +// FloorScaledMasked rounds elements down with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) FloorWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 +func (x Float32x8) FloorScaledMasked(prec uint8, mask Mask32x8) Float32x8 -// FloorWithPrecisionMasked rounds elements down with specified precision. +// FloorScaledMasked rounds elements down with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) FloorWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 +func (x Float32x16) FloorScaledMasked(prec uint8, mask Mask32x16) Float32x16 -// FloorWithPrecisionMasked rounds elements down with specified precision. +// FloorScaledMasked rounds elements down with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) FloorWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 +func (x Float64x2) FloorScaledMasked(prec uint8, mask Mask64x2) Float64x2 -// FloorWithPrecisionMasked rounds elements down with specified precision. +// FloorScaledMasked rounds elements down with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. 
// // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) FloorWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 +func (x Float64x4) FloorScaledMasked(prec uint8, mask Mask64x4) Float64x4 -// FloorWithPrecisionMasked rounds elements down with specified precision. +// FloorScaledMasked rounds elements down with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) FloorWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 +func (x Float64x8) FloorScaledMasked(prec uint8, mask Mask64x8) Float64x8 + +/* FloorScaledResidue */ + +// FloorScaledResidue computes the difference after flooring with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) FloorScaledResidue(prec uint8) Float32x4 + +// FloorScaledResidue computes the difference after flooring with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) FloorScaledResidue(prec uint8) Float32x8 + +// FloorScaledResidue computes the difference after flooring with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) FloorScaledResidue(prec uint8) Float32x16 + +// FloorScaledResidue computes the difference after flooring with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) FloorScaledResidue(prec uint8) Float64x2 + +// FloorScaledResidue computes the difference after flooring with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) FloorScaledResidue(prec uint8) Float64x4 + +// FloorScaledResidue computes the difference after flooring with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) FloorScaledResidue(prec uint8) Float64x8 + +/* FloorScaledResidueMasked */ + +// FloorScaledResidueMasked computes the difference after flooring with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) FloorScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4 + +// FloorScaledResidueMasked computes the difference after flooring with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) FloorScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8 + +// FloorScaledResidueMasked computes the difference after flooring with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
+// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) FloorScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16 + +// FloorScaledResidueMasked computes the difference after flooring with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) FloorScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2 + +// FloorScaledResidueMasked computes the difference after flooring with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) FloorScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4 + +// FloorScaledResidueMasked computes the difference after flooring with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) FloorScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8 /* FusedMultiplyAdd */ @@ -5427,81 +5463,50 @@ func (x Float64x4) Mul(y Float64x4) Float64x4 // Asm: VMULPD, CPU Feature: AVX512F func (x Float64x8) Mul(y Float64x8) Float64x8 -/* MulByPowOf2 */ +// Mul multiplies corresponding elements of two vectors. +// +// Asm: VPMULLW, CPU Feature: AVX +func (x Int16x8) Mul(y Int16x8) Int16x8 -// MulByPowOf2 multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VSCALEFPS, CPU Feature: AVX512F -func (x Float32x4) MulByPowOf2(y Float32x4) Float32x4 +// Asm: VPMULLW, CPU Feature: AVX2 +func (x Int16x16) Mul(y Int16x16) Int16x16 -// MulByPowOf2 multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VSCALEFPS, CPU Feature: AVX512F -func (x Float32x8) MulByPowOf2(y Float32x8) Float32x8 +// Asm: VPMULLW, CPU Feature: AVX512BW +func (x Int16x32) Mul(y Int16x32) Int16x32 -// MulByPowOf2 multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VSCALEFPS, CPU Feature: AVX512F -func (x Float32x16) MulByPowOf2(y Float32x16) Float32x16 +// Asm: VPMULLD, CPU Feature: AVX +func (x Int32x4) Mul(y Int32x4) Int32x4 -// MulByPowOf2 multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VSCALEFPD, CPU Feature: AVX512F -func (x Float64x2) MulByPowOf2(y Float64x2) Float64x2 +// Asm: VPMULLD, CPU Feature: AVX2 +func (x Int32x8) Mul(y Int32x8) Int32x8 -// MulByPowOf2 multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VSCALEFPD, CPU Feature: AVX512F -func (x Float64x4) MulByPowOf2(y Float64x4) Float64x4 +// Asm: VPMULLD, CPU Feature: AVX512F +func (x Int32x16) Mul(y Int32x16) Int32x16 -// MulByPowOf2 multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// Asm: VSCALEFPD, CPU Feature: AVX512F -func (x Float64x8) MulByPowOf2(y Float64x8) Float64x8 +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Int64x2) Mul(y Int64x2) Int64x2 -/* MulByPowOf2Masked */ +// Mul multiplies corresponding elements of two vectors. 
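The masked residue forms above take the write mask as the trailing argument, after the constant prec. A sketch under the same assumptions as before (package imported as simd); the diff only says the operation "is applied selectively under a write mask", so the zeroing of unselected lanes is an assumption based on AVX-512 zeroing-masking.

package roundingexample

import "simd" // assumed import path for the experimental SIMD package

// floorResidueWhere computes the flooring residue only in the lanes
// selected by m; unselected lanes are assumed to come back zeroed.
func floorResidueWhere(x simd.Float64x4, m simd.Mask64x4) simd.Float64x4 {
	const prec = 0 // must be constant: round down to whole numbers
	return x.FloorScaledResidueMasked(prec, m) // VREDUCEPD under a mask
}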
+// +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Int64x4) Mul(y Int64x4) Int64x4 -// MulByPowOf2Masked multiplies elements by a power of 2. +// Mul multiplies corresponding elements of two vectors. // -// This operation is applied selectively under a write mask. -// -// Asm: VSCALEFPS, CPU Feature: AVX512F -func (x Float32x4) MulByPowOf2Masked(y Float32x4, mask Mask32x4) Float32x4 - -// MulByPowOf2Masked multiplies elements by a power of 2. -// -// This operation is applied selectively under a write mask. -// -// Asm: VSCALEFPS, CPU Feature: AVX512F -func (x Float32x8) MulByPowOf2Masked(y Float32x8, mask Mask32x8) Float32x8 - -// MulByPowOf2Masked multiplies elements by a power of 2. -// -// This operation is applied selectively under a write mask. -// -// Asm: VSCALEFPS, CPU Feature: AVX512F -func (x Float32x16) MulByPowOf2Masked(y Float32x16, mask Mask32x16) Float32x16 - -// MulByPowOf2Masked multiplies elements by a power of 2. -// -// This operation is applied selectively under a write mask. -// -// Asm: VSCALEFPD, CPU Feature: AVX512F -func (x Float64x2) MulByPowOf2Masked(y Float64x2, mask Mask64x2) Float64x2 - -// MulByPowOf2Masked multiplies elements by a power of 2. -// -// This operation is applied selectively under a write mask. -// -// Asm: VSCALEFPD, CPU Feature: AVX512F -func (x Float64x4) MulByPowOf2Masked(y Float64x4, mask Mask64x4) Float64x4 - -// MulByPowOf2Masked multiplies elements by a power of 2. -// -// This operation is applied selectively under a write mask. -// -// Asm: VSCALEFPD, CPU Feature: AVX512F -func (x Float64x8) MulByPowOf2Masked(y Float64x8, mask Mask64x8) Float64x8 +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Int64x8) Mul(y Int64x8) Int64x8 /* MulEvenWiden */ @@ -5691,118 +5696,6 @@ func (x Uint16x16) MulHighMasked(y Uint16x16, mask Mask16x16) Uint16x16 // Asm: VPMULHUW, CPU Feature: AVX512BW func (x Uint16x32) MulHighMasked(y Uint16x32, mask Mask16x32) Uint16x32 -/* MulLow */ - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLW, CPU Feature: AVX -func (x Int16x8) MulLow(y Int16x8) Int16x8 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLW, CPU Feature: AVX2 -func (x Int16x16) MulLow(y Int16x16) Int16x16 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLW, CPU Feature: AVX512BW -func (x Int16x32) MulLow(y Int16x32) Int16x32 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLD, CPU Feature: AVX -func (x Int32x4) MulLow(y Int32x4) Int32x4 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLD, CPU Feature: AVX2 -func (x Int32x8) MulLow(y Int32x8) Int32x8 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLD, CPU Feature: AVX512F -func (x Int32x16) MulLow(y Int32x16) Int32x16 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLQ, CPU Feature: AVX512DQ -func (x Int64x2) MulLow(y Int64x2) Int64x2 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLQ, CPU Feature: AVX512DQ -func (x Int64x4) MulLow(y Int64x4) Int64x4 - -// MulLow multiplies elements and stores the low part of the result. -// -// Asm: VPMULLQ, CPU Feature: AVX512DQ -func (x Int64x8) MulLow(y Int64x8) Int64x8 - -/* MulLowMasked */ - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. 
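After the rename, plain Mul on integer vectors takes over MulLow's role (whose removal continues below): VPMULLW/VPMULLD/VPMULLQ keep only the low half of each product, which is exactly Go's wrapping multiply on the element type. A sketch, assuming the package is imported as simd:

package mulexample

import "simd" // assumed import path for the experimental SIMD package

// lowMul multiplies lane by lane, keeping the low 32 bits of each
// product (VPMULLD). For every lane i the result equals x[i] * y[i]
// computed with Go's ordinary wrapping int32 arithmetic.
func lowMul(x, y simd.Int32x4) simd.Int32x4 {
	return x.Mul(y)
}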
-// -// Asm: VPMULLW, CPU Feature: AVX512BW -func (x Int16x8) MulLowMasked(y Int16x8, mask Mask16x8) Int16x8 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLW, CPU Feature: AVX512BW -func (x Int16x16) MulLowMasked(y Int16x16, mask Mask16x16) Int16x16 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLW, CPU Feature: AVX512BW -func (x Int16x32) MulLowMasked(y Int16x32, mask Mask16x32) Int16x32 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLD, CPU Feature: AVX512F -func (x Int32x4) MulLowMasked(y Int32x4, mask Mask32x4) Int32x4 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLD, CPU Feature: AVX512F -func (x Int32x8) MulLowMasked(y Int32x8, mask Mask32x8) Int32x8 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLD, CPU Feature: AVX512F -func (x Int32x16) MulLowMasked(y Int32x16, mask Mask32x16) Int32x16 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLQ, CPU Feature: AVX512DQ -func (x Int64x2) MulLowMasked(y Int64x2, mask Mask64x2) Int64x2 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLQ, CPU Feature: AVX512DQ -func (x Int64x4) MulLowMasked(y Int64x4, mask Mask64x4) Int64x4 - -// MulLowMasked multiplies elements and stores the low part of the result. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPMULLQ, CPU Feature: AVX512DQ -func (x Int64x8) MulLowMasked(y Int64x8, mask Mask64x8) Int64x8 - /* MulMasked */ // MulMasked multiplies corresponding elements of two vectors. @@ -5847,6 +5740,69 @@ func (x Float64x4) MulMasked(y Float64x4, mask Mask64x4) Float64x4 // Asm: VMULPD, CPU Feature: AVX512F func (x Float64x8) MulMasked(y Float64x8, mask Mask64x8) Float64x8 +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLW, CPU Feature: AVX512BW +func (x Int16x8) MulMasked(y Int16x8, mask Mask16x8) Int16x8 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLW, CPU Feature: AVX512BW +func (x Int16x16) MulMasked(y Int16x16, mask Mask16x16) Int16x16 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLW, CPU Feature: AVX512BW +func (x Int16x32) MulMasked(y Int16x32, mask Mask16x32) Int16x32 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLD, CPU Feature: AVX512F +func (x Int32x4) MulMasked(y Int32x4, mask Mask32x4) Int32x4 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. 
+// +// Asm: VPMULLD, CPU Feature: AVX512F +func (x Int32x8) MulMasked(y Int32x8, mask Mask32x8) Int32x8 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLD, CPU Feature: AVX512F +func (x Int32x16) MulMasked(y Int32x16, mask Mask32x16) Int32x16 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Int64x2) MulMasked(y Int64x2, mask Mask64x2) Int64x2 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Int64x4) MulMasked(y Int64x4, mask Mask64x4) Int64x4 + +// MulMasked multiplies corresponding elements of two vectors. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPMULLQ, CPU Feature: AVX512DQ +func (x Int64x8) MulMasked(y Int64x8, mask Mask64x8) Int64x8 + /* NotEqual */ // NotEqual compares for inequality. @@ -6465,154 +6421,6 @@ func (x Int16x16) PairDotProdMasked(y Int16x16, mask Mask16x16) Int32x8 // Asm: VPMADDWD, CPU Feature: AVX512BW func (x Int16x32) PairDotProdMasked(y Int16x32, mask Mask16x32) Int32x16 -/* PairwiseAdd */ - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VHADDPS, CPU Feature: AVX -func (x Float32x4) PairwiseAdd(y Float32x4) Float32x4 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VHADDPS, CPU Feature: AVX -func (x Float32x8) PairwiseAdd(y Float32x8) Float32x8 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VHADDPD, CPU Feature: AVX -func (x Float64x2) PairwiseAdd(y Float64x2) Float64x2 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VHADDPD, CPU Feature: AVX -func (x Float64x4) PairwiseAdd(y Float64x4) Float64x4 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDW, CPU Feature: AVX -func (x Int16x8) PairwiseAdd(y Int16x8) Int16x8 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDW, CPU Feature: AVX2 -func (x Int16x16) PairwiseAdd(y Int16x16) Int16x16 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDD, CPU Feature: AVX -func (x Int32x4) PairwiseAdd(y Int32x4) Int32x4 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. 
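The integer MulMasked methods above pair the same low-half product with a write mask. Mask construction is outside this hunk, so the sketch takes the mask as a parameter; as with the other Masked variants, zeroing of unselected lanes is an assumption.

package mulexample

import "simd" // assumed import path for the experimental SIMD package

// mulWhere multiplies only the lanes selected by m (VPMULLQ under a
// write mask); the remaining lanes are assumed zero in the result.
func mulWhere(x, y simd.Int64x4, m simd.Mask64x4) simd.Int64x4 {
	return x.MulMasked(y, m)
}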
-// -// Asm: VPHADDD, CPU Feature: AVX2 -func (x Int32x8) PairwiseAdd(y Int32x8) Int32x8 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDW, CPU Feature: AVX -func (x Uint16x8) PairwiseAdd(y Uint16x8) Uint16x8 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDW, CPU Feature: AVX2 -func (x Uint16x16) PairwiseAdd(y Uint16x16) Uint16x16 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDD, CPU Feature: AVX -func (x Uint32x4) PairwiseAdd(y Uint32x4) Uint32x4 - -// PairwiseAdd horizontally adds adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDD, CPU Feature: AVX2 -func (x Uint32x8) PairwiseAdd(y Uint32x8) Uint32x8 - -/* PairwiseSub */ - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VHSUBPS, CPU Feature: AVX -func (x Float32x4) PairwiseSub(y Float32x4) Float32x4 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VHSUBPS, CPU Feature: AVX -func (x Float32x8) PairwiseSub(y Float32x8) Float32x8 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VHSUBPD, CPU Feature: AVX -func (x Float64x2) PairwiseSub(y Float64x2) Float64x2 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VHSUBPD, CPU Feature: AVX -func (x Float64x4) PairwiseSub(y Float64x4) Float64x4 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBW, CPU Feature: AVX -func (x Int16x8) PairwiseSub(y Int16x8) Int16x8 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBW, CPU Feature: AVX2 -func (x Int16x16) PairwiseSub(y Int16x16) Int16x16 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBD, CPU Feature: AVX -func (x Int32x4) PairwiseSub(y Int32x4) Int32x4 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBD, CPU Feature: AVX2 -func (x Int32x8) PairwiseSub(y Int32x8) Int32x8 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] 
and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBW, CPU Feature: AVX -func (x Uint16x8) PairwiseSub(y Uint16x8) Uint16x8 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBW, CPU Feature: AVX2 -func (x Uint16x16) PairwiseSub(y Uint16x16) Uint16x16 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBD, CPU Feature: AVX -func (x Uint32x4) PairwiseSub(y Uint32x4) Uint32x4 - -// PairwiseSub horizontally subtracts adjacent pairs of elements. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBD, CPU Feature: AVX2 -func (x Uint32x8) PairwiseSub(y Uint32x8) Uint32x8 - /* Permute */ // Permute performs a full permutation of vector x using indices: @@ -8547,167 +8355,205 @@ func (x Float64x2) Round() Float64x2 // Asm: VROUNDPD, CPU Feature: AVX func (x Float64x4) Round() Float64x4 -/* RoundWithPrecision */ +/* RoundScaled */ -// RoundWithPrecision rounds elements with specified precision. +// RoundScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) RoundWithPrecision(prec uint8) Float32x4 +func (x Float32x4) RoundScaled(prec uint8) Float32x4 -// RoundWithPrecision rounds elements with specified precision. +// RoundScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) RoundWithPrecision(prec uint8) Float32x8 +func (x Float32x8) RoundScaled(prec uint8) Float32x8 -// RoundWithPrecision rounds elements with specified precision. +// RoundScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) RoundWithPrecision(prec uint8) Float32x16 +func (x Float32x16) RoundScaled(prec uint8) Float32x16 -// RoundWithPrecision rounds elements with specified precision. +// RoundScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) RoundWithPrecision(prec uint8) Float64x2 +func (x Float64x2) RoundScaled(prec uint8) Float64x2 -// RoundWithPrecision rounds elements with specified precision. +// RoundScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) RoundWithPrecision(prec uint8) Float64x4 +func (x Float64x4) RoundScaled(prec uint8) Float64x4 -// RoundWithPrecision rounds elements with specified precision. +// RoundScaled rounds elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. 
// // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) RoundWithPrecision(prec uint8) Float64x8 +func (x Float64x8) RoundScaled(prec uint8) Float64x8 -/* RoundWithPrecisionMasked */ +/* RoundScaledMasked */ -// RoundWithPrecisionMasked rounds elements with specified precision. +// RoundScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) RoundWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 +func (x Float32x4) RoundScaledMasked(prec uint8, mask Mask32x4) Float32x4 -// RoundWithPrecisionMasked rounds elements with specified precision. +// RoundScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) RoundWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 +func (x Float32x8) RoundScaledMasked(prec uint8, mask Mask32x8) Float32x8 -// RoundWithPrecisionMasked rounds elements with specified precision. +// RoundScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) RoundWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 +func (x Float32x16) RoundScaledMasked(prec uint8, mask Mask32x16) Float32x16 -// RoundWithPrecisionMasked rounds elements with specified precision. +// RoundScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) RoundWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 +func (x Float64x2) RoundScaledMasked(prec uint8, mask Mask64x2) Float64x2 -// RoundWithPrecisionMasked rounds elements with specified precision. +// RoundScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) RoundWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 +func (x Float64x4) RoundScaledMasked(prec uint8, mask Mask64x4) Float64x4 -// RoundWithPrecisionMasked rounds elements with specified precision. +// RoundScaledMasked rounds elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) RoundWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 +func (x Float64x8) RoundScaledMasked(prec uint8, mask Mask64x8) Float64x8 -/* SaturatedAdd */ +/* RoundScaledResidue */ -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidue computes the difference after rounding with specified precision. // -// Asm: VPADDSB, CPU Feature: AVX -func (x Int8x16) SaturatedAdd(y Int8x16) Int8x16 - -// SaturatedAdd adds corresponding elements of two vectors with saturation. 
+// prec is expected to be a constant, non-constant value will trigger a runtime panic. // -// Asm: VPADDSB, CPU Feature: AVX2 -func (x Int8x32) SaturatedAdd(y Int8x32) Int8x32 +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) RoundScaledResidue(prec uint8) Float32x4 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidue computes the difference after rounding with specified precision. // -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Int8x64) SaturatedAdd(y Int8x64) Int8x64 +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) RoundScaledResidue(prec uint8) Float32x8 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidue computes the difference after rounding with specified precision. // -// Asm: VPADDSW, CPU Feature: AVX -func (x Int16x8) SaturatedAdd(y Int16x8) Int16x8 +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) RoundScaledResidue(prec uint8) Float32x16 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidue computes the difference after rounding with specified precision. // -// Asm: VPADDSW, CPU Feature: AVX2 -func (x Int16x16) SaturatedAdd(y Int16x16) Int16x16 +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) RoundScaledResidue(prec uint8) Float64x2 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidue computes the difference after rounding with specified precision. // -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Int16x32) SaturatedAdd(y Int16x32) Int16x32 +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) RoundScaledResidue(prec uint8) Float64x4 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidue computes the difference after rounding with specified precision. // -// Asm: VPADDSB, CPU Feature: AVX -func (x Uint8x16) SaturatedAdd(y Uint8x16) Uint8x16 +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) RoundScaledResidue(prec uint8) Float64x8 -// SaturatedAdd adds corresponding elements of two vectors with saturation. -// -// Asm: VPADDSB, CPU Feature: AVX2 -func (x Uint8x32) SaturatedAdd(y Uint8x32) Uint8x32 +/* RoundScaledResidueMasked */ -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidueMasked computes the difference after rounding with specified precision. // -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Uint8x64) SaturatedAdd(y Uint8x64) Uint8x64 +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) RoundScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidueMasked computes the difference after rounding with specified precision. 
// -// Asm: VPADDSW, CPU Feature: AVX -func (x Uint16x8) SaturatedAdd(y Uint16x8) Uint16x8 +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) RoundScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidueMasked computes the difference after rounding with specified precision. // -// Asm: VPADDSW, CPU Feature: AVX2 -func (x Uint16x16) SaturatedAdd(y Uint16x16) Uint16x16 +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) RoundScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16 -// SaturatedAdd adds corresponding elements of two vectors with saturation. +// RoundScaledResidueMasked computes the difference after rounding with specified precision. // -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Uint16x32) SaturatedAdd(y Uint16x32) Uint16x32 +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) RoundScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2 + +// RoundScaledResidueMasked computes the difference after rounding with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) RoundScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4 + +// RoundScaledResidueMasked computes the difference after rounding with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) RoundScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8 /* SaturatedAddDotProd */ @@ -8749,268 +8595,6 @@ func (x Int32x8) SaturatedAddDotProdMasked(y Int16x16, z Int16x16, mask Mask32x8 // Asm: VPDPWSSDS, CPU Feature: AVX512VNNI func (x Int32x16) SaturatedAddDotProdMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16 -/* SaturatedAddMasked */ - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Int8x16) SaturatedAddMasked(y Int8x16, mask Mask8x16) Int8x16 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Int8x32) SaturatedAddMasked(y Int8x32, mask Mask8x32) Int8x32 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Int8x64) SaturatedAddMasked(y Int8x64, mask Mask8x64) Int8x64 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. 
-// -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Int16x8) SaturatedAddMasked(y Int16x8, mask Mask16x8) Int16x8 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Int16x16) SaturatedAddMasked(y Int16x16, mask Mask16x16) Int16x16 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Int16x32) SaturatedAddMasked(y Int16x32, mask Mask16x32) Int16x32 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Uint8x16) SaturatedAddMasked(y Uint8x16, mask Mask8x16) Uint8x16 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Uint8x32) SaturatedAddMasked(y Uint8x32, mask Mask8x32) Uint8x32 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSB, CPU Feature: AVX512BW -func (x Uint8x64) SaturatedAddMasked(y Uint8x64, mask Mask8x64) Uint8x64 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Uint16x8) SaturatedAddMasked(y Uint16x8, mask Mask16x8) Uint16x8 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Uint16x16) SaturatedAddMasked(y Uint16x16, mask Mask16x16) Uint16x16 - -// SaturatedAddMasked adds corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPADDSW, CPU Feature: AVX512BW -func (x Uint16x32) SaturatedAddMasked(y Uint16x32, mask Mask16x32) Uint16x32 - -/* SaturatedPairwiseAdd */ - -// SaturatedPairwiseAdd horizontally adds adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDSW, CPU Feature: AVX -func (x Int16x8) SaturatedPairwiseAdd(y Int16x8) Int16x8 - -// SaturatedPairwiseAdd horizontally adds adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...]. -// -// Asm: VPHADDSW, CPU Feature: AVX2 -func (x Int16x16) SaturatedPairwiseAdd(y Int16x16) Int16x16 - -/* SaturatedPairwiseSub */ - -// SaturatedPairwiseSub horizontally subtracts adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. -// -// Asm: VPHSUBSW, CPU Feature: AVX -func (x Int16x8) SaturatedPairwiseSub(y Int16x8) Int16x8 - -// SaturatedPairwiseSub horizontally subtracts adjacent pairs of elements with saturation. -// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. 
-// -// Asm: VPHSUBSW, CPU Feature: AVX2 -func (x Int16x16) SaturatedPairwiseSub(y Int16x16) Int16x16 - -/* SaturatedSub */ - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSB, CPU Feature: AVX -func (x Int8x16) SaturatedSub(y Int8x16) Int8x16 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSB, CPU Feature: AVX2 -func (x Int8x32) SaturatedSub(y Int8x32) Int8x32 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSB, CPU Feature: AVX512BW -func (x Int8x64) SaturatedSub(y Int8x64) Int8x64 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSW, CPU Feature: AVX -func (x Int16x8) SaturatedSub(y Int16x8) Int16x8 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSW, CPU Feature: AVX2 -func (x Int16x16) SaturatedSub(y Int16x16) Int16x16 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSW, CPU Feature: AVX512BW -func (x Int16x32) SaturatedSub(y Int16x32) Int16x32 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSB, CPU Feature: AVX -func (x Uint8x16) SaturatedSub(y Uint8x16) Uint8x16 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSB, CPU Feature: AVX2 -func (x Uint8x32) SaturatedSub(y Uint8x32) Uint8x32 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSB, CPU Feature: AVX512BW -func (x Uint8x64) SaturatedSub(y Uint8x64) Uint8x64 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSW, CPU Feature: AVX -func (x Uint16x8) SaturatedSub(y Uint16x8) Uint16x8 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSW, CPU Feature: AVX2 -func (x Uint16x16) SaturatedSub(y Uint16x16) Uint16x16 - -// SaturatedSub subtracts corresponding elements of two vectors with saturation. -// -// Asm: VPSUBSW, CPU Feature: AVX512BW -func (x Uint16x32) SaturatedSub(y Uint16x32) Uint16x32 - -/* SaturatedSubMasked */ - -// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPSUBSB, CPU Feature: AVX512BW -func (x Int8x16) SaturatedSubMasked(y Int8x16, mask Mask8x16) Int8x16 - -// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPSUBSB, CPU Feature: AVX512BW -func (x Int8x32) SaturatedSubMasked(y Int8x32, mask Mask8x32) Int8x32 - -// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPSUBSB, CPU Feature: AVX512BW -func (x Int8x64) SaturatedSubMasked(y Int8x64, mask Mask8x64) Int8x64 - -// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPSUBSW, CPU Feature: AVX512BW -func (x Int16x8) SaturatedSubMasked(y Int16x8, mask Mask16x8) Int16x8 - -// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation. -// -// This operation is applied selectively under a write mask. 
-//
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Int16x16) SaturatedSubMasked(y Int16x16, mask Mask16x16) Int16x16
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Int16x32) SaturatedSubMasked(y Int16x32, mask Mask16x32) Int16x32
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Uint8x16) SaturatedSubMasked(y Uint8x16, mask Mask8x16) Uint8x16
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Uint8x32) SaturatedSubMasked(y Uint8x32, mask Mask8x32) Uint8x32
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSB, CPU Feature: AVX512BW
-func (x Uint8x64) SaturatedSubMasked(y Uint8x64, mask Mask8x64) Uint8x64
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Uint16x8) SaturatedSubMasked(y Uint16x8, mask Mask16x8) Uint16x8
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Uint16x16) SaturatedSubMasked(y Uint16x16, mask Mask16x16) Uint16x16
-
-// SaturatedSubMasked subtracts corresponding elements of two vectors with saturation.
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPSUBSW, CPU Feature: AVX512BW
-func (x Uint16x32) SaturatedSubMasked(y Uint16x32, mask Mask16x32) Uint16x32
-
 /* SaturatedUnsignedSignedPairDotProd */

 // SaturatedUnsignedSignedPairDotProd multiplies the elements and adds the pairs together with saturation,
@@ -9097,6 +8681,82 @@ func (x Int8x32) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x32,
 // Asm: VPDPBUSDS, CPU Feature: AVX512VNNI
 func (x Int8x64) SaturatedUnsignedSignedQuadDotProdAccumulateMasked(y Uint8x64, z Int32x16, mask Mask32x16) Int32x16

+/* Scale */
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x4) Scale(y Float32x4) Float32x4
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x8) Scale(y Float32x8) Float32x8
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x16) Scale(y Float32x16) Float32x16
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x2) Scale(y Float64x2) Float64x2
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x4) Scale(y Float64x4) Float64x4
+
+// Scale multiplies elements by a power of 2.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x8) Scale(y Float64x8) Float64x8
+
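The Scale methods map to VSCALEFPS/VSCALEFPD, which multiply each element of x by 2 raised to floor(y), elementwise. A minimal scalar sketch of that semantics; scaleRef is an illustrative helper, not part of the package, and NaN/Inf handling is ignored:

package main

import (
	"fmt"
	"math"
)

// scaleRef models the elementwise behavior of Scale: each element of x
// is multiplied by 2^floor(y[i]). Finite values with small exponents
// are assumed; the hardware instructions also define results for NaN
// and infinite inputs, which this sketch does not reproduce.
func scaleRef(x, y []float64) []float64 {
	out := make([]float64, len(x))
	for i := range x {
		out[i] = math.Ldexp(x[i], int(math.Floor(y[i])))
	}
	return out
}

func main() {
	fmt.Println(scaleRef([]float64{1.5, -3, 10}, []float64{3, 0.5, -1}))
	// Output: [12 -3 5]
}

ScaleMasked, next, computes the same product but only in the lanes selected by the write mask.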
+/* ScaleMasked */
+
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x4) ScaleMasked(y Float32x4, mask Mask32x4) Float32x4
+
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x8) ScaleMasked(y Float32x8, mask Mask32x8) Float32x8
+
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512F
+func (x Float32x16) ScaleMasked(y Float32x16, mask Mask32x16) Float32x16
+
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x2) ScaleMasked(y Float64x2, mask Mask64x2) Float64x2
+
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x4) ScaleMasked(y Float64x4, mask Mask64x4) Float64x4
+
+// ScaleMasked multiplies elements by a power of 2.
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512F
+func (x Float64x8) ScaleMasked(y Float64x8, mask Mask64x8) Float64x8
+
 /* Set128 */

 // Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
@@ -11753,6 +11413,242 @@ func (x Uint64x4) SubMasked(y Uint64x4, mask Mask64x4) Uint64x4
 // Asm: VPSUBQ, CPU Feature: AVX512F
 func (x Uint64x8) SubMasked(y Uint64x8, mask Mask64x8) Uint64x8

+/* SubPairs */
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPS, CPU Feature: AVX
+func (x Float32x4) SubPairs(y Float32x4) Float32x4
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPS, CPU Feature: AVX
+func (x Float32x8) SubPairs(y Float32x8) Float32x8
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPD, CPU Feature: AVX
+func (x Float64x2) SubPairs(y Float64x2) Float64x2
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPD, CPU Feature: AVX
+func (x Float64x4) SubPairs(y Float64x4) Float64x4
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX
+func (x Int16x8) SubPairs(y Int16x8) Int16x8
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX2
+func (x Int16x16) SubPairs(y Int16x16) Int16x16
+
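The lane order in these comments (pair differences from y in the low half, from x in the high half) is easy to misread, so here is a toy scalar model; subPairsRef is an illustrative helper over a 4-lane vector, not a real simd type:

package main

import "fmt"

// subPairsRef models SubPairs on a 4-lane toy vector: adjacent-pair
// differences of y fill the low half of the result and adjacent-pair
// differences of x fill the high half, matching the documented order.
func subPairsRef(x, y [4]int16) [4]int16 {
	return [4]int16{
		y[0] - y[1], y[2] - y[3],
		x[0] - x[1], x[2] - x[3],
	}
}

func main() {
	x := [4]int16{10, 1, 20, 2}
	y := [4]int16{5, 3, 8, 8}
	fmt.Println(subPairsRef(x, y)) // [2 0 9 18]
}

SubPairsSaturated, below, has the same shape but clamps each difference to the element range instead of wrapping.

+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].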
+// +// Asm: VPHSUBD, CPU Feature: AVX +func (x Int32x4) SubPairs(y Int32x4) Int32x4 + +// SubPairs horizontally subtracts adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBD, CPU Feature: AVX2 +func (x Int32x8) SubPairs(y Int32x8) Int32x8 + +// SubPairs horizontally subtracts adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBW, CPU Feature: AVX +func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8 + +// SubPairs horizontally subtracts adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBW, CPU Feature: AVX2 +func (x Uint16x16) SubPairs(y Uint16x16) Uint16x16 + +// SubPairs horizontally subtracts adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBD, CPU Feature: AVX +func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4 + +// SubPairs horizontally subtracts adjacent pairs of elements. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBD, CPU Feature: AVX2 +func (x Uint32x8) SubPairs(y Uint32x8) Uint32x8 + +/* SubPairsSaturated */ + +// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBSW, CPU Feature: AVX +func (x Int16x8) SubPairsSaturated(y Int16x8) Int16x8 + +// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation. +// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...]. +// +// Asm: VPHSUBSW, CPU Feature: AVX2 +func (x Int16x16) SubPairsSaturated(y Int16x16) Int16x16 + +/* SubSaturated */ + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSB, CPU Feature: AVX +func (x Int8x16) SubSaturated(y Int8x16) Int8x16 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSB, CPU Feature: AVX2 +func (x Int8x32) SubSaturated(y Int8x32) Int8x32 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Int8x64) SubSaturated(y Int8x64) Int8x64 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSW, CPU Feature: AVX +func (x Int16x8) SubSaturated(y Int16x8) Int16x8 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSW, CPU Feature: AVX2 +func (x Int16x16) SubSaturated(y Int16x16) Int16x16 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Int16x32) SubSaturated(y Int16x32) Int16x32 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSB, CPU Feature: AVX +func (x Uint8x16) SubSaturated(y Uint8x16) Uint8x16 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. 
+// +// Asm: VPSUBSB, CPU Feature: AVX2 +func (x Uint8x32) SubSaturated(y Uint8x32) Uint8x32 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Uint8x64) SubSaturated(y Uint8x64) Uint8x64 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSW, CPU Feature: AVX +func (x Uint16x8) SubSaturated(y Uint16x8) Uint16x8 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSW, CPU Feature: AVX2 +func (x Uint16x16) SubSaturated(y Uint16x16) Uint16x16 + +// SubSaturated subtracts corresponding elements of two vectors with saturation. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Uint16x32) SubSaturated(y Uint16x32) Uint16x32 + +/* SubSaturatedMasked */ + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Int8x16) SubSaturatedMasked(y Int8x16, mask Mask8x16) Int8x16 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Int8x32) SubSaturatedMasked(y Int8x32, mask Mask8x32) Int8x32 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Int8x64) SubSaturatedMasked(y Int8x64, mask Mask8x64) Int8x64 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Int16x8) SubSaturatedMasked(y Int16x8, mask Mask16x8) Int16x8 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Int16x16) SubSaturatedMasked(y Int16x16, mask Mask16x16) Int16x16 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Int16x32) SubSaturatedMasked(y Int16x32, mask Mask16x32) Int16x32 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Uint8x16) SubSaturatedMasked(y Uint8x16, mask Mask8x16) Uint8x16 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Uint8x32) SubSaturatedMasked(y Uint8x32, mask Mask8x32) Uint8x32 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSB, CPU Feature: AVX512BW +func (x Uint8x64) SubSaturatedMasked(y Uint8x64, mask Mask8x64) Uint8x64 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. 
+// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Uint16x8) SubSaturatedMasked(y Uint16x8, mask Mask16x8) Uint16x8 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Uint16x16) SubSaturatedMasked(y Uint16x16, mask Mask16x16) Uint16x16 + +// SubSaturatedMasked subtracts corresponding elements of two vectors with saturation. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPSUBSW, CPU Feature: AVX512BW +func (x Uint16x32) SubSaturatedMasked(y Uint16x32, mask Mask16x32) Uint16x32 + /* Trunc */ // Trunc truncates elements towards zero. @@ -11775,105 +11671,205 @@ func (x Float64x2) Trunc() Float64x2 // Asm: VROUNDPD, CPU Feature: AVX func (x Float64x4) Trunc() Float64x4 -/* TruncWithPrecision */ +/* TruncScaled */ -// TruncWithPrecision truncates elements with specified precision. +// TruncScaled truncates elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) TruncWithPrecision(prec uint8) Float32x4 +func (x Float32x4) TruncScaled(prec uint8) Float32x4 -// TruncWithPrecision truncates elements with specified precision. +// TruncScaled truncates elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) TruncWithPrecision(prec uint8) Float32x8 +func (x Float32x8) TruncScaled(prec uint8) Float32x8 -// TruncWithPrecision truncates elements with specified precision. +// TruncScaled truncates elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) TruncWithPrecision(prec uint8) Float32x16 +func (x Float32x16) TruncScaled(prec uint8) Float32x16 -// TruncWithPrecision truncates elements with specified precision. +// TruncScaled truncates elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) TruncWithPrecision(prec uint8) Float64x2 +func (x Float64x2) TruncScaled(prec uint8) Float64x2 -// TruncWithPrecision truncates elements with specified precision. +// TruncScaled truncates elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) TruncWithPrecision(prec uint8) Float64x4 +func (x Float64x4) TruncScaled(prec uint8) Float64x4 -// TruncWithPrecision truncates elements with specified precision. +// TruncScaled truncates elements with specified precision. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) TruncWithPrecision(prec uint8) Float64x8 +func (x Float64x8) TruncScaled(prec uint8) Float64x8 -/* TruncWithPrecisionMasked */ +/* TruncScaledMasked */ -// TruncWithPrecisionMasked truncates elements with specified precision. +// TruncScaledMasked truncates elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. 
// // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x4) TruncWithPrecisionMasked(prec uint8, mask Mask32x4) Float32x4 +func (x Float32x4) TruncScaledMasked(prec uint8, mask Mask32x4) Float32x4 -// TruncWithPrecisionMasked truncates elements with specified precision. +// TruncScaledMasked truncates elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x8) TruncWithPrecisionMasked(prec uint8, mask Mask32x8) Float32x8 +func (x Float32x8) TruncScaledMasked(prec uint8, mask Mask32x8) Float32x8 -// TruncWithPrecisionMasked truncates elements with specified precision. +// TruncScaledMasked truncates elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPS, CPU Feature: AVX512F -func (x Float32x16) TruncWithPrecisionMasked(prec uint8, mask Mask32x16) Float32x16 +func (x Float32x16) TruncScaledMasked(prec uint8, mask Mask32x16) Float32x16 -// TruncWithPrecisionMasked truncates elements with specified precision. +// TruncScaledMasked truncates elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x2) TruncWithPrecisionMasked(prec uint8, mask Mask64x2) Float64x2 +func (x Float64x2) TruncScaledMasked(prec uint8, mask Mask64x2) Float64x2 -// TruncWithPrecisionMasked truncates elements with specified precision. +// TruncScaledMasked truncates elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x4) TruncWithPrecisionMasked(prec uint8, mask Mask64x4) Float64x4 +func (x Float64x4) TruncScaledMasked(prec uint8, mask Mask64x4) Float64x4 -// TruncWithPrecisionMasked truncates elements with specified precision. +// TruncScaledMasked truncates elements with specified precision. // // This operation is applied selectively under a write mask. // // prec is expected to be a constant, non-constant value will trigger a runtime panic. // // Asm: VRNDSCALEPD, CPU Feature: AVX512F -func (x Float64x8) TruncWithPrecisionMasked(prec uint8, mask Mask64x8) Float64x8 +func (x Float64x8) TruncScaledMasked(prec uint8, mask Mask64x8) Float64x8 + +/* TruncScaledResidue */ + +// TruncScaledResidue computes the difference after truncating with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) TruncScaledResidue(prec uint8) Float32x4 + +// TruncScaledResidue computes the difference after truncating with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) TruncScaledResidue(prec uint8) Float32x8 + +// TruncScaledResidue computes the difference after truncating with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
+// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) TruncScaledResidue(prec uint8) Float32x16 + +// TruncScaledResidue computes the difference after truncating with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) TruncScaledResidue(prec uint8) Float64x2 + +// TruncScaledResidue computes the difference after truncating with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) TruncScaledResidue(prec uint8) Float64x4 + +// TruncScaledResidue computes the difference after truncating with specified precision. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) TruncScaledResidue(prec uint8) Float64x8 + +/* TruncScaledResidueMasked */ + +// TruncScaledResidueMasked computes the difference after truncating with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x4) TruncScaledResidueMasked(prec uint8, mask Mask32x4) Float32x4 + +// TruncScaledResidueMasked computes the difference after truncating with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x8) TruncScaledResidueMasked(prec uint8, mask Mask32x8) Float32x8 + +// TruncScaledResidueMasked computes the difference after truncating with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPS, CPU Feature: AVX512DQ +func (x Float32x16) TruncScaledResidueMasked(prec uint8, mask Mask32x16) Float32x16 + +// TruncScaledResidueMasked computes the difference after truncating with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x2) TruncScaledResidueMasked(prec uint8, mask Mask64x2) Float64x2 + +// TruncScaledResidueMasked computes the difference after truncating with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. +// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x4) TruncScaledResidueMasked(prec uint8, mask Mask64x4) Float64x4 + +// TruncScaledResidueMasked computes the difference after truncating with specified precision. +// +// This operation is applied selectively under a write mask. +// +// prec is expected to be a constant, non-constant value will trigger a runtime panic. 
+// +// Asm: VREDUCEPD, CPU Feature: AVX512DQ +func (x Float64x8) TruncScaledResidueMasked(prec uint8, mask Mask64x8) Float64x8 /* UnsignedSignedQuadDotProdAccumulate */ diff --git a/src/simd/unary_test.go b/src/simd/unary_test.go index 4263b81cd73..c9fdfff0ffc 100644 --- a/src/simd/unary_test.go +++ b/src/simd/unary_test.go @@ -89,20 +89,20 @@ func TestToInt32(t *testing.T) { testFloat32x8UnaryToInt32(t, simd.Float32x8.ConvertToInt32, toInt32Slice[float32]) } -func TestDiffWithCeilWithPrecision(t *testing.T) { +func TestCeilScaledResidue(t *testing.T) { if !simd.HasAVX512() { t.Skip("Needs AVX512") } testFloat64x8UnaryFlaky(t, - func(x simd.Float64x8) simd.Float64x8 { return x.DiffWithCeilWithPrecision(0) }, + func(x simd.Float64x8) simd.Float64x8 { return x.CeilScaledResidue(0) }, map1(ceilResidueForPrecision[float64](0)), 0.001) testFloat64x8UnaryFlaky(t, - func(x simd.Float64x8) simd.Float64x8 { return x.DiffWithCeilWithPrecision(1) }, + func(x simd.Float64x8) simd.Float64x8 { return x.CeilScaledResidue(1) }, map1(ceilResidueForPrecision[float64](1)), 0.001) testFloat64x8Unary(t, - func(x simd.Float64x8) simd.Float64x8 { return x.Sub(x.CeilWithPrecision(0)) }, + func(x simd.Float64x8) simd.Float64x8 { return x.Sub(x.CeilScaled(0)) }, map1[float64](func(x float64) float64 { return x - math.Ceil(x) })) }
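With the rename, the scalar identity these tests exercise is worth spelling out: CeilScaledResidue(prec) is x minus CeilScaled(prec), elementwise, i.e. the part discarded when x is rounded up on a grid of spacing 2^-prec; at prec = 0 this reduces to the x - math.Ceil(x) check above. A rough scalar model follows; the helper names are illustrative, and VREDUCEPS/VREDUCEPD edge cases for very large inputs are ignored:

package main

import (
	"fmt"
	"math"
)

// ceilScaledResidueRef models CeilScaledResidue(prec) for one element:
// snap x up to a grid of spacing 2^-prec, then return what the
// rounding discarded. At prec == 0 this is x - math.Ceil(x).
func ceilScaledResidueRef(x float64, prec uint8) float64 {
	scale := math.Ldexp(1, int(prec)) // 2^prec
	return x - math.Ceil(x*scale)/scale
}

// truncScaledResidueRef is the TruncScaledResidue analogue: the grid
// point is chosen toward zero instead of toward +Inf.
func truncScaledResidueRef(x float64, prec uint8) float64 {
	scale := math.Ldexp(1, int(prec))
	return x - math.Trunc(x*scale)/scale
}

func main() {
	fmt.Println(ceilScaledResidueRef(2.3, 1))  // ≈ -0.2 (2.3 - 2.5)
	fmt.Println(truncScaledResidueRef(2.3, 1)) // ≈ 0.3 (2.3 - 2.0)
}

The 0.001 tolerance in testFloat64x8UnaryFlaky presumably absorbs the last-bit differences between this kind of double rounding through x*scale and the residue the hardware computes directly.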