diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index d4126cef1e3..15ffbf66fa7 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -813,7 +813,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPCMPUQMasked512: p = simdV2kkImm8(s, v) - case ssa.OpAMD64VFMADD213PS128, + case ssa.OpAMD64VPDPWSSD128, + ssa.OpAMD64VPDPWSSD256, + ssa.OpAMD64VPDPWSSD512, + ssa.OpAMD64VFMADD213PS128, ssa.OpAMD64VFMADD213PS256, ssa.OpAMD64VFMADD213PS512, ssa.OpAMD64VFMADD213PD128, @@ -831,9 +834,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VFMSUBADD213PD128, ssa.OpAMD64VFMSUBADD213PD256, ssa.OpAMD64VFMSUBADD213PD512, - ssa.OpAMD64VPDPWSSD128, - ssa.OpAMD64VPDPWSSD256, - ssa.OpAMD64VPDPWSSD512, ssa.OpAMD64VPERMI2B128, ssa.OpAMD64VPERMI2B256, ssa.OpAMD64VPERMI2B512, @@ -881,7 +881,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPDPBUSD512: p = simdV31ResultInArg0(s, v) - case ssa.OpAMD64VFMADD213PSMasked128, + case ssa.OpAMD64VPDPWSSDMasked128, + ssa.OpAMD64VPDPWSSDMasked256, + ssa.OpAMD64VPDPWSSDMasked512, + ssa.OpAMD64VFMADD213PSMasked128, ssa.OpAMD64VFMADD213PSMasked256, ssa.OpAMD64VFMADD213PSMasked512, ssa.OpAMD64VFMADD213PDMasked128, @@ -899,9 +902,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VFMSUBADD213PDMasked128, ssa.OpAMD64VFMSUBADD213PDMasked256, ssa.OpAMD64VFMSUBADD213PDMasked512, - ssa.OpAMD64VPDPWSSDMasked128, - ssa.OpAMD64VPDPWSSDMasked256, - ssa.OpAMD64VPDPWSSDMasked512, ssa.OpAMD64VPERMI2BMasked128, ssa.OpAMD64VPERMI2BMasked256, ssa.OpAMD64VPERMI2BMasked512, @@ -1064,6 +1064,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPABSQMasked128, ssa.OpAMD64VPABSQMasked256, ssa.OpAMD64VPABSQMasked512, + ssa.OpAMD64VPDPWSSDMasked128, + ssa.OpAMD64VPDPWSSDMasked256, + ssa.OpAMD64VPDPWSSDMasked512, ssa.OpAMD64VADDPSMasked128, ssa.OpAMD64VADDPSMasked256, ssa.OpAMD64VADDPSMasked512, @@ -1280,9 +1283,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPORQMasked128, ssa.OpAMD64VPORQMasked256, ssa.OpAMD64VPORQMasked512, - ssa.OpAMD64VPDPWSSDMasked128, - ssa.OpAMD64VPDPWSSDMasked256, - ssa.OpAMD64VPDPWSSDMasked512, ssa.OpAMD64VPMADDWDMasked128, ssa.OpAMD64VPMADDWDMasked256, ssa.OpAMD64VPMADDWDMasked512, @@ -1354,15 +1354,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPRORVQMasked128, ssa.OpAMD64VPRORVQMasked256, ssa.OpAMD64VPRORVQMasked512, + ssa.OpAMD64VPDPWSSDSMasked128, + ssa.OpAMD64VPDPWSSDSMasked256, + ssa.OpAMD64VPDPWSSDSMasked512, ssa.OpAMD64VPADDSBMasked128, ssa.OpAMD64VPADDSBMasked256, ssa.OpAMD64VPADDSBMasked512, ssa.OpAMD64VPADDSWMasked128, ssa.OpAMD64VPADDSWMasked256, ssa.OpAMD64VPADDSWMasked512, - ssa.OpAMD64VPDPWSSDSMasked128, - ssa.OpAMD64VPDPWSSDSMasked256, - ssa.OpAMD64VPDPWSSDSMasked512, ssa.OpAMD64VPSUBSBMasked128, ssa.OpAMD64VPSUBSBMasked256, ssa.OpAMD64VPSUBSBMasked512, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 38b602f35b8..7b7cbb9dc76 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -54,6 +54,12 @@ (AddUint64x2 ...) => (VPADDQ128 ...) (AddUint64x4 ...) => (VPADDQ256 ...) (AddUint64x8 ...) => (VPADDQ512 ...) +(AddDotProdInt32x4 ...) => (VPDPWSSD128 ...) +(AddDotProdInt32x8 ...) => (VPDPWSSD256 ...) +(AddDotProdInt32x16 ...) => (VPDPWSSD512 ...) +(AddDotProdMaskedInt32x4 x y z mask) => (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM mask)) +(AddDotProdMaskedInt32x8 x y z mask) => (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM mask)) +(AddDotProdMaskedInt32x16 x y z mask) => (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM mask)) (AddMaskedFloat32x4 x y mask) => (VADDPSMasked128 x y (VPMOVVec32x4ToM mask)) (AddMaskedFloat32x8 x y mask) => (VADDPSMasked256 x y (VPMOVVec32x8ToM mask)) (AddMaskedFloat32x16 x y mask) => (VADDPSMasked512 x y (VPMOVVec32x16ToM mask)) @@ -994,12 +1000,6 @@ (PairDotProdInt16x8 ...) => (VPMADDWD128 ...) (PairDotProdInt16x16 ...) => (VPMADDWD256 ...) (PairDotProdInt16x32 ...) => (VPMADDWD512 ...) -(PairDotProdAccumulateInt32x4 ...) => (VPDPWSSD128 ...) -(PairDotProdAccumulateInt32x8 ...) => (VPDPWSSD256 ...) -(PairDotProdAccumulateInt32x16 ...) => (VPDPWSSD512 ...) -(PairDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM mask)) -(PairDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM mask)) -(PairDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM mask)) (PairDotProdMaskedInt16x8 x y mask) => (VPMADDWDMasked128 x y (VPMOVVec16x8ToM mask)) (PairDotProdMaskedInt16x16 x y mask) => (VPMADDWDMasked256 x y (VPMOVVec16x16ToM mask)) (PairDotProdMaskedInt16x32 x y mask) => (VPMADDWDMasked512 x y (VPMOVVec16x32ToM mask)) @@ -1307,6 +1307,12 @@ (SaturatedAddUint16x8 ...) => (VPADDSW128 ...) (SaturatedAddUint16x16 ...) => (VPADDSW256 ...) (SaturatedAddUint16x32 ...) => (VPADDSW512 ...) +(SaturatedAddDotProdInt32x4 ...) => (VPDPWSSDS128 ...) +(SaturatedAddDotProdInt32x8 ...) => (VPDPWSSDS256 ...) +(SaturatedAddDotProdInt32x16 ...) => (VPDPWSSDS512 ...) +(SaturatedAddDotProdMaskedInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM mask)) +(SaturatedAddDotProdMaskedInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM mask)) +(SaturatedAddDotProdMaskedInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM mask)) (SaturatedAddMaskedInt8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM mask)) (SaturatedAddMaskedInt8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM mask)) (SaturatedAddMaskedInt8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM mask)) @@ -1319,12 +1325,6 @@ (SaturatedAddMaskedUint16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM mask)) (SaturatedAddMaskedUint16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM mask)) (SaturatedAddMaskedUint16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM mask)) -(SaturatedPairDotProdAccumulateInt32x4 ...) => (VPDPWSSDS128 ...) -(SaturatedPairDotProdAccumulateInt32x8 ...) => (VPDPWSSDS256 ...) -(SaturatedPairDotProdAccumulateInt32x16 ...) => (VPDPWSSDS512 ...) -(SaturatedPairDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM mask)) -(SaturatedPairDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM mask)) -(SaturatedPairDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM mask)) (SaturatedPairwiseAddInt16x8 ...) => (VPHADDSW128 ...) (SaturatedPairwiseAddInt16x16 ...) => (VPHADDSW256 ...) (SaturatedPairwiseSubInt16x8 ...) => (VPHSUBSW128 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index d681620bc39..6853c3b0919 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -27,6 +27,12 @@ func simdGenericOps() []opData { {name: "AbsoluteMaskedInt64x2", argLength: 2, commutative: false}, {name: "AbsoluteMaskedInt64x4", argLength: 2, commutative: false}, {name: "AbsoluteMaskedInt64x8", argLength: 2, commutative: false}, + {name: "AddDotProdInt32x4", argLength: 3, commutative: false}, + {name: "AddDotProdInt32x8", argLength: 3, commutative: false}, + {name: "AddDotProdInt32x16", argLength: 3, commutative: false}, + {name: "AddDotProdMaskedInt32x4", argLength: 4, commutative: false}, + {name: "AddDotProdMaskedInt32x8", argLength: 4, commutative: false}, + {name: "AddDotProdMaskedInt32x16", argLength: 4, commutative: false}, {name: "AddFloat32x4", argLength: 2, commutative: true}, {name: "AddFloat32x8", argLength: 2, commutative: true}, {name: "AddFloat32x16", argLength: 2, commutative: true}, @@ -892,12 +898,6 @@ func simdGenericOps() []opData { {name: "OrUint64x2", argLength: 2, commutative: true}, {name: "OrUint64x4", argLength: 2, commutative: true}, {name: "OrUint64x8", argLength: 2, commutative: true}, - {name: "PairDotProdAccumulateInt32x4", argLength: 3, commutative: false}, - {name: "PairDotProdAccumulateInt32x8", argLength: 3, commutative: false}, - {name: "PairDotProdAccumulateInt32x16", argLength: 3, commutative: false}, - {name: "PairDotProdAccumulateMaskedInt32x4", argLength: 4, commutative: false}, - {name: "PairDotProdAccumulateMaskedInt32x8", argLength: 4, commutative: false}, - {name: "PairDotProdAccumulateMaskedInt32x16", argLength: 4, commutative: false}, {name: "PairDotProdInt16x8", argLength: 2, commutative: false}, {name: "PairDotProdInt16x16", argLength: 2, commutative: false}, {name: "PairDotProdInt16x32", argLength: 2, commutative: false}, @@ -1136,6 +1136,12 @@ func simdGenericOps() []opData { {name: "RoundFloat32x8", argLength: 1, commutative: false}, {name: "RoundFloat64x2", argLength: 1, commutative: false}, {name: "RoundFloat64x4", argLength: 1, commutative: false}, + {name: "SaturatedAddDotProdInt32x4", argLength: 3, commutative: false}, + {name: "SaturatedAddDotProdInt32x8", argLength: 3, commutative: false}, + {name: "SaturatedAddDotProdInt32x16", argLength: 3, commutative: false}, + {name: "SaturatedAddDotProdMaskedInt32x4", argLength: 4, commutative: false}, + {name: "SaturatedAddDotProdMaskedInt32x8", argLength: 4, commutative: false}, + {name: "SaturatedAddDotProdMaskedInt32x16", argLength: 4, commutative: false}, {name: "SaturatedAddInt8x16", argLength: 2, commutative: true}, {name: "SaturatedAddInt8x32", argLength: 2, commutative: true}, {name: "SaturatedAddInt8x64", argLength: 2, commutative: true}, @@ -1160,12 +1166,6 @@ func simdGenericOps() []opData { {name: "SaturatedAddUint16x8", argLength: 2, commutative: true}, {name: "SaturatedAddUint16x16", argLength: 2, commutative: true}, {name: "SaturatedAddUint16x32", argLength: 2, commutative: true}, - {name: "SaturatedPairDotProdAccumulateInt32x4", argLength: 3, commutative: false}, - {name: "SaturatedPairDotProdAccumulateInt32x8", argLength: 3, commutative: false}, - {name: "SaturatedPairDotProdAccumulateInt32x16", argLength: 3, commutative: false}, - {name: "SaturatedPairDotProdAccumulateMaskedInt32x4", argLength: 4, commutative: false}, - {name: "SaturatedPairDotProdAccumulateMaskedInt32x8", argLength: 4, commutative: false}, - {name: "SaturatedPairDotProdAccumulateMaskedInt32x16", argLength: 4, commutative: false}, {name: "SaturatedPairwiseAddInt16x8", argLength: 2, commutative: false}, {name: "SaturatedPairwiseAddInt16x16", argLength: 2, commutative: false}, {name: "SaturatedPairwiseSubInt16x8", argLength: 2, commutative: false}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index de4477bc91b..7427137b221 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -4513,6 +4513,12 @@ const ( OpAbsoluteMaskedInt64x2 OpAbsoluteMaskedInt64x4 OpAbsoluteMaskedInt64x8 + OpAddDotProdInt32x4 + OpAddDotProdInt32x8 + OpAddDotProdInt32x16 + OpAddDotProdMaskedInt32x4 + OpAddDotProdMaskedInt32x8 + OpAddDotProdMaskedInt32x16 OpAddFloat32x4 OpAddFloat32x8 OpAddFloat32x16 @@ -5378,12 +5384,6 @@ const ( OpOrUint64x2 OpOrUint64x4 OpOrUint64x8 - OpPairDotProdAccumulateInt32x4 - OpPairDotProdAccumulateInt32x8 - OpPairDotProdAccumulateInt32x16 - OpPairDotProdAccumulateMaskedInt32x4 - OpPairDotProdAccumulateMaskedInt32x8 - OpPairDotProdAccumulateMaskedInt32x16 OpPairDotProdInt16x8 OpPairDotProdInt16x16 OpPairDotProdInt16x32 @@ -5622,6 +5622,12 @@ const ( OpRoundFloat32x8 OpRoundFloat64x2 OpRoundFloat64x4 + OpSaturatedAddDotProdInt32x4 + OpSaturatedAddDotProdInt32x8 + OpSaturatedAddDotProdInt32x16 + OpSaturatedAddDotProdMaskedInt32x4 + OpSaturatedAddDotProdMaskedInt32x8 + OpSaturatedAddDotProdMaskedInt32x16 OpSaturatedAddInt8x16 OpSaturatedAddInt8x32 OpSaturatedAddInt8x64 @@ -5646,12 +5652,6 @@ const ( OpSaturatedAddUint16x8 OpSaturatedAddUint16x16 OpSaturatedAddUint16x32 - OpSaturatedPairDotProdAccumulateInt32x4 - OpSaturatedPairDotProdAccumulateInt32x8 - OpSaturatedPairDotProdAccumulateInt32x16 - OpSaturatedPairDotProdAccumulateMaskedInt32x4 - OpSaturatedPairDotProdAccumulateMaskedInt32x8 - OpSaturatedPairDotProdAccumulateMaskedInt32x16 OpSaturatedPairwiseAddInt16x8 OpSaturatedPairwiseAddInt16x16 OpSaturatedPairwiseSubInt16x8 @@ -61789,6 +61789,36 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "AddDotProdInt32x4", + argLen: 3, + generic: true, + }, + { + name: "AddDotProdInt32x8", + argLen: 3, + generic: true, + }, + { + name: "AddDotProdInt32x16", + argLen: 3, + generic: true, + }, + { + name: "AddDotProdMaskedInt32x4", + argLen: 4, + generic: true, + }, + { + name: "AddDotProdMaskedInt32x8", + argLen: 4, + generic: true, + }, + { + name: "AddDotProdMaskedInt32x16", + argLen: 4, + generic: true, + }, { name: "AddFloat32x4", argLen: 2, @@ -66563,36 +66593,6 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, - { - name: "PairDotProdAccumulateInt32x4", - argLen: 3, - generic: true, - }, - { - name: "PairDotProdAccumulateInt32x8", - argLen: 3, - generic: true, - }, - { - name: "PairDotProdAccumulateInt32x16", - argLen: 3, - generic: true, - }, - { - name: "PairDotProdAccumulateMaskedInt32x4", - argLen: 4, - generic: true, - }, - { - name: "PairDotProdAccumulateMaskedInt32x8", - argLen: 4, - generic: true, - }, - { - name: "PairDotProdAccumulateMaskedInt32x16", - argLen: 4, - generic: true, - }, { name: "PairDotProdInt16x8", argLen: 2, @@ -67783,6 +67783,36 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "SaturatedAddDotProdInt32x4", + argLen: 3, + generic: true, + }, + { + name: "SaturatedAddDotProdInt32x8", + argLen: 3, + generic: true, + }, + { + name: "SaturatedAddDotProdInt32x16", + argLen: 3, + generic: true, + }, + { + name: "SaturatedAddDotProdMaskedInt32x4", + argLen: 4, + generic: true, + }, + { + name: "SaturatedAddDotProdMaskedInt32x8", + argLen: 4, + generic: true, + }, + { + name: "SaturatedAddDotProdMaskedInt32x16", + argLen: 4, + generic: true, + }, { name: "SaturatedAddInt8x16", argLen: 2, @@ -67927,36 +67957,6 @@ var opcodeTable = [...]opInfo{ commutative: true, generic: true, }, - { - name: "SaturatedPairDotProdAccumulateInt32x4", - argLen: 3, - generic: true, - }, - { - name: "SaturatedPairDotProdAccumulateInt32x8", - argLen: 3, - generic: true, - }, - { - name: "SaturatedPairDotProdAccumulateInt32x16", - argLen: 3, - generic: true, - }, - { - name: "SaturatedPairDotProdAccumulateMaskedInt32x4", - argLen: 4, - generic: true, - }, - { - name: "SaturatedPairDotProdAccumulateMaskedInt32x8", - argLen: 4, - generic: true, - }, - { - name: "SaturatedPairDotProdAccumulateMaskedInt32x16", - argLen: 4, - generic: true, - }, { name: "SaturatedPairwiseAddInt16x8", argLen: 2, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index e9a2fd70e4e..5abb50ab713 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -631,6 +631,21 @@ func rewriteValueAMD64(v *Value) bool { case OpAdd8: v.Op = OpAMD64ADDL return true + case OpAddDotProdInt32x16: + v.Op = OpAMD64VPDPWSSD512 + return true + case OpAddDotProdInt32x4: + v.Op = OpAMD64VPDPWSSD128 + return true + case OpAddDotProdInt32x8: + v.Op = OpAMD64VPDPWSSD256 + return true + case OpAddDotProdMaskedInt32x16: + return rewriteValueAMD64_OpAddDotProdMaskedInt32x16(v) + case OpAddDotProdMaskedInt32x4: + return rewriteValueAMD64_OpAddDotProdMaskedInt32x4(v) + case OpAddDotProdMaskedInt32x8: + return rewriteValueAMD64_OpAddDotProdMaskedInt32x8(v) case OpAddFloat32x16: v.Op = OpAMD64VADDPS512 return true @@ -3340,21 +3355,6 @@ func rewriteValueAMD64(v *Value) bool { case OpOrUint8x32: v.Op = OpAMD64VPOR256 return true - case OpPairDotProdAccumulateInt32x16: - v.Op = OpAMD64VPDPWSSD512 - return true - case OpPairDotProdAccumulateInt32x4: - v.Op = OpAMD64VPDPWSSD128 - return true - case OpPairDotProdAccumulateInt32x8: - v.Op = OpAMD64VPDPWSSD256 - return true - case OpPairDotProdAccumulateMaskedInt32x16: - return rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x16(v) - case OpPairDotProdAccumulateMaskedInt32x4: - return rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x4(v) - case OpPairDotProdAccumulateMaskedInt32x8: - return rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x8(v) case OpPairDotProdInt16x16: v.Op = OpAMD64VPMADDWD256 return true @@ -4206,6 +4206,21 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpRsh8x64(v) case OpRsh8x8: return rewriteValueAMD64_OpRsh8x8(v) + case OpSaturatedAddDotProdInt32x16: + v.Op = OpAMD64VPDPWSSDS512 + return true + case OpSaturatedAddDotProdInt32x4: + v.Op = OpAMD64VPDPWSSDS128 + return true + case OpSaturatedAddDotProdInt32x8: + v.Op = OpAMD64VPDPWSSDS256 + return true + case OpSaturatedAddDotProdMaskedInt32x16: + return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x16(v) + case OpSaturatedAddDotProdMaskedInt32x4: + return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x4(v) + case OpSaturatedAddDotProdMaskedInt32x8: + return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v) case OpSaturatedAddInt16x16: v.Op = OpAMD64VPADDSW256 return true @@ -4266,21 +4281,6 @@ func rewriteValueAMD64(v *Value) bool { case OpSaturatedAddUint8x64: v.Op = OpAMD64VPADDSB512 return true - case OpSaturatedPairDotProdAccumulateInt32x16: - v.Op = OpAMD64VPDPWSSDS512 - return true - case OpSaturatedPairDotProdAccumulateInt32x4: - v.Op = OpAMD64VPDPWSSDS128 - return true - case OpSaturatedPairDotProdAccumulateInt32x8: - v.Op = OpAMD64VPDPWSSDS256 - return true - case OpSaturatedPairDotProdAccumulateMaskedInt32x16: - return rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x16(v) - case OpSaturatedPairDotProdAccumulateMaskedInt32x4: - return rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x4(v) - case OpSaturatedPairDotProdAccumulateMaskedInt32x8: - return rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x8(v) case OpSaturatedPairwiseAddInt16x16: v.Op = OpAMD64VPHADDSW256 return true @@ -28514,6 +28514,66 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt8x64(v *Value) bool { return true } } +func rewriteValueAMD64_OpAddDotProdMaskedInt32x16(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddDotProdMaskedInt32x16 x y z mask) + // result: (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpAddDotProdMaskedInt32x4(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddDotProdMaskedInt32x4 x y z mask) + // result: (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpAddDotProdMaskedInt32x8(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (AddDotProdMaskedInt32x8 x y z mask) + // result: (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} func rewriteValueAMD64_OpAddMaskedFloat32x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -45669,66 +45729,6 @@ func rewriteValueAMD64_OpOrMaskedUint64x8(v *Value) bool { return true } } -func rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x16(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PairDotProdAccumulateMaskedInt32x16 x y z mask) - // result: (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPWSSDMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x4(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PairDotProdAccumulateMaskedInt32x4 x y z mask) - // result: (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPWSSDMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x8(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (PairDotProdAccumulateMaskedInt32x8 x y z mask) - // result: (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPWSSDMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} func rewriteValueAMD64_OpPairDotProdMaskedInt16x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -49721,6 +49721,66 @@ func rewriteValueAMD64_OpRsh8x8(v *Value) bool { } return false } +func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x16(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SaturatedAddDotProdMaskedInt32x16 x y z mask) + // result: (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDSMasked512) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x4(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SaturatedAddDotProdMaskedInt32x4 x y z mask) + // result: (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDSMasked128) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} +func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v *Value) bool { + v_3 := v.Args[3] + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + // match: (SaturatedAddDotProdMaskedInt32x8 x y z mask) + // result: (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM mask)) + for { + x := v_0 + y := v_1 + z := v_2 + mask := v_3 + v.reset(OpAMD64VPDPWSSDSMasked256) + v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) + v0.AddArg(mask) + v.AddArg4(x, y, z, v0) + return true + } +} func rewriteValueAMD64_OpSaturatedAddMaskedInt16x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] @@ -49937,66 +49997,6 @@ func rewriteValueAMD64_OpSaturatedAddMaskedUint8x64(v *Value) bool { return true } } -func rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x16(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedPairDotProdAccumulateMaskedInt32x16 x y z mask) - // result: (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPWSSDSMasked512) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x4(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedPairDotProdAccumulateMaskedInt32x4 x y z mask) - // result: (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPWSSDSMasked128) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} -func rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x8(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - b := v.Block - // match: (SaturatedPairDotProdAccumulateMaskedInt32x8 x y z mask) - // result: (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM mask)) - for { - x := v_0 - y := v_1 - z := v_2 - mask := v_3 - v.reset(OpAMD64VPDPWSSDSMasked256) - v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask) - v0.AddArg(mask) - v.AddArg4(x, y, z, v0) - return true - } -} func rewriteValueAMD64_OpSaturatedSubMaskedInt16x16(v *Value) bool { v_2 := v.Args[2] v_1 := v.Args[1] diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index d6c5b889ed3..12c388ca913 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -65,6 +65,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint64x2.Add", opLen2(ssa.OpAddUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.AddDotProd", opLen3(ssa.OpAddDotProdInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.AddDotProd", opLen3(ssa.OpAddDotProdInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.AddDotProd", opLen3(ssa.OpAddDotProdInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.AddDotProdMasked", opLen4(ssa.OpAddDotProdMaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.AddDotProdMasked", opLen4(ssa.OpAddDotProdMaskedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.AddDotProdMasked", opLen4(ssa.OpAddDotProdMaskedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.AddMasked", opLen3(ssa.OpAddMaskedFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.AddMasked", opLen3(ssa.OpAddMaskedFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x16.AddMasked", opLen3(ssa.OpAddMaskedFloat32x16, types.TypeVec512), sys.AMD64) @@ -1005,12 +1011,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Int16x8.PairDotProd", opLen2(ssa.OpPairDotProdInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.PairDotProd", opLen2(ssa.OpPairDotProdInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x32.PairDotProd", opLen2(ssa.OpPairDotProdInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.PairDotProdAccumulate", opLen3_31(ssa.OpPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.PairDotProdAccumulate", opLen3_31(ssa.OpPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.PairDotProdAccumulate", opLen3_31(ssa.OpPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.PairDotProdAccumulateMasked", opLen4_31(ssa.OpPairDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.PairDotProdAccumulateMasked", opLen4_31(ssa.OpPairDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.PairDotProdAccumulateMasked", opLen4_31(ssa.OpPairDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int16x8.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x32.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x32, types.TypeVec512), sys.AMD64) @@ -1318,6 +1318,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint16x8.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int32x8.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int32x16.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x64, types.TypeVec512), sys.AMD64) @@ -1330,12 +1336,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint16x8.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedPairDotProdAccumulate", opLen3_31(ssa.OpSaturatedPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedPairDotProdAccumulate", opLen3_31(ssa.OpSaturatedPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.SaturatedPairDotProdAccumulate", opLen3_31(ssa.OpSaturatedPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.SaturatedPairDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.SaturatedPairDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.SaturatedPairDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int16x8.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x16.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x8.SaturatedPairwiseSub", opLen2(ssa.OpSaturatedPairwiseSubInt16x8, types.TypeVec128), sys.AMD64) diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index f88410af43d..ea0c5981571 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -304,6 +304,46 @@ func (x Uint64x4) Add(y Uint64x4) Uint64x4 // Asm: VPADDQ, CPU Feature: AVX512F func (x Uint64x8) Add(y Uint64x8) Uint64x8 +/* AddDotProd */ + +// AddDotProd performs dot products on pairs of elements of y and z and then adds x. +// +// Asm: VPDPWSSD, CPU Feature: AVXVNNI +func (x Int32x4) AddDotProd(y Int16x8, z Int16x8) Int32x4 + +// AddDotProd performs dot products on pairs of elements of y and z and then adds x. +// +// Asm: VPDPWSSD, CPU Feature: AVXVNNI +func (x Int32x8) AddDotProd(y Int16x16, z Int16x16) Int32x8 + +// AddDotProd performs dot products on pairs of elements of y and z and then adds x. +// +// Asm: VPDPWSSD, CPU Feature: AVX512VNNI +func (x Int32x16) AddDotProd(y Int16x32, z Int16x32) Int32x16 + +/* AddDotProdMasked */ + +// AddDotProdMasked performs dot products on pairs of elements of y and z and then adds x. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPDPWSSD, CPU Feature: AVX512VNNI +func (x Int32x4) AddDotProdMasked(y Int16x8, z Int16x8, mask Mask32x4) Int32x4 + +// AddDotProdMasked performs dot products on pairs of elements of y and z and then adds x. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPDPWSSD, CPU Feature: AVX512VNNI +func (x Int32x8) AddDotProdMasked(y Int16x16, z Int16x16, mask Mask32x8) Int32x8 + +// AddDotProdMasked performs dot products on pairs of elements of y and z and then adds x. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPDPWSSD, CPU Feature: AVX512VNNI +func (x Int32x16) AddDotProdMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16 + /* AddMasked */ // AddMasked adds corresponding elements of two vectors. @@ -6339,46 +6379,6 @@ func (x Int16x16) PairDotProd(y Int16x16) Int32x8 // Asm: VPMADDWD, CPU Feature: AVX512BW func (x Int16x32) PairDotProd(y Int16x32) Int32x16 -/* PairDotProdAccumulate */ - -// PairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z. -// -// Asm: VPDPWSSD, CPU Feature: AVXVNNI -func (x Int16x8) PairDotProdAccumulate(y Int16x8, z Int32x4) Int32x4 - -// PairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z. -// -// Asm: VPDPWSSD, CPU Feature: AVXVNNI -func (x Int16x16) PairDotProdAccumulate(y Int16x16, z Int32x8) Int32x8 - -// PairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z. -// -// Asm: VPDPWSSD, CPU Feature: AVX512VNNI -func (x Int16x32) PairDotProdAccumulate(y Int16x32, z Int32x16) Int32x16 - -/* PairDotProdAccumulateMasked */ - -// PairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPDPWSSD, CPU Feature: AVX512VNNI -func (x Int16x8) PairDotProdAccumulateMasked(y Int16x8, z Int32x4, mask Mask32x4) Int32x4 - -// PairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPDPWSSD, CPU Feature: AVX512VNNI -func (x Int16x16) PairDotProdAccumulateMasked(y Int16x16, z Int32x8, mask Mask32x8) Int32x8 - -// PairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPDPWSSD, CPU Feature: AVX512VNNI -func (x Int16x32) PairDotProdAccumulateMasked(y Int16x32, z Int32x16, mask Mask32x16) Int32x16 - /* PairDotProdMasked */ // PairDotProdMasked multiplies the elements and add the pairs together, @@ -8649,6 +8649,46 @@ func (x Uint16x16) SaturatedAdd(y Uint16x16) Uint16x16 // Asm: VPADDSW, CPU Feature: AVX512BW func (x Uint16x32) SaturatedAdd(y Uint16x32) Uint16x32 +/* SaturatedAddDotProd */ + +// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x. +// +// Asm: VPDPWSSDS, CPU Feature: AVXVNNI +func (x Int32x4) SaturatedAddDotProd(y Int16x8, z Int16x8) Int32x4 + +// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x. +// +// Asm: VPDPWSSDS, CPU Feature: AVXVNNI +func (x Int32x8) SaturatedAddDotProd(y Int16x16, z Int16x16) Int32x8 + +// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x. +// +// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI +func (x Int32x16) SaturatedAddDotProd(y Int16x32, z Int16x32) Int32x16 + +/* SaturatedAddDotProdMasked */ + +// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI +func (x Int32x4) SaturatedAddDotProdMasked(y Int16x8, z Int16x8, mask Mask32x4) Int32x4 + +// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI +func (x Int32x8) SaturatedAddDotProdMasked(y Int16x16, z Int16x16, mask Mask32x8) Int32x8 + +// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x. +// +// This operation is applied selectively under a write mask. +// +// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI +func (x Int32x16) SaturatedAddDotProdMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16 + /* SaturatedAddMasked */ // SaturatedAddMasked adds corresponding elements of two vectors with saturation. @@ -8735,46 +8775,6 @@ func (x Uint16x16) SaturatedAddMasked(y Uint16x16, mask Mask16x16) Uint16x16 // Asm: VPADDSW, CPU Feature: AVX512BW func (x Uint16x32) SaturatedAddMasked(y Uint16x32, mask Mask16x32) Uint16x32 -/* SaturatedPairDotProdAccumulate */ - -// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z. -// -// Asm: VPDPWSSDS, CPU Feature: AVXVNNI -func (x Int16x8) SaturatedPairDotProdAccumulate(y Int16x8, z Int32x4) Int32x4 - -// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z. -// -// Asm: VPDPWSSDS, CPU Feature: AVXVNNI -func (x Int16x16) SaturatedPairDotProdAccumulate(y Int16x16, z Int32x8) Int32x8 - -// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z. -// -// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI -func (x Int16x32) SaturatedPairDotProdAccumulate(y Int16x32, z Int32x16) Int32x16 - -/* SaturatedPairDotProdAccumulateMasked */ - -// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI -func (x Int16x8) SaturatedPairDotProdAccumulateMasked(y Int16x8, z Int32x4, mask Mask32x4) Int32x4 - -// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI -func (x Int16x16) SaturatedPairDotProdAccumulateMasked(y Int16x16, z Int32x8, mask Mask32x8) Int32x8 - -// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z. -// -// This operation is applied selectively under a write mask. -// -// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI -func (x Int16x32) SaturatedPairDotProdAccumulateMasked(y Int16x32, z Int32x16, mask Mask32x16) Int32x16 - /* SaturatedPairwiseAdd */ // SaturatedPairwiseAdd horizontally adds adjacent pairs of elements with saturation. diff --git a/src/simd/simd_test.go b/src/simd/simd_test.go index 72180a30469..2326addea94 100644 --- a/src/simd/simd_test.go +++ b/src/simd/simd_test.go @@ -197,7 +197,7 @@ func TestPairDotProdAccumulate(t *testing.T) { z := simd.LoadInt32x4Slice([]int32{3, 3, 3, 3}) want := []int32{11, 11, 11, 11} got := make([]int32, 4) - z = x.PairDotProdAccumulate(x, z) + z = z.AddDotProd(x, x) z.StoreSlice(got) for i := range 4 { if got[i] != want[i] {