[dev.simd] cmd/compile, simd: change PairDotProdAccumulate to AddDotProd

This CL is generated by CL 692219.

Change-Id: I50fa919f1edc5c6505bc6d3238f65b37fc7628b5
Reviewed-on: https://go-review.googlesource.com/c/go/+/692156
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
Junyang Shao 2025-07-31 23:51:50 +00:00
parent 2c25f3e846
commit c2d775d401
8 changed files with 353 additions and 353 deletions

View file

@ -813,7 +813,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPCMPUQMasked512:
p = simdV2kkImm8(s, v)
case ssa.OpAMD64VFMADD213PS128,
case ssa.OpAMD64VPDPWSSD128,
ssa.OpAMD64VPDPWSSD256,
ssa.OpAMD64VPDPWSSD512,
ssa.OpAMD64VFMADD213PS128,
ssa.OpAMD64VFMADD213PS256,
ssa.OpAMD64VFMADD213PS512,
ssa.OpAMD64VFMADD213PD128,
@ -831,9 +834,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VFMSUBADD213PD128,
ssa.OpAMD64VFMSUBADD213PD256,
ssa.OpAMD64VFMSUBADD213PD512,
ssa.OpAMD64VPDPWSSD128,
ssa.OpAMD64VPDPWSSD256,
ssa.OpAMD64VPDPWSSD512,
ssa.OpAMD64VPERMI2B128,
ssa.OpAMD64VPERMI2B256,
ssa.OpAMD64VPERMI2B512,
@ -881,7 +881,10 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPDPBUSD512:
p = simdV31ResultInArg0(s, v)
case ssa.OpAMD64VFMADD213PSMasked128,
case ssa.OpAMD64VPDPWSSDMasked128,
ssa.OpAMD64VPDPWSSDMasked256,
ssa.OpAMD64VPDPWSSDMasked512,
ssa.OpAMD64VFMADD213PSMasked128,
ssa.OpAMD64VFMADD213PSMasked256,
ssa.OpAMD64VFMADD213PSMasked512,
ssa.OpAMD64VFMADD213PDMasked128,
@ -899,9 +902,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VFMSUBADD213PDMasked128,
ssa.OpAMD64VFMSUBADD213PDMasked256,
ssa.OpAMD64VFMSUBADD213PDMasked512,
ssa.OpAMD64VPDPWSSDMasked128,
ssa.OpAMD64VPDPWSSDMasked256,
ssa.OpAMD64VPDPWSSDMasked512,
ssa.OpAMD64VPERMI2BMasked128,
ssa.OpAMD64VPERMI2BMasked256,
ssa.OpAMD64VPERMI2BMasked512,
@ -1064,6 +1064,9 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPABSQMasked128,
ssa.OpAMD64VPABSQMasked256,
ssa.OpAMD64VPABSQMasked512,
ssa.OpAMD64VPDPWSSDMasked128,
ssa.OpAMD64VPDPWSSDMasked256,
ssa.OpAMD64VPDPWSSDMasked512,
ssa.OpAMD64VADDPSMasked128,
ssa.OpAMD64VADDPSMasked256,
ssa.OpAMD64VADDPSMasked512,
@ -1280,9 +1283,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPORQMasked128,
ssa.OpAMD64VPORQMasked256,
ssa.OpAMD64VPORQMasked512,
ssa.OpAMD64VPDPWSSDMasked128,
ssa.OpAMD64VPDPWSSDMasked256,
ssa.OpAMD64VPDPWSSDMasked512,
ssa.OpAMD64VPMADDWDMasked128,
ssa.OpAMD64VPMADDWDMasked256,
ssa.OpAMD64VPMADDWDMasked512,
@ -1354,15 +1354,15 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPRORVQMasked128,
ssa.OpAMD64VPRORVQMasked256,
ssa.OpAMD64VPRORVQMasked512,
ssa.OpAMD64VPDPWSSDSMasked128,
ssa.OpAMD64VPDPWSSDSMasked256,
ssa.OpAMD64VPDPWSSDSMasked512,
ssa.OpAMD64VPADDSBMasked128,
ssa.OpAMD64VPADDSBMasked256,
ssa.OpAMD64VPADDSBMasked512,
ssa.OpAMD64VPADDSWMasked128,
ssa.OpAMD64VPADDSWMasked256,
ssa.OpAMD64VPADDSWMasked512,
ssa.OpAMD64VPDPWSSDSMasked128,
ssa.OpAMD64VPDPWSSDSMasked256,
ssa.OpAMD64VPDPWSSDSMasked512,
ssa.OpAMD64VPSUBSBMasked128,
ssa.OpAMD64VPSUBSBMasked256,
ssa.OpAMD64VPSUBSBMasked512,

View file

@ -54,6 +54,12 @@
(AddUint64x2 ...) => (VPADDQ128 ...)
(AddUint64x4 ...) => (VPADDQ256 ...)
(AddUint64x8 ...) => (VPADDQ512 ...)
(AddDotProdInt32x4 ...) => (VPDPWSSD128 ...)
(AddDotProdInt32x8 ...) => (VPDPWSSD256 ...)
(AddDotProdInt32x16 ...) => (VPDPWSSD512 ...)
(AddDotProdMaskedInt32x4 x y z mask) => (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
(AddDotProdMaskedInt32x8 x y z mask) => (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
(AddDotProdMaskedInt32x16 x y z mask) => (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
(AddMaskedFloat32x4 x y mask) => (VADDPSMasked128 x y (VPMOVVec32x4ToM <types.TypeMask> mask))
(AddMaskedFloat32x8 x y mask) => (VADDPSMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))
(AddMaskedFloat32x16 x y mask) => (VADDPSMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
@ -994,12 +1000,6 @@
(PairDotProdInt16x8 ...) => (VPMADDWD128 ...)
(PairDotProdInt16x16 ...) => (VPMADDWD256 ...)
(PairDotProdInt16x32 ...) => (VPMADDWD512 ...)
(PairDotProdAccumulateInt32x4 ...) => (VPDPWSSD128 ...)
(PairDotProdAccumulateInt32x8 ...) => (VPDPWSSD256 ...)
(PairDotProdAccumulateInt32x16 ...) => (VPDPWSSD512 ...)
(PairDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
(PairDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
(PairDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
(PairDotProdMaskedInt16x8 x y mask) => (VPMADDWDMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
(PairDotProdMaskedInt16x16 x y mask) => (VPMADDWDMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
(PairDotProdMaskedInt16x32 x y mask) => (VPMADDWDMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
@ -1307,6 +1307,12 @@
(SaturatedAddUint16x8 ...) => (VPADDSW128 ...)
(SaturatedAddUint16x16 ...) => (VPADDSW256 ...)
(SaturatedAddUint16x32 ...) => (VPADDSW512 ...)
(SaturatedAddDotProdInt32x4 ...) => (VPDPWSSDS128 ...)
(SaturatedAddDotProdInt32x8 ...) => (VPDPWSSDS256 ...)
(SaturatedAddDotProdInt32x16 ...) => (VPDPWSSDS512 ...)
(SaturatedAddDotProdMaskedInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
(SaturatedAddDotProdMaskedInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
(SaturatedAddDotProdMaskedInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
(SaturatedAddMaskedInt8x16 x y mask) => (VPADDSBMasked128 x y (VPMOVVec8x16ToM <types.TypeMask> mask))
(SaturatedAddMaskedInt8x32 x y mask) => (VPADDSBMasked256 x y (VPMOVVec8x32ToM <types.TypeMask> mask))
(SaturatedAddMaskedInt8x64 x y mask) => (VPADDSBMasked512 x y (VPMOVVec8x64ToM <types.TypeMask> mask))
@ -1319,12 +1325,6 @@
(SaturatedAddMaskedUint16x8 x y mask) => (VPADDSWMasked128 x y (VPMOVVec16x8ToM <types.TypeMask> mask))
(SaturatedAddMaskedUint16x16 x y mask) => (VPADDSWMasked256 x y (VPMOVVec16x16ToM <types.TypeMask> mask))
(SaturatedAddMaskedUint16x32 x y mask) => (VPADDSWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
(SaturatedPairDotProdAccumulateInt32x4 ...) => (VPDPWSSDS128 ...)
(SaturatedPairDotProdAccumulateInt32x8 ...) => (VPDPWSSDS256 ...)
(SaturatedPairDotProdAccumulateInt32x16 ...) => (VPDPWSSDS512 ...)
(SaturatedPairDotProdAccumulateMaskedInt32x4 x y z mask) => (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
(SaturatedPairDotProdAccumulateMaskedInt32x8 x y z mask) => (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
(SaturatedPairDotProdAccumulateMaskedInt32x16 x y z mask) => (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
(SaturatedPairwiseAddInt16x8 ...) => (VPHADDSW128 ...)
(SaturatedPairwiseAddInt16x16 ...) => (VPHADDSW256 ...)
(SaturatedPairwiseSubInt16x8 ...) => (VPHSUBSW128 ...)

View file

@ -27,6 +27,12 @@ func simdGenericOps() []opData {
{name: "AbsoluteMaskedInt64x2", argLength: 2, commutative: false},
{name: "AbsoluteMaskedInt64x4", argLength: 2, commutative: false},
{name: "AbsoluteMaskedInt64x8", argLength: 2, commutative: false},
{name: "AddDotProdInt32x4", argLength: 3, commutative: false},
{name: "AddDotProdInt32x8", argLength: 3, commutative: false},
{name: "AddDotProdInt32x16", argLength: 3, commutative: false},
{name: "AddDotProdMaskedInt32x4", argLength: 4, commutative: false},
{name: "AddDotProdMaskedInt32x8", argLength: 4, commutative: false},
{name: "AddDotProdMaskedInt32x16", argLength: 4, commutative: false},
{name: "AddFloat32x4", argLength: 2, commutative: true},
{name: "AddFloat32x8", argLength: 2, commutative: true},
{name: "AddFloat32x16", argLength: 2, commutative: true},
@ -892,12 +898,6 @@ func simdGenericOps() []opData {
{name: "OrUint64x2", argLength: 2, commutative: true},
{name: "OrUint64x4", argLength: 2, commutative: true},
{name: "OrUint64x8", argLength: 2, commutative: true},
{name: "PairDotProdAccumulateInt32x4", argLength: 3, commutative: false},
{name: "PairDotProdAccumulateInt32x8", argLength: 3, commutative: false},
{name: "PairDotProdAccumulateInt32x16", argLength: 3, commutative: false},
{name: "PairDotProdAccumulateMaskedInt32x4", argLength: 4, commutative: false},
{name: "PairDotProdAccumulateMaskedInt32x8", argLength: 4, commutative: false},
{name: "PairDotProdAccumulateMaskedInt32x16", argLength: 4, commutative: false},
{name: "PairDotProdInt16x8", argLength: 2, commutative: false},
{name: "PairDotProdInt16x16", argLength: 2, commutative: false},
{name: "PairDotProdInt16x32", argLength: 2, commutative: false},
@ -1136,6 +1136,12 @@ func simdGenericOps() []opData {
{name: "RoundFloat32x8", argLength: 1, commutative: false},
{name: "RoundFloat64x2", argLength: 1, commutative: false},
{name: "RoundFloat64x4", argLength: 1, commutative: false},
{name: "SaturatedAddDotProdInt32x4", argLength: 3, commutative: false},
{name: "SaturatedAddDotProdInt32x8", argLength: 3, commutative: false},
{name: "SaturatedAddDotProdInt32x16", argLength: 3, commutative: false},
{name: "SaturatedAddDotProdMaskedInt32x4", argLength: 4, commutative: false},
{name: "SaturatedAddDotProdMaskedInt32x8", argLength: 4, commutative: false},
{name: "SaturatedAddDotProdMaskedInt32x16", argLength: 4, commutative: false},
{name: "SaturatedAddInt8x16", argLength: 2, commutative: true},
{name: "SaturatedAddInt8x32", argLength: 2, commutative: true},
{name: "SaturatedAddInt8x64", argLength: 2, commutative: true},
@ -1160,12 +1166,6 @@ func simdGenericOps() []opData {
{name: "SaturatedAddUint16x8", argLength: 2, commutative: true},
{name: "SaturatedAddUint16x16", argLength: 2, commutative: true},
{name: "SaturatedAddUint16x32", argLength: 2, commutative: true},
{name: "SaturatedPairDotProdAccumulateInt32x4", argLength: 3, commutative: false},
{name: "SaturatedPairDotProdAccumulateInt32x8", argLength: 3, commutative: false},
{name: "SaturatedPairDotProdAccumulateInt32x16", argLength: 3, commutative: false},
{name: "SaturatedPairDotProdAccumulateMaskedInt32x4", argLength: 4, commutative: false},
{name: "SaturatedPairDotProdAccumulateMaskedInt32x8", argLength: 4, commutative: false},
{name: "SaturatedPairDotProdAccumulateMaskedInt32x16", argLength: 4, commutative: false},
{name: "SaturatedPairwiseAddInt16x8", argLength: 2, commutative: false},
{name: "SaturatedPairwiseAddInt16x16", argLength: 2, commutative: false},
{name: "SaturatedPairwiseSubInt16x8", argLength: 2, commutative: false},

View file

@ -4513,6 +4513,12 @@ const (
OpAbsoluteMaskedInt64x2
OpAbsoluteMaskedInt64x4
OpAbsoluteMaskedInt64x8
OpAddDotProdInt32x4
OpAddDotProdInt32x8
OpAddDotProdInt32x16
OpAddDotProdMaskedInt32x4
OpAddDotProdMaskedInt32x8
OpAddDotProdMaskedInt32x16
OpAddFloat32x4
OpAddFloat32x8
OpAddFloat32x16
@ -5378,12 +5384,6 @@ const (
OpOrUint64x2
OpOrUint64x4
OpOrUint64x8
OpPairDotProdAccumulateInt32x4
OpPairDotProdAccumulateInt32x8
OpPairDotProdAccumulateInt32x16
OpPairDotProdAccumulateMaskedInt32x4
OpPairDotProdAccumulateMaskedInt32x8
OpPairDotProdAccumulateMaskedInt32x16
OpPairDotProdInt16x8
OpPairDotProdInt16x16
OpPairDotProdInt16x32
@ -5622,6 +5622,12 @@ const (
OpRoundFloat32x8
OpRoundFloat64x2
OpRoundFloat64x4
OpSaturatedAddDotProdInt32x4
OpSaturatedAddDotProdInt32x8
OpSaturatedAddDotProdInt32x16
OpSaturatedAddDotProdMaskedInt32x4
OpSaturatedAddDotProdMaskedInt32x8
OpSaturatedAddDotProdMaskedInt32x16
OpSaturatedAddInt8x16
OpSaturatedAddInt8x32
OpSaturatedAddInt8x64
@ -5646,12 +5652,6 @@ const (
OpSaturatedAddUint16x8
OpSaturatedAddUint16x16
OpSaturatedAddUint16x32
OpSaturatedPairDotProdAccumulateInt32x4
OpSaturatedPairDotProdAccumulateInt32x8
OpSaturatedPairDotProdAccumulateInt32x16
OpSaturatedPairDotProdAccumulateMaskedInt32x4
OpSaturatedPairDotProdAccumulateMaskedInt32x8
OpSaturatedPairDotProdAccumulateMaskedInt32x16
OpSaturatedPairwiseAddInt16x8
OpSaturatedPairwiseAddInt16x16
OpSaturatedPairwiseSubInt16x8
@ -61789,6 +61789,36 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "AddDotProdInt32x4",
argLen: 3,
generic: true,
},
{
name: "AddDotProdInt32x8",
argLen: 3,
generic: true,
},
{
name: "AddDotProdInt32x16",
argLen: 3,
generic: true,
},
{
name: "AddDotProdMaskedInt32x4",
argLen: 4,
generic: true,
},
{
name: "AddDotProdMaskedInt32x8",
argLen: 4,
generic: true,
},
{
name: "AddDotProdMaskedInt32x16",
argLen: 4,
generic: true,
},
{
name: "AddFloat32x4",
argLen: 2,
@ -66563,36 +66593,6 @@ var opcodeTable = [...]opInfo{
commutative: true,
generic: true,
},
{
name: "PairDotProdAccumulateInt32x4",
argLen: 3,
generic: true,
},
{
name: "PairDotProdAccumulateInt32x8",
argLen: 3,
generic: true,
},
{
name: "PairDotProdAccumulateInt32x16",
argLen: 3,
generic: true,
},
{
name: "PairDotProdAccumulateMaskedInt32x4",
argLen: 4,
generic: true,
},
{
name: "PairDotProdAccumulateMaskedInt32x8",
argLen: 4,
generic: true,
},
{
name: "PairDotProdAccumulateMaskedInt32x16",
argLen: 4,
generic: true,
},
{
name: "PairDotProdInt16x8",
argLen: 2,
@ -67783,6 +67783,36 @@ var opcodeTable = [...]opInfo{
argLen: 1,
generic: true,
},
{
name: "SaturatedAddDotProdInt32x4",
argLen: 3,
generic: true,
},
{
name: "SaturatedAddDotProdInt32x8",
argLen: 3,
generic: true,
},
{
name: "SaturatedAddDotProdInt32x16",
argLen: 3,
generic: true,
},
{
name: "SaturatedAddDotProdMaskedInt32x4",
argLen: 4,
generic: true,
},
{
name: "SaturatedAddDotProdMaskedInt32x8",
argLen: 4,
generic: true,
},
{
name: "SaturatedAddDotProdMaskedInt32x16",
argLen: 4,
generic: true,
},
{
name: "SaturatedAddInt8x16",
argLen: 2,
@ -67927,36 +67957,6 @@ var opcodeTable = [...]opInfo{
commutative: true,
generic: true,
},
{
name: "SaturatedPairDotProdAccumulateInt32x4",
argLen: 3,
generic: true,
},
{
name: "SaturatedPairDotProdAccumulateInt32x8",
argLen: 3,
generic: true,
},
{
name: "SaturatedPairDotProdAccumulateInt32x16",
argLen: 3,
generic: true,
},
{
name: "SaturatedPairDotProdAccumulateMaskedInt32x4",
argLen: 4,
generic: true,
},
{
name: "SaturatedPairDotProdAccumulateMaskedInt32x8",
argLen: 4,
generic: true,
},
{
name: "SaturatedPairDotProdAccumulateMaskedInt32x16",
argLen: 4,
generic: true,
},
{
name: "SaturatedPairwiseAddInt16x8",
argLen: 2,

View file

@ -631,6 +631,21 @@ func rewriteValueAMD64(v *Value) bool {
case OpAdd8:
v.Op = OpAMD64ADDL
return true
case OpAddDotProdInt32x16:
v.Op = OpAMD64VPDPWSSD512
return true
case OpAddDotProdInt32x4:
v.Op = OpAMD64VPDPWSSD128
return true
case OpAddDotProdInt32x8:
v.Op = OpAMD64VPDPWSSD256
return true
case OpAddDotProdMaskedInt32x16:
return rewriteValueAMD64_OpAddDotProdMaskedInt32x16(v)
case OpAddDotProdMaskedInt32x4:
return rewriteValueAMD64_OpAddDotProdMaskedInt32x4(v)
case OpAddDotProdMaskedInt32x8:
return rewriteValueAMD64_OpAddDotProdMaskedInt32x8(v)
case OpAddFloat32x16:
v.Op = OpAMD64VADDPS512
return true
@ -3340,21 +3355,6 @@ func rewriteValueAMD64(v *Value) bool {
case OpOrUint8x32:
v.Op = OpAMD64VPOR256
return true
case OpPairDotProdAccumulateInt32x16:
v.Op = OpAMD64VPDPWSSD512
return true
case OpPairDotProdAccumulateInt32x4:
v.Op = OpAMD64VPDPWSSD128
return true
case OpPairDotProdAccumulateInt32x8:
v.Op = OpAMD64VPDPWSSD256
return true
case OpPairDotProdAccumulateMaskedInt32x16:
return rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x16(v)
case OpPairDotProdAccumulateMaskedInt32x4:
return rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x4(v)
case OpPairDotProdAccumulateMaskedInt32x8:
return rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x8(v)
case OpPairDotProdInt16x16:
v.Op = OpAMD64VPMADDWD256
return true
@ -4206,6 +4206,21 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpRsh8x64(v)
case OpRsh8x8:
return rewriteValueAMD64_OpRsh8x8(v)
case OpSaturatedAddDotProdInt32x16:
v.Op = OpAMD64VPDPWSSDS512
return true
case OpSaturatedAddDotProdInt32x4:
v.Op = OpAMD64VPDPWSSDS128
return true
case OpSaturatedAddDotProdInt32x8:
v.Op = OpAMD64VPDPWSSDS256
return true
case OpSaturatedAddDotProdMaskedInt32x16:
return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x16(v)
case OpSaturatedAddDotProdMaskedInt32x4:
return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x4(v)
case OpSaturatedAddDotProdMaskedInt32x8:
return rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v)
case OpSaturatedAddInt16x16:
v.Op = OpAMD64VPADDSW256
return true
@ -4266,21 +4281,6 @@ func rewriteValueAMD64(v *Value) bool {
case OpSaturatedAddUint8x64:
v.Op = OpAMD64VPADDSB512
return true
case OpSaturatedPairDotProdAccumulateInt32x16:
v.Op = OpAMD64VPDPWSSDS512
return true
case OpSaturatedPairDotProdAccumulateInt32x4:
v.Op = OpAMD64VPDPWSSDS128
return true
case OpSaturatedPairDotProdAccumulateInt32x8:
v.Op = OpAMD64VPDPWSSDS256
return true
case OpSaturatedPairDotProdAccumulateMaskedInt32x16:
return rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x16(v)
case OpSaturatedPairDotProdAccumulateMaskedInt32x4:
return rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x4(v)
case OpSaturatedPairDotProdAccumulateMaskedInt32x8:
return rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x8(v)
case OpSaturatedPairwiseAddInt16x16:
v.Op = OpAMD64VPHADDSW256
return true
@ -28514,6 +28514,66 @@ func rewriteValueAMD64_OpAbsoluteMaskedInt8x64(v *Value) bool {
return true
}
}
// rewriteValueAMD64_OpAddDotProdMaskedInt32x16 rewrites v in place from
// AddDotProdMaskedInt32x16 to VPDPWSSDMasked512. The first three args are
// forwarded unchanged; the fourth is wrapped in a new VPMOVVec32x16ToM value
// of type types.TypeMask. It always returns true.
//
// match: (AddDotProdMaskedInt32x16 x y z mask)
// result: (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
func rewriteValueAMD64_OpAddDotProdMaskedInt32x16(v *Value) bool {
	// Read every operand before v.reset is called; the highest index is
	// read first, matching the original access order.
	args := v.Args
	rawMask := args[3]
	x, y, z := args[0], args[1], args[2]
	blk := v.Block

	v.reset(OpAMD64VPDPWSSDMasked512)
	kmask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
	kmask.AddArg(rawMask)
	v.AddArg4(x, y, z, kmask)
	return true
}
// rewriteValueAMD64_OpAddDotProdMaskedInt32x4 rewrites v in place from
// AddDotProdMaskedInt32x4 to VPDPWSSDMasked128. The first three args are
// forwarded unchanged; the fourth is wrapped in a new VPMOVVec32x4ToM value
// of type types.TypeMask. It always returns true.
//
// match: (AddDotProdMaskedInt32x4 x y z mask)
// result: (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
func rewriteValueAMD64_OpAddDotProdMaskedInt32x4(v *Value) bool {
	// Read every operand before v.reset is called; the highest index is
	// read first, matching the original access order.
	args := v.Args
	rawMask := args[3]
	x, y, z := args[0], args[1], args[2]
	blk := v.Block

	v.reset(OpAMD64VPDPWSSDMasked128)
	kmask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
	kmask.AddArg(rawMask)
	v.AddArg4(x, y, z, kmask)
	return true
}
// rewriteValueAMD64_OpAddDotProdMaskedInt32x8 rewrites v in place from
// AddDotProdMaskedInt32x8 to VPDPWSSDMasked256. The first three args are
// forwarded unchanged; the fourth is wrapped in a new VPMOVVec32x8ToM value
// of type types.TypeMask. It always returns true.
//
// match: (AddDotProdMaskedInt32x8 x y z mask)
// result: (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
func rewriteValueAMD64_OpAddDotProdMaskedInt32x8(v *Value) bool {
	// Read every operand before v.reset is called; the highest index is
	// read first, matching the original access order.
	args := v.Args
	rawMask := args[3]
	x, y, z := args[0], args[1], args[2]
	blk := v.Block

	v.reset(OpAMD64VPDPWSSDMasked256)
	kmask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
	kmask.AddArg(rawMask)
	v.AddArg4(x, y, z, kmask)
	return true
}
func rewriteValueAMD64_OpAddMaskedFloat32x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
@ -45669,66 +45729,6 @@ func rewriteValueAMD64_OpOrMaskedUint64x8(v *Value) bool {
return true
}
}
// rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x16 rewrites v in place
// from PairDotProdAccumulateMaskedInt32x16 to VPDPWSSDMasked512. The first
// three args are forwarded unchanged; the fourth is wrapped in a new
// VPMOVVec32x16ToM value of type types.TypeMask. It always returns true.
//
// match: (PairDotProdAccumulateMaskedInt32x16 x y z mask)
// result: (VPDPWSSDMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
func rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x16(v *Value) bool {
	// Read every operand before v.reset is called; the highest index is
	// read first, matching the original access order.
	args := v.Args
	rawMask := args[3]
	x, y, z := args[0], args[1], args[2]
	blk := v.Block

	v.reset(OpAMD64VPDPWSSDMasked512)
	kmask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
	kmask.AddArg(rawMask)
	v.AddArg4(x, y, z, kmask)
	return true
}
// rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x4 rewrites v in place
// from PairDotProdAccumulateMaskedInt32x4 to VPDPWSSDMasked128. The first
// three args are forwarded unchanged; the fourth is wrapped in a new
// VPMOVVec32x4ToM value of type types.TypeMask. It always returns true.
//
// match: (PairDotProdAccumulateMaskedInt32x4 x y z mask)
// result: (VPDPWSSDMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
func rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x4(v *Value) bool {
	// Read every operand before v.reset is called; the highest index is
	// read first, matching the original access order.
	args := v.Args
	rawMask := args[3]
	x, y, z := args[0], args[1], args[2]
	blk := v.Block

	v.reset(OpAMD64VPDPWSSDMasked128)
	kmask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
	kmask.AddArg(rawMask)
	v.AddArg4(x, y, z, kmask)
	return true
}
// rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x8 rewrites v in place
// from PairDotProdAccumulateMaskedInt32x8 to VPDPWSSDMasked256. The first
// three args are forwarded unchanged; the fourth is wrapped in a new
// VPMOVVec32x8ToM value of type types.TypeMask. It always returns true.
//
// match: (PairDotProdAccumulateMaskedInt32x8 x y z mask)
// result: (VPDPWSSDMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
func rewriteValueAMD64_OpPairDotProdAccumulateMaskedInt32x8(v *Value) bool {
	// Read every operand before v.reset is called; the highest index is
	// read first, matching the original access order.
	args := v.Args
	rawMask := args[3]
	x, y, z := args[0], args[1], args[2]
	blk := v.Block

	v.reset(OpAMD64VPDPWSSDMasked256)
	kmask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
	kmask.AddArg(rawMask)
	v.AddArg4(x, y, z, kmask)
	return true
}
func rewriteValueAMD64_OpPairDotProdMaskedInt16x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
@ -49721,6 +49721,66 @@ func rewriteValueAMD64_OpRsh8x8(v *Value) bool {
}
return false
}
// rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x16 rewrites v in place
// from SaturatedAddDotProdMaskedInt32x16 to VPDPWSSDSMasked512. The first
// three args are forwarded unchanged; the fourth is wrapped in a new
// VPMOVVec32x16ToM value of type types.TypeMask. It always returns true.
//
// match: (SaturatedAddDotProdMaskedInt32x16 x y z mask)
// result: (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x16(v *Value) bool {
	// Read every operand before v.reset is called; the highest index is
	// read first, matching the original access order.
	args := v.Args
	rawMask := args[3]
	x, y, z := args[0], args[1], args[2]
	blk := v.Block

	v.reset(OpAMD64VPDPWSSDSMasked512)
	kmask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
	kmask.AddArg(rawMask)
	v.AddArg4(x, y, z, kmask)
	return true
}
// rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x4 rewrites v in place
// from SaturatedAddDotProdMaskedInt32x4 to VPDPWSSDSMasked128. The first
// three args are forwarded unchanged; the fourth is wrapped in a new
// VPMOVVec32x4ToM value of type types.TypeMask. It always returns true.
//
// match: (SaturatedAddDotProdMaskedInt32x4 x y z mask)
// result: (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x4(v *Value) bool {
	// Read every operand before v.reset is called; the highest index is
	// read first, matching the original access order.
	args := v.Args
	rawMask := args[3]
	x, y, z := args[0], args[1], args[2]
	blk := v.Block

	v.reset(OpAMD64VPDPWSSDSMasked128)
	kmask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
	kmask.AddArg(rawMask)
	v.AddArg4(x, y, z, kmask)
	return true
}
// rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8 rewrites v in place
// from SaturatedAddDotProdMaskedInt32x8 to VPDPWSSDSMasked256. The first
// three args are forwarded unchanged; the fourth is wrapped in a new
// VPMOVVec32x8ToM value of type types.TypeMask. It always returns true.
//
// match: (SaturatedAddDotProdMaskedInt32x8 x y z mask)
// result: (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
func rewriteValueAMD64_OpSaturatedAddDotProdMaskedInt32x8(v *Value) bool {
	// Read every operand before v.reset is called; the highest index is
	// read first, matching the original access order.
	args := v.Args
	rawMask := args[3]
	x, y, z := args[0], args[1], args[2]
	blk := v.Block

	v.reset(OpAMD64VPDPWSSDSMasked256)
	kmask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
	kmask.AddArg(rawMask)
	v.AddArg4(x, y, z, kmask)
	return true
}
func rewriteValueAMD64_OpSaturatedAddMaskedInt16x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
@ -49937,66 +49997,6 @@ func rewriteValueAMD64_OpSaturatedAddMaskedUint8x64(v *Value) bool {
return true
}
}
// rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x16 rewrites v
// in place from SaturatedPairDotProdAccumulateMaskedInt32x16 to
// VPDPWSSDSMasked512. The first three args are forwarded unchanged; the
// fourth is wrapped in a new VPMOVVec32x16ToM value of type types.TypeMask.
// It always returns true.
//
// match: (SaturatedPairDotProdAccumulateMaskedInt32x16 x y z mask)
// result: (VPDPWSSDSMasked512 x y z (VPMOVVec32x16ToM <types.TypeMask> mask))
func rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x16(v *Value) bool {
	// Read every operand before v.reset is called; the highest index is
	// read first, matching the original access order.
	args := v.Args
	rawMask := args[3]
	x, y, z := args[0], args[1], args[2]
	blk := v.Block

	v.reset(OpAMD64VPDPWSSDSMasked512)
	kmask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, types.TypeMask)
	kmask.AddArg(rawMask)
	v.AddArg4(x, y, z, kmask)
	return true
}
// rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x4 rewrites v
// in place from SaturatedPairDotProdAccumulateMaskedInt32x4 to
// VPDPWSSDSMasked128. The first three args are forwarded unchanged; the
// fourth is wrapped in a new VPMOVVec32x4ToM value of type types.TypeMask.
// It always returns true.
//
// match: (SaturatedPairDotProdAccumulateMaskedInt32x4 x y z mask)
// result: (VPDPWSSDSMasked128 x y z (VPMOVVec32x4ToM <types.TypeMask> mask))
func rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x4(v *Value) bool {
	// Read every operand before v.reset is called; the highest index is
	// read first, matching the original access order.
	args := v.Args
	rawMask := args[3]
	x, y, z := args[0], args[1], args[2]
	blk := v.Block

	v.reset(OpAMD64VPDPWSSDSMasked128)
	kmask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, types.TypeMask)
	kmask.AddArg(rawMask)
	v.AddArg4(x, y, z, kmask)
	return true
}
// rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x8 rewrites v
// in place from SaturatedPairDotProdAccumulateMaskedInt32x8 to
// VPDPWSSDSMasked256. The first three args are forwarded unchanged; the
// fourth is wrapped in a new VPMOVVec32x8ToM value of type types.TypeMask.
// It always returns true.
//
// match: (SaturatedPairDotProdAccumulateMaskedInt32x8 x y z mask)
// result: (VPDPWSSDSMasked256 x y z (VPMOVVec32x8ToM <types.TypeMask> mask))
func rewriteValueAMD64_OpSaturatedPairDotProdAccumulateMaskedInt32x8(v *Value) bool {
	// Read every operand before v.reset is called; the highest index is
	// read first, matching the original access order.
	args := v.Args
	rawMask := args[3]
	x, y, z := args[0], args[1], args[2]
	blk := v.Block

	v.reset(OpAMD64VPDPWSSDSMasked256)
	kmask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, types.TypeMask)
	kmask.AddArg(rawMask)
	v.AddArg4(x, y, z, kmask)
	return true
}
func rewriteValueAMD64_OpSaturatedSubMaskedInt16x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]

View file

@ -65,6 +65,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x2.Add", opLen2(ssa.OpAddUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.AddDotProd", opLen3(ssa.OpAddDotProdInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.AddDotProd", opLen3(ssa.OpAddDotProdInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.AddDotProd", opLen3(ssa.OpAddDotProdInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.AddDotProdMasked", opLen4(ssa.OpAddDotProdMaskedInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.AddDotProdMasked", opLen4(ssa.OpAddDotProdMaskedInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.AddDotProdMasked", opLen4(ssa.OpAddDotProdMaskedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.AddMasked", opLen3(ssa.OpAddMaskedFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.AddMasked", opLen3(ssa.OpAddMaskedFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.AddMasked", opLen3(ssa.OpAddMaskedFloat32x16, types.TypeVec512), sys.AMD64)
@ -1005,12 +1011,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Int16x8.PairDotProd", opLen2(ssa.OpPairDotProdInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.PairDotProd", opLen2(ssa.OpPairDotProdInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.PairDotProd", opLen2(ssa.OpPairDotProdInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.PairDotProdAccumulate", opLen3_31(ssa.OpPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.PairDotProdAccumulate", opLen3_31(ssa.OpPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.PairDotProdAccumulate", opLen3_31(ssa.OpPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.PairDotProdAccumulateMasked", opLen4_31(ssa.OpPairDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.PairDotProdAccumulateMasked", opLen4_31(ssa.OpPairDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.PairDotProdAccumulateMasked", opLen4_31(ssa.OpPairDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.PairDotProdMasked", opLen3(ssa.OpPairDotProdMaskedInt16x32, types.TypeVec512), sys.AMD64)
@ -1318,6 +1318,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint16x8.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x32.SaturatedAdd", opLen2(ssa.OpSaturatedAddUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.SaturatedAddDotProd", opLen3(ssa.OpSaturatedAddDotProdInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.SaturatedAddDotProdMasked", opLen4(ssa.OpSaturatedAddDotProdMaskedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedInt8x64, types.TypeVec512), sys.AMD64)
@ -1330,12 +1336,6 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint16x8.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x32.SaturatedAddMasked", opLen3(ssa.OpSaturatedAddMaskedUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.SaturatedPairDotProdAccumulate", opLen3_31(ssa.OpSaturatedPairDotProdAccumulateInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.SaturatedPairDotProdAccumulate", opLen3_31(ssa.OpSaturatedPairDotProdAccumulateInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.SaturatedPairDotProdAccumulate", opLen3_31(ssa.OpSaturatedPairDotProdAccumulateInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.SaturatedPairDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.SaturatedPairDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.SaturatedPairDotProdAccumulateMasked", opLen4_31(ssa.OpSaturatedPairDotProdAccumulateMaskedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.SaturatedPairwiseAdd", opLen2(ssa.OpSaturatedPairwiseAddInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.SaturatedPairwiseSub", opLen2(ssa.OpSaturatedPairwiseSubInt16x8, types.TypeVec128), sys.AMD64)

View file

@ -304,6 +304,46 @@ func (x Uint64x4) Add(y Uint64x4) Uint64x4
// Asm: VPADDQ, CPU Feature: AVX512F
func (x Uint64x8) Add(y Uint64x8) Uint64x8
/* AddDotProd */
// AddDotProd performs dot products on pairs of elements of y and z and then adds x.
// The receiver x is the Int32x4 accumulator and has the same type as the result;
// y and z are the Int16x8 operands of the dot products.
//
// Asm: VPDPWSSD, CPU Feature: AVXVNNI
func (x Int32x4) AddDotProd(y Int16x8, z Int16x8) Int32x4
// AddDotProd performs dot products on pairs of elements of y and z and then adds x.
// The receiver x is the Int32x8 accumulator and has the same type as the result;
// y and z are the Int16x16 operands of the dot products.
//
// Asm: VPDPWSSD, CPU Feature: AVXVNNI
func (x Int32x8) AddDotProd(y Int16x16, z Int16x16) Int32x8
// AddDotProd performs dot products on pairs of elements of y and z and then adds x.
// The receiver x is the Int32x16 accumulator and has the same type as the result;
// y and z are the Int16x32 operands of the dot products.
//
// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
func (x Int32x16) AddDotProd(y Int16x32, z Int16x32) Int32x16
/* AddDotProdMasked */
// AddDotProdMasked performs dot products on pairs of elements of y and z and then adds x.
// The receiver x is the Int32x4 accumulator and has the same type as the result;
// y and z are the Int16x8 operands of the dot products.
//
// This operation is applied selectively under a write mask.
// NOTE(review): the contents of result elements not selected by mask are not
// stated here; confirm and document.
//
// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
func (x Int32x4) AddDotProdMasked(y Int16x8, z Int16x8, mask Mask32x4) Int32x4
// AddDotProdMasked performs dot products on pairs of elements of y and z and then adds x.
// The receiver x is the Int32x8 accumulator and has the same type as the result;
// y and z are the Int16x16 operands of the dot products.
//
// This operation is applied selectively under a write mask.
// NOTE(review): the contents of result elements not selected by mask are not
// stated here; confirm and document.
//
// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
func (x Int32x8) AddDotProdMasked(y Int16x16, z Int16x16, mask Mask32x8) Int32x8
// AddDotProdMasked performs dot products on pairs of elements of y and z and then adds x.
// The receiver x is the Int32x16 accumulator and has the same type as the result;
// y and z are the Int16x32 operands of the dot products.
//
// This operation is applied selectively under a write mask.
// NOTE(review): the contents of result elements not selected by mask are not
// stated here; confirm and document.
//
// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
func (x Int32x16) AddDotProdMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16
/* AddMasked */
// AddMasked adds corresponding elements of two vectors.
@ -6339,46 +6379,6 @@ func (x Int16x16) PairDotProd(y Int16x16) Int32x8
// Asm: VPMADDWD, CPU Feature: AVX512BW
func (x Int16x32) PairDotProd(y Int16x32) Int32x16
/* PairDotProdAccumulate */
// PairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z.
// Each of the 4 32-bit result elements consumes two 16-bit elements from each of x and y;
// following VPDPWSSD, result[i] = z[i] + x[2i]*y[2i] + x[2i+1]*y[2i+1].
//
// Asm: VPDPWSSD, CPU Feature: AVXVNNI
func (x Int16x8) PairDotProdAccumulate(y Int16x8, z Int32x4) Int32x4
// PairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z.
// Each of the 8 32-bit result elements consumes two 16-bit elements from each of x and y;
// following VPDPWSSD, result[i] = z[i] + x[2i]*y[2i] + x[2i+1]*y[2i+1].
//
// Asm: VPDPWSSD, CPU Feature: AVXVNNI
func (x Int16x16) PairDotProdAccumulate(y Int16x16, z Int32x8) Int32x8
// PairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z.
// Each of the 16 32-bit result elements consumes two 16-bit elements from each of x and y;
// following VPDPWSSD, result[i] = z[i] + x[2i]*y[2i] + x[2i+1]*y[2i+1].
//
// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
func (x Int16x32) PairDotProdAccumulate(y Int16x32, z Int32x16) Int32x16
/* PairDotProdAccumulateMasked */
// PairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z.
// Each of the 4 32-bit result elements consumes two 16-bit elements from each of x and y;
// following VPDPWSSD, result[i] = z[i] + x[2i]*y[2i] + x[2i+1]*y[2i+1].
//
// This operation is applied selectively under a write mask.
// mask carries one entry per 32-bit result element.
// NOTE(review): the value of unselected result elements is not stated here — confirm.
//
// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
func (x Int16x8) PairDotProdAccumulateMasked(y Int16x8, z Int32x4, mask Mask32x4) Int32x4
// PairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z.
// Each of the 8 32-bit result elements consumes two 16-bit elements from each of x and y;
// following VPDPWSSD, result[i] = z[i] + x[2i]*y[2i] + x[2i+1]*y[2i+1].
//
// This operation is applied selectively under a write mask.
// mask carries one entry per 32-bit result element.
// NOTE(review): the value of unselected result elements is not stated here — confirm.
//
// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
func (x Int16x16) PairDotProdAccumulateMasked(y Int16x16, z Int32x8, mask Mask32x8) Int32x8
// PairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z.
// Each of the 16 32-bit result elements consumes two 16-bit elements from each of x and y;
// following VPDPWSSD, result[i] = z[i] + x[2i]*y[2i] + x[2i+1]*y[2i+1].
//
// This operation is applied selectively under a write mask.
// mask carries one entry per 32-bit result element.
// NOTE(review): the value of unselected result elements is not stated here — confirm.
//
// Asm: VPDPWSSD, CPU Feature: AVX512VNNI
func (x Int16x32) PairDotProdAccumulateMasked(y Int16x32, z Int32x16, mask Mask32x16) Int32x16
/* PairDotProdMasked */
// PairDotProdMasked multiplies the elements and adds the pairs together,
@ -8649,6 +8649,46 @@ func (x Uint16x16) SaturatedAdd(y Uint16x16) Uint16x16
// Asm: VPADDSW, CPU Feature: AVX512BW
func (x Uint16x32) SaturatedAdd(y Uint16x32) Uint16x32
/* SaturatedAddDotProd */
// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x
// with saturation.
// Each of the 4 32-bit result elements consumes two 16-bit elements from each of y and z;
// following VPDPWSSDS, result[i] = x[i] + y[2i]*z[2i] + y[2i+1]*z[2i+1], with the
// final sum saturated to the signed 32-bit range instead of wrapping.
//
// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
func (x Int32x4) SaturatedAddDotProd(y Int16x8, z Int16x8) Int32x4
// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x
// with saturation.
// Each of the 8 32-bit result elements consumes two 16-bit elements from each of y and z;
// following VPDPWSSDS, result[i] = x[i] + y[2i]*z[2i] + y[2i+1]*z[2i+1], with the
// final sum saturated to the signed 32-bit range instead of wrapping.
//
// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
func (x Int32x8) SaturatedAddDotProd(y Int16x16, z Int16x16) Int32x8
// SaturatedAddDotProd performs dot products on pairs of elements of y and z and then adds x
// with saturation.
// Each of the 16 32-bit result elements consumes two 16-bit elements from each of y and z;
// following VPDPWSSDS, result[i] = x[i] + y[2i]*z[2i] + y[2i+1]*z[2i+1], with the
// final sum saturated to the signed 32-bit range instead of wrapping.
//
// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
func (x Int32x16) SaturatedAddDotProd(y Int16x32, z Int16x32) Int32x16
/* SaturatedAddDotProdMasked */
// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x
// with saturation.
// Each of the 4 32-bit result elements consumes two 16-bit elements from each of y and z;
// following VPDPWSSDS, result[i] = x[i] + y[2i]*z[2i] + y[2i+1]*z[2i+1], with the
// final sum saturated to the signed 32-bit range instead of wrapping.
//
// This operation is applied selectively under a write mask.
// mask carries one entry per 32-bit result element.
// NOTE(review): the value of unselected result elements is not stated here — confirm.
//
// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
func (x Int32x4) SaturatedAddDotProdMasked(y Int16x8, z Int16x8, mask Mask32x4) Int32x4
// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x
// with saturation.
// Each of the 8 32-bit result elements consumes two 16-bit elements from each of y and z;
// following VPDPWSSDS, result[i] = x[i] + y[2i]*z[2i] + y[2i+1]*z[2i+1], with the
// final sum saturated to the signed 32-bit range instead of wrapping.
//
// This operation is applied selectively under a write mask.
// mask carries one entry per 32-bit result element.
// NOTE(review): the value of unselected result elements is not stated here — confirm.
//
// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
func (x Int32x8) SaturatedAddDotProdMasked(y Int16x16, z Int16x16, mask Mask32x8) Int32x8
// SaturatedAddDotProdMasked performs dot products on pairs of elements of y and z and then adds x
// with saturation.
// Each of the 16 32-bit result elements consumes two 16-bit elements from each of y and z;
// following VPDPWSSDS, result[i] = x[i] + y[2i]*z[2i] + y[2i+1]*z[2i+1], with the
// final sum saturated to the signed 32-bit range instead of wrapping.
//
// This operation is applied selectively under a write mask.
// mask carries one entry per 32-bit result element.
// NOTE(review): the value of unselected result elements is not stated here — confirm.
//
// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
func (x Int32x16) SaturatedAddDotProdMasked(y Int16x32, z Int16x32, mask Mask32x16) Int32x16
/* SaturatedAddMasked */
// SaturatedAddMasked adds corresponding elements of two vectors with saturation.
@ -8735,46 +8775,6 @@ func (x Uint16x16) SaturatedAddMasked(y Uint16x16, mask Mask16x16) Uint16x16
// Asm: VPADDSW, CPU Feature: AVX512BW
func (x Uint16x32) SaturatedAddMasked(y Uint16x32, mask Mask16x32) Uint16x32
/* SaturatedPairDotProdAccumulate */
// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z
// with saturation.
// Each of the 4 32-bit result elements consumes two 16-bit elements from each of x and y;
// following VPDPWSSDS, result[i] = z[i] + x[2i]*y[2i] + x[2i+1]*y[2i+1], with the
// final sum saturated to the signed 32-bit range instead of wrapping.
//
// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
func (x Int16x8) SaturatedPairDotProdAccumulate(y Int16x8, z Int32x4) Int32x4
// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z
// with saturation.
// Each of the 8 32-bit result elements consumes two 16-bit elements from each of x and y;
// following VPDPWSSDS, result[i] = z[i] + x[2i]*y[2i] + x[2i+1]*y[2i+1], with the
// final sum saturated to the signed 32-bit range instead of wrapping.
//
// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
func (x Int16x16) SaturatedPairDotProdAccumulate(y Int16x16, z Int32x8) Int32x8
// SaturatedPairDotProdAccumulate performs dot products on pairs of elements of x and y and then adds z
// with saturation.
// Each of the 16 32-bit result elements consumes two 16-bit elements from each of x and y;
// following VPDPWSSDS, result[i] = z[i] + x[2i]*y[2i] + x[2i+1]*y[2i+1], with the
// final sum saturated to the signed 32-bit range instead of wrapping.
//
// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
func (x Int16x32) SaturatedPairDotProdAccumulate(y Int16x32, z Int32x16) Int32x16
/* SaturatedPairDotProdAccumulateMasked */
// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z
// with saturation.
// Each of the 4 32-bit result elements consumes two 16-bit elements from each of x and y;
// following VPDPWSSDS, result[i] = z[i] + x[2i]*y[2i] + x[2i+1]*y[2i+1], with the
// final sum saturated to the signed 32-bit range instead of wrapping.
//
// This operation is applied selectively under a write mask.
// mask carries one entry per 32-bit result element.
// NOTE(review): the value of unselected result elements is not stated here — confirm.
//
// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
func (x Int16x8) SaturatedPairDotProdAccumulateMasked(y Int16x8, z Int32x4, mask Mask32x4) Int32x4
// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z
// with saturation.
// Each of the 8 32-bit result elements consumes two 16-bit elements from each of x and y;
// following VPDPWSSDS, result[i] = z[i] + x[2i]*y[2i] + x[2i+1]*y[2i+1], with the
// final sum saturated to the signed 32-bit range instead of wrapping.
//
// This operation is applied selectively under a write mask.
// mask carries one entry per 32-bit result element.
// NOTE(review): the value of unselected result elements is not stated here — confirm.
//
// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
func (x Int16x16) SaturatedPairDotProdAccumulateMasked(y Int16x16, z Int32x8, mask Mask32x8) Int32x8
// SaturatedPairDotProdAccumulateMasked performs dot products on pairs of elements of x and y and then adds z
// with saturation.
// Each of the 16 32-bit result elements consumes two 16-bit elements from each of x and y;
// following VPDPWSSDS, result[i] = z[i] + x[2i]*y[2i] + x[2i+1]*y[2i+1], with the
// final sum saturated to the signed 32-bit range instead of wrapping.
//
// This operation is applied selectively under a write mask.
// mask carries one entry per 32-bit result element.
// NOTE(review): the value of unselected result elements is not stated here — confirm.
//
// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
func (x Int16x32) SaturatedPairDotProdAccumulateMasked(y Int16x32, z Int32x16, mask Mask32x16) Int32x16
/* SaturatedPairwiseAdd */
// SaturatedPairwiseAdd horizontally adds adjacent pairs of elements with saturation.

View file

@ -197,7 +197,7 @@ func TestPairDotProdAccumulate(t *testing.T) {
z := simd.LoadInt32x4Slice([]int32{3, 3, 3, 3})
want := []int32{11, 11, 11, 11}
got := make([]int32, 4)
z = x.PairDotProdAccumulate(x, z)
z = z.AddDotProd(x, x)
z.StoreSlice(got)
for i := range 4 {
if got[i] != want[i] {