[dev.simd] cmd/compile, simd: update DotProd to DotProduct

API naming changes.

This CL also removes AddDotProductPairsSaturated.

Change-Id: I02e6d45268704f3ed4eaf62f0ecb7dc936b42124
Reviewed-on: https://go-review.googlesource.com/c/go/+/710935
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
Junyang Shao 2025-10-10 19:18:01 +00:00
parent 647c790143
commit 416332dba2
10 changed files with 113 additions and 534 deletions

View file

@ -1142,9 +1142,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
case ssa.OpAMD64VPDPWSSD128, case ssa.OpAMD64VPDPWSSD128,
ssa.OpAMD64VPDPWSSD256, ssa.OpAMD64VPDPWSSD256,
ssa.OpAMD64VPDPWSSD512, ssa.OpAMD64VPDPWSSD512,
ssa.OpAMD64VPDPWSSDS128,
ssa.OpAMD64VPDPWSSDS256,
ssa.OpAMD64VPDPWSSDS512,
ssa.OpAMD64VPDPBUSD128, ssa.OpAMD64VPDPBUSD128,
ssa.OpAMD64VPDPBUSD256, ssa.OpAMD64VPDPBUSD256,
ssa.OpAMD64VPDPBUSD512, ssa.OpAMD64VPDPBUSD512,
@ -1210,9 +1207,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
case ssa.OpAMD64VPDPWSSDMasked128, case ssa.OpAMD64VPDPWSSDMasked128,
ssa.OpAMD64VPDPWSSDMasked256, ssa.OpAMD64VPDPWSSDMasked256,
ssa.OpAMD64VPDPWSSDMasked512, ssa.OpAMD64VPDPWSSDMasked512,
ssa.OpAMD64VPDPWSSDSMasked128,
ssa.OpAMD64VPDPWSSDSMasked256,
ssa.OpAMD64VPDPWSSDSMasked512,
ssa.OpAMD64VPDPBUSDMasked128, ssa.OpAMD64VPDPBUSDMasked128,
ssa.OpAMD64VPDPBUSDMasked256, ssa.OpAMD64VPDPBUSDMasked256,
ssa.OpAMD64VPDPBUSDMasked512, ssa.OpAMD64VPDPBUSDMasked512,
@ -1500,7 +1494,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
p = simdV21load(s, v) p = simdV21load(s, v)
case ssa.OpAMD64VPDPWSSD512load, case ssa.OpAMD64VPDPWSSD512load,
ssa.OpAMD64VPDPWSSDS512load,
ssa.OpAMD64VPDPBUSD512load, ssa.OpAMD64VPDPBUSD512load,
ssa.OpAMD64VPDPBUSDS512load, ssa.OpAMD64VPDPBUSDS512load,
ssa.OpAMD64VFMADD213PS128load, ssa.OpAMD64VFMADD213PS128load,
@ -1550,9 +1543,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
case ssa.OpAMD64VPDPWSSDMasked128load, case ssa.OpAMD64VPDPWSSDMasked128load,
ssa.OpAMD64VPDPWSSDMasked256load, ssa.OpAMD64VPDPWSSDMasked256load,
ssa.OpAMD64VPDPWSSDMasked512load, ssa.OpAMD64VPDPWSSDMasked512load,
ssa.OpAMD64VPDPWSSDSMasked128load,
ssa.OpAMD64VPDPWSSDSMasked256load,
ssa.OpAMD64VPDPWSSDSMasked512load,
ssa.OpAMD64VPDPBUSDMasked128load, ssa.OpAMD64VPDPBUSDMasked128load,
ssa.OpAMD64VPDPBUSDMasked256load, ssa.OpAMD64VPDPBUSDMasked256load,
ssa.OpAMD64VPDPBUSDMasked512load, ssa.OpAMD64VPDPBUSDMasked512load,
@ -1971,9 +1961,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPDPWSSDMasked128, ssa.OpAMD64VPDPWSSDMasked128,
ssa.OpAMD64VPDPWSSDMasked256, ssa.OpAMD64VPDPWSSDMasked256,
ssa.OpAMD64VPDPWSSDMasked512, ssa.OpAMD64VPDPWSSDMasked512,
ssa.OpAMD64VPDPWSSDSMasked128,
ssa.OpAMD64VPDPWSSDSMasked256,
ssa.OpAMD64VPDPWSSDSMasked512,
ssa.OpAMD64VPDPBUSDMasked128, ssa.OpAMD64VPDPBUSDMasked128,
ssa.OpAMD64VPDPBUSDMasked256, ssa.OpAMD64VPDPBUSDMasked256,
ssa.OpAMD64VPDPBUSDMasked512, ssa.OpAMD64VPDPBUSDMasked512,

View file

@ -52,15 +52,12 @@
(AddUint64x2 ...) => (VPADDQ128 ...) (AddUint64x2 ...) => (VPADDQ128 ...)
(AddUint64x4 ...) => (VPADDQ256 ...) (AddUint64x4 ...) => (VPADDQ256 ...)
(AddUint64x8 ...) => (VPADDQ512 ...) (AddUint64x8 ...) => (VPADDQ512 ...)
(AddDotProdPairsSaturatedInt32x4 ...) => (VPDPWSSDS128 ...) (AddDotProductQuadrupleInt32x4 ...) => (VPDPBUSD128 ...)
(AddDotProdPairsSaturatedInt32x8 ...) => (VPDPWSSDS256 ...) (AddDotProductQuadrupleInt32x8 ...) => (VPDPBUSD256 ...)
(AddDotProdPairsSaturatedInt32x16 ...) => (VPDPWSSDS512 ...) (AddDotProductQuadrupleInt32x16 ...) => (VPDPBUSD512 ...)
(AddDotProdQuadrupleInt32x4 ...) => (VPDPBUSD128 ...) (AddDotProductQuadrupleSaturatedInt32x4 ...) => (VPDPBUSDS128 ...)
(AddDotProdQuadrupleInt32x8 ...) => (VPDPBUSD256 ...) (AddDotProductQuadrupleSaturatedInt32x8 ...) => (VPDPBUSDS256 ...)
(AddDotProdQuadrupleInt32x16 ...) => (VPDPBUSD512 ...) (AddDotProductQuadrupleSaturatedInt32x16 ...) => (VPDPBUSDS512 ...)
(AddDotProdQuadrupleSaturatedInt32x4 ...) => (VPDPBUSDS128 ...)
(AddDotProdQuadrupleSaturatedInt32x8 ...) => (VPDPBUSDS256 ...)
(AddDotProdQuadrupleSaturatedInt32x16 ...) => (VPDPBUSDS512 ...)
(AddPairsFloat32x4 ...) => (VHADDPS128 ...) (AddPairsFloat32x4 ...) => (VHADDPS128 ...)
(AddPairsFloat32x8 ...) => (VHADDPS256 ...) (AddPairsFloat32x8 ...) => (VHADDPS256 ...)
(AddPairsFloat64x2 ...) => (VHADDPD128 ...) (AddPairsFloat64x2 ...) => (VHADDPD128 ...)
@ -353,12 +350,12 @@
(DivFloat64x2 ...) => (VDIVPD128 ...) (DivFloat64x2 ...) => (VDIVPD128 ...)
(DivFloat64x4 ...) => (VDIVPD256 ...) (DivFloat64x4 ...) => (VDIVPD256 ...)
(DivFloat64x8 ...) => (VDIVPD512 ...) (DivFloat64x8 ...) => (VDIVPD512 ...)
(DotProdPairsInt16x8 ...) => (VPMADDWD128 ...) (DotProductPairsInt16x8 ...) => (VPMADDWD128 ...)
(DotProdPairsInt16x16 ...) => (VPMADDWD256 ...) (DotProductPairsInt16x16 ...) => (VPMADDWD256 ...)
(DotProdPairsInt16x32 ...) => (VPMADDWD512 ...) (DotProductPairsInt16x32 ...) => (VPMADDWD512 ...)
(DotProdPairsSaturatedUint8x16 ...) => (VPMADDUBSW128 ...) (DotProductPairsSaturatedUint8x16 ...) => (VPMADDUBSW128 ...)
(DotProdPairsSaturatedUint8x32 ...) => (VPMADDUBSW256 ...) (DotProductPairsSaturatedUint8x32 ...) => (VPMADDUBSW256 ...)
(DotProdPairsSaturatedUint8x64 ...) => (VPMADDUBSW512 ...) (DotProductPairsSaturatedUint8x64 ...) => (VPMADDUBSW512 ...)
(EqualFloat32x4 x y) => (VCMPPS128 [0] x y) (EqualFloat32x4 x y) => (VCMPPS128 [0] x y)
(EqualFloat32x8 x y) => (VCMPPS256 [0] x y) (EqualFloat32x8 x y) => (VCMPPS256 [0] x y)
(EqualFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [0] x y)) (EqualFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [0] x y))
@ -1328,7 +1325,6 @@
(VMOVDQU32Masked512 (VPABSD512 x) mask) => (VPABSDMasked512 x mask) (VMOVDQU32Masked512 (VPABSD512 x) mask) => (VPABSDMasked512 x mask)
(VMOVDQU64Masked512 (VPABSQ512 x) mask) => (VPABSQMasked512 x mask) (VMOVDQU64Masked512 (VPABSQ512 x) mask) => (VPABSQMasked512 x mask)
(VMOVDQU32Masked512 (VPDPWSSD512 x y z) mask) => (VPDPWSSDMasked512 x y z mask) (VMOVDQU32Masked512 (VPDPWSSD512 x y z) mask) => (VPDPWSSDMasked512 x y z mask)
(VMOVDQU32Masked512 (VPDPWSSDS512 x y z) mask) => (VPDPWSSDSMasked512 x y z mask)
(VMOVDQU32Masked512 (VPDPBUSD512 x y z) mask) => (VPDPBUSDMasked512 x y z mask) (VMOVDQU32Masked512 (VPDPBUSD512 x y z) mask) => (VPDPBUSDMasked512 x y z mask)
(VMOVDQU32Masked512 (VPDPBUSDS512 x y z) mask) => (VPDPBUSDSMasked512 x y z mask) (VMOVDQU32Masked512 (VPDPBUSDS512 x y z) mask) => (VPDPBUSDSMasked512 x y z mask)
(VMOVDQU32Masked512 (VADDPS512 x y) mask) => (VADDPSMasked512 x y mask) (VMOVDQU32Masked512 (VADDPS512 x y) mask) => (VADDPSMasked512 x y mask)
@ -1521,10 +1517,6 @@
(VPDPWSSDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDMasked128load {sym} [off] x y ptr mask mem) (VPDPWSSDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDMasked128load {sym} [off] x y ptr mask mem)
(VPDPWSSDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDMasked256load {sym} [off] x y ptr mask mem) (VPDPWSSDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDMasked256load {sym} [off] x y ptr mask mem)
(VPDPWSSDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDMasked512load {sym} [off] x y ptr mask mem) (VPDPWSSDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDMasked512load {sym} [off] x y ptr mask mem)
(VPDPWSSDS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDS512load {sym} [off] x y ptr mem)
(VPDPWSSDSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDSMasked128load {sym} [off] x y ptr mask mem)
(VPDPWSSDSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDSMasked256load {sym} [off] x y ptr mask mem)
(VPDPWSSDSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDSMasked512load {sym} [off] x y ptr mask mem)
(VPDPBUSD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSD512load {sym} [off] x y ptr mem) (VPDPBUSD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSD512load {sym} [off] x y ptr mem)
(VPDPBUSDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDMasked128load {sym} [off] x y ptr mask mem) (VPDPBUSDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDMasked128load {sym} [off] x y ptr mask mem)
(VPDPBUSDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDMasked256load {sym} [off] x y ptr mask mem) (VPDPBUSDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDMasked256load {sym} [off] x y ptr mask mem)

View file

@ -368,12 +368,6 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPDPWSSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPDPWSSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPDPWSSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPDPWSSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPDPWSSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPDPWSSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPDPWSSDS128", argLength: 3, reg: v31, asm: "VPDPWSSDS", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPDPWSSDS256", argLength: 3, reg: v31, asm: "VPDPWSSDS", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPDPWSSDS512", argLength: 3, reg: w31, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPDPWSSDSMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPDPWSSDSMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPDPWSSDSMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
@ -1346,10 +1340,6 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPDPWSSDMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPWSSD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPWSSDMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPWSSD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
{name: "VPDPWSSDMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPWSSD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPWSSDMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPWSSD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
{name: "VPDPWSSDMasked512load", argLength: 5, reg: w3kwload, asm: "VPDPWSSD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPWSSDMasked512load", argLength: 5, reg: w3kwload, asm: "VPDPWSSD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
{name: "VPDPWSSDS512load", argLength: 4, reg: w31load, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
{name: "VPDPWSSDSMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPWSSDS", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
{name: "VPDPWSSDSMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPWSSDS", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},
{name: "VPDPWSSDSMasked512load", argLength: 5, reg: w3kwload, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
{name: "VPDPBUSD512load", argLength: 4, reg: w31load, asm: "VPDPBUSD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPBUSD512load", argLength: 4, reg: w31load, asm: "VPDPBUSD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true},
{name: "VPDPBUSDMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPBUSD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPBUSDMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPBUSD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true},
{name: "VPDPBUSDMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPBUSD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPBUSDMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPBUSD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true},

View file

@ -25,15 +25,12 @@ func simdGenericOps() []opData {
{name: "AbsInt64x2", argLength: 1, commutative: false}, {name: "AbsInt64x2", argLength: 1, commutative: false},
{name: "AbsInt64x4", argLength: 1, commutative: false}, {name: "AbsInt64x4", argLength: 1, commutative: false},
{name: "AbsInt64x8", argLength: 1, commutative: false}, {name: "AbsInt64x8", argLength: 1, commutative: false},
{name: "AddDotProdPairsSaturatedInt32x4", argLength: 3, commutative: false}, {name: "AddDotProductQuadrupleInt32x4", argLength: 3, commutative: false},
{name: "AddDotProdPairsSaturatedInt32x8", argLength: 3, commutative: false}, {name: "AddDotProductQuadrupleInt32x8", argLength: 3, commutative: false},
{name: "AddDotProdPairsSaturatedInt32x16", argLength: 3, commutative: false}, {name: "AddDotProductQuadrupleInt32x16", argLength: 3, commutative: false},
{name: "AddDotProdQuadrupleInt32x4", argLength: 3, commutative: false}, {name: "AddDotProductQuadrupleSaturatedInt32x4", argLength: 3, commutative: false},
{name: "AddDotProdQuadrupleInt32x8", argLength: 3, commutative: false}, {name: "AddDotProductQuadrupleSaturatedInt32x8", argLength: 3, commutative: false},
{name: "AddDotProdQuadrupleInt32x16", argLength: 3, commutative: false}, {name: "AddDotProductQuadrupleSaturatedInt32x16", argLength: 3, commutative: false},
{name: "AddDotProdQuadrupleSaturatedInt32x4", argLength: 3, commutative: false},
{name: "AddDotProdQuadrupleSaturatedInt32x8", argLength: 3, commutative: false},
{name: "AddDotProdQuadrupleSaturatedInt32x16", argLength: 3, commutative: false},
{name: "AddFloat32x4", argLength: 2, commutative: true}, {name: "AddFloat32x4", argLength: 2, commutative: true},
{name: "AddFloat32x8", argLength: 2, commutative: true}, {name: "AddFloat32x8", argLength: 2, commutative: true},
{name: "AddFloat32x16", argLength: 2, commutative: true}, {name: "AddFloat32x16", argLength: 2, commutative: true},
@ -344,12 +341,12 @@ func simdGenericOps() []opData {
{name: "DivFloat64x2", argLength: 2, commutative: false}, {name: "DivFloat64x2", argLength: 2, commutative: false},
{name: "DivFloat64x4", argLength: 2, commutative: false}, {name: "DivFloat64x4", argLength: 2, commutative: false},
{name: "DivFloat64x8", argLength: 2, commutative: false}, {name: "DivFloat64x8", argLength: 2, commutative: false},
{name: "DotProdPairsInt16x8", argLength: 2, commutative: false}, {name: "DotProductPairsInt16x8", argLength: 2, commutative: false},
{name: "DotProdPairsInt16x16", argLength: 2, commutative: false}, {name: "DotProductPairsInt16x16", argLength: 2, commutative: false},
{name: "DotProdPairsInt16x32", argLength: 2, commutative: false}, {name: "DotProductPairsInt16x32", argLength: 2, commutative: false},
{name: "DotProdPairsSaturatedUint8x16", argLength: 2, commutative: false}, {name: "DotProductPairsSaturatedUint8x16", argLength: 2, commutative: false},
{name: "DotProdPairsSaturatedUint8x32", argLength: 2, commutative: false}, {name: "DotProductPairsSaturatedUint8x32", argLength: 2, commutative: false},
{name: "DotProdPairsSaturatedUint8x64", argLength: 2, commutative: false}, {name: "DotProductPairsSaturatedUint8x64", argLength: 2, commutative: false},
{name: "EqualFloat32x4", argLength: 2, commutative: true}, {name: "EqualFloat32x4", argLength: 2, commutative: true},
{name: "EqualFloat32x8", argLength: 2, commutative: true}, {name: "EqualFloat32x8", argLength: 2, commutative: true},
{name: "EqualFloat32x16", argLength: 2, commutative: true}, {name: "EqualFloat32x16", argLength: 2, commutative: true},

View file

@ -1608,12 +1608,6 @@ const (
OpAMD64VPDPWSSDMasked128 OpAMD64VPDPWSSDMasked128
OpAMD64VPDPWSSDMasked256 OpAMD64VPDPWSSDMasked256
OpAMD64VPDPWSSDMasked512 OpAMD64VPDPWSSDMasked512
OpAMD64VPDPWSSDS128
OpAMD64VPDPWSSDS256
OpAMD64VPDPWSSDS512
OpAMD64VPDPWSSDSMasked128
OpAMD64VPDPWSSDSMasked256
OpAMD64VPDPWSSDSMasked512
OpAMD64VPERMB256 OpAMD64VPERMB256
OpAMD64VPERMB512 OpAMD64VPERMB512
OpAMD64VPERMBMasked256 OpAMD64VPERMBMasked256
@ -2586,10 +2580,6 @@ const (
OpAMD64VPDPWSSDMasked128load OpAMD64VPDPWSSDMasked128load
OpAMD64VPDPWSSDMasked256load OpAMD64VPDPWSSDMasked256load
OpAMD64VPDPWSSDMasked512load OpAMD64VPDPWSSDMasked512load
OpAMD64VPDPWSSDS512load
OpAMD64VPDPWSSDSMasked128load
OpAMD64VPDPWSSDSMasked256load
OpAMD64VPDPWSSDSMasked512load
OpAMD64VPDPBUSD512load OpAMD64VPDPBUSD512load
OpAMD64VPDPBUSDMasked128load OpAMD64VPDPBUSDMasked128load
OpAMD64VPDPBUSDMasked256load OpAMD64VPDPBUSDMasked256load
@ -5416,15 +5406,12 @@ const (
OpAbsInt64x2 OpAbsInt64x2
OpAbsInt64x4 OpAbsInt64x4
OpAbsInt64x8 OpAbsInt64x8
OpAddDotProdPairsSaturatedInt32x4 OpAddDotProductQuadrupleInt32x4
OpAddDotProdPairsSaturatedInt32x8 OpAddDotProductQuadrupleInt32x8
OpAddDotProdPairsSaturatedInt32x16 OpAddDotProductQuadrupleInt32x16
OpAddDotProdQuadrupleInt32x4 OpAddDotProductQuadrupleSaturatedInt32x4
OpAddDotProdQuadrupleInt32x8 OpAddDotProductQuadrupleSaturatedInt32x8
OpAddDotProdQuadrupleInt32x16 OpAddDotProductQuadrupleSaturatedInt32x16
OpAddDotProdQuadrupleSaturatedInt32x4
OpAddDotProdQuadrupleSaturatedInt32x8
OpAddDotProdQuadrupleSaturatedInt32x16
OpAddFloat32x4 OpAddFloat32x4
OpAddFloat32x8 OpAddFloat32x8
OpAddFloat32x16 OpAddFloat32x16
@ -5735,12 +5722,12 @@ const (
OpDivFloat64x2 OpDivFloat64x2
OpDivFloat64x4 OpDivFloat64x4
OpDivFloat64x8 OpDivFloat64x8
OpDotProdPairsInt16x8 OpDotProductPairsInt16x8
OpDotProdPairsInt16x16 OpDotProductPairsInt16x16
OpDotProdPairsInt16x32 OpDotProductPairsInt16x32
OpDotProdPairsSaturatedUint8x16 OpDotProductPairsSaturatedUint8x16
OpDotProdPairsSaturatedUint8x32 OpDotProductPairsSaturatedUint8x32
OpDotProdPairsSaturatedUint8x64 OpDotProductPairsSaturatedUint8x64
OpEqualFloat32x4 OpEqualFloat32x4
OpEqualFloat32x8 OpEqualFloat32x8
OpEqualFloat32x16 OpEqualFloat32x16
@ -25338,105 +25325,6 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "VPDPWSSDS128",
argLen: 3,
resultInArg0: true,
asm: x86.AVPDPWSSDS,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{2, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPDPWSSDS256",
argLen: 3,
resultInArg0: true,
asm: x86.AVPDPWSSDS,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{2, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPDPWSSDS512",
argLen: 3,
resultInArg0: true,
asm: x86.AVPDPWSSDS,
reg: regInfo{
inputs: []inputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPDPWSSDSMasked128",
argLen: 4,
resultInArg0: true,
asm: x86.AVPDPWSSDS,
reg: regInfo{
inputs: []inputInfo{
{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPDPWSSDSMasked256",
argLen: 4,
resultInArg0: true,
asm: x86.AVPDPWSSDS,
reg: regInfo{
inputs: []inputInfo{
{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPDPWSSDSMasked512",
argLen: 4,
resultInArg0: true,
asm: x86.AVPDPWSSDS,
reg: regInfo{
inputs: []inputInfo{
{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{ {
name: "VPERMB256", name: "VPERMB256",
argLen: 2, argLen: 2,
@ -39773,81 +39661,6 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "VPDPWSSDS512load",
auxType: auxSymOff,
argLen: 4,
resultInArg0: true,
symEffect: SymRead,
asm: x86.AVPDPWSSDS,
reg: regInfo{
inputs: []inputInfo{
{2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPDPWSSDSMasked128load",
auxType: auxSymOff,
argLen: 5,
resultInArg0: true,
symEffect: SymRead,
asm: x86.AVPDPWSSDS,
reg: regInfo{
inputs: []inputInfo{
{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPDPWSSDSMasked256load",
auxType: auxSymOff,
argLen: 5,
resultInArg0: true,
symEffect: SymRead,
asm: x86.AVPDPWSSDS,
reg: regInfo{
inputs: []inputInfo{
{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPDPWSSDSMasked512load",
auxType: auxSymOff,
argLen: 5,
resultInArg0: true,
symEffect: SymRead,
asm: x86.AVPDPWSSDS,
reg: regInfo{
inputs: []inputInfo{
{3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{ {
name: "VPDPBUSD512load", name: "VPDPBUSD512load",
auxType: auxSymOff, auxType: auxSymOff,
@ -76268,47 +76081,32 @@ var opcodeTable = [...]opInfo{
generic: true, generic: true,
}, },
{ {
name: "AddDotProdPairsSaturatedInt32x4", name: "AddDotProductQuadrupleInt32x4",
argLen: 3, argLen: 3,
generic: true, generic: true,
}, },
{ {
name: "AddDotProdPairsSaturatedInt32x8", name: "AddDotProductQuadrupleInt32x8",
argLen: 3, argLen: 3,
generic: true, generic: true,
}, },
{ {
name: "AddDotProdPairsSaturatedInt32x16", name: "AddDotProductQuadrupleInt32x16",
argLen: 3, argLen: 3,
generic: true, generic: true,
}, },
{ {
name: "AddDotProdQuadrupleInt32x4", name: "AddDotProductQuadrupleSaturatedInt32x4",
argLen: 3, argLen: 3,
generic: true, generic: true,
}, },
{ {
name: "AddDotProdQuadrupleInt32x8", name: "AddDotProductQuadrupleSaturatedInt32x8",
argLen: 3, argLen: 3,
generic: true, generic: true,
}, },
{ {
name: "AddDotProdQuadrupleInt32x16", name: "AddDotProductQuadrupleSaturatedInt32x16",
argLen: 3,
generic: true,
},
{
name: "AddDotProdQuadrupleSaturatedInt32x4",
argLen: 3,
generic: true,
},
{
name: "AddDotProdQuadrupleSaturatedInt32x8",
argLen: 3,
generic: true,
},
{
name: "AddDotProdQuadrupleSaturatedInt32x16",
argLen: 3, argLen: 3,
generic: true, generic: true,
}, },
@ -77935,32 +77733,32 @@ var opcodeTable = [...]opInfo{
generic: true, generic: true,
}, },
{ {
name: "DotProdPairsInt16x8", name: "DotProductPairsInt16x8",
argLen: 2, argLen: 2,
generic: true, generic: true,
}, },
{ {
name: "DotProdPairsInt16x16", name: "DotProductPairsInt16x16",
argLen: 2, argLen: 2,
generic: true, generic: true,
}, },
{ {
name: "DotProdPairsInt16x32", name: "DotProductPairsInt16x32",
argLen: 2, argLen: 2,
generic: true, generic: true,
}, },
{ {
name: "DotProdPairsSaturatedUint8x16", name: "DotProductPairsSaturatedUint8x16",
argLen: 2, argLen: 2,
generic: true, generic: true,
}, },
{ {
name: "DotProdPairsSaturatedUint8x32", name: "DotProductPairsSaturatedUint8x32",
argLen: 2, argLen: 2,
generic: true, generic: true,
}, },
{ {
name: "DotProdPairsSaturatedUint8x64", name: "DotProductPairsSaturatedUint8x64",
argLen: 2, argLen: 2,
generic: true, generic: true,
}, },

View file

@ -949,14 +949,6 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpAMD64VPDPWSSDMasked256(v) return rewriteValueAMD64_OpAMD64VPDPWSSDMasked256(v)
case OpAMD64VPDPWSSDMasked512: case OpAMD64VPDPWSSDMasked512:
return rewriteValueAMD64_OpAMD64VPDPWSSDMasked512(v) return rewriteValueAMD64_OpAMD64VPDPWSSDMasked512(v)
case OpAMD64VPDPWSSDS512:
return rewriteValueAMD64_OpAMD64VPDPWSSDS512(v)
case OpAMD64VPDPWSSDSMasked128:
return rewriteValueAMD64_OpAMD64VPDPWSSDSMasked128(v)
case OpAMD64VPDPWSSDSMasked256:
return rewriteValueAMD64_OpAMD64VPDPWSSDSMasked256(v)
case OpAMD64VPDPWSSDSMasked512:
return rewriteValueAMD64_OpAMD64VPDPWSSDSMasked512(v)
case OpAMD64VPERMD512: case OpAMD64VPERMD512:
return rewriteValueAMD64_OpAMD64VPERMD512(v) return rewriteValueAMD64_OpAMD64VPERMD512(v)
case OpAMD64VPERMDMasked256: case OpAMD64VPERMDMasked256:
@ -1871,31 +1863,22 @@ func rewriteValueAMD64(v *Value) bool {
case OpAdd8: case OpAdd8:
v.Op = OpAMD64ADDL v.Op = OpAMD64ADDL
return true return true
case OpAddDotProdPairsSaturatedInt32x16: case OpAddDotProductQuadrupleInt32x16:
v.Op = OpAMD64VPDPWSSDS512
return true
case OpAddDotProdPairsSaturatedInt32x4:
v.Op = OpAMD64VPDPWSSDS128
return true
case OpAddDotProdPairsSaturatedInt32x8:
v.Op = OpAMD64VPDPWSSDS256
return true
case OpAddDotProdQuadrupleInt32x16:
v.Op = OpAMD64VPDPBUSD512 v.Op = OpAMD64VPDPBUSD512
return true return true
case OpAddDotProdQuadrupleInt32x4: case OpAddDotProductQuadrupleInt32x4:
v.Op = OpAMD64VPDPBUSD128 v.Op = OpAMD64VPDPBUSD128
return true return true
case OpAddDotProdQuadrupleInt32x8: case OpAddDotProductQuadrupleInt32x8:
v.Op = OpAMD64VPDPBUSD256 v.Op = OpAMD64VPDPBUSD256
return true return true
case OpAddDotProdQuadrupleSaturatedInt32x16: case OpAddDotProductQuadrupleSaturatedInt32x16:
v.Op = OpAMD64VPDPBUSDS512 v.Op = OpAMD64VPDPBUSDS512
return true return true
case OpAddDotProdQuadrupleSaturatedInt32x4: case OpAddDotProductQuadrupleSaturatedInt32x4:
v.Op = OpAMD64VPDPBUSDS128 v.Op = OpAMD64VPDPBUSDS128
return true return true
case OpAddDotProdQuadrupleSaturatedInt32x8: case OpAddDotProductQuadrupleSaturatedInt32x8:
v.Op = OpAMD64VPDPBUSDS256 v.Op = OpAMD64VPDPBUSDS256
return true return true
case OpAddFloat32x16: case OpAddFloat32x16:
@ -3064,22 +3047,22 @@ func rewriteValueAMD64(v *Value) bool {
case OpDivFloat64x8: case OpDivFloat64x8:
v.Op = OpAMD64VDIVPD512 v.Op = OpAMD64VDIVPD512
return true return true
case OpDotProdPairsInt16x16: case OpDotProductPairsInt16x16:
v.Op = OpAMD64VPMADDWD256 v.Op = OpAMD64VPMADDWD256
return true return true
case OpDotProdPairsInt16x32: case OpDotProductPairsInt16x32:
v.Op = OpAMD64VPMADDWD512 v.Op = OpAMD64VPMADDWD512
return true return true
case OpDotProdPairsInt16x8: case OpDotProductPairsInt16x8:
v.Op = OpAMD64VPMADDWD128 v.Op = OpAMD64VPMADDWD128
return true return true
case OpDotProdPairsSaturatedUint8x16: case OpDotProductPairsSaturatedUint8x16:
v.Op = OpAMD64VPMADDUBSW128 v.Op = OpAMD64VPMADDUBSW128
return true return true
case OpDotProdPairsSaturatedUint8x32: case OpDotProductPairsSaturatedUint8x32:
v.Op = OpAMD64VPMADDUBSW256 v.Op = OpAMD64VPMADDUBSW256
return true return true
case OpDotProdPairsSaturatedUint8x64: case OpDotProductPairsSaturatedUint8x64:
v.Op = OpAMD64VPMADDUBSW512 v.Op = OpAMD64VPMADDUBSW512
return true return true
case OpEq16: case OpEq16:
@ -31631,20 +31614,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool {
v.AddArg4(x, y, z, mask) v.AddArg4(x, y, z, mask)
return true return true
} }
// match: (VMOVDQU32Masked512 (VPDPWSSDS512 x y z) mask)
// result: (VPDPWSSDSMasked512 x y z mask)
for {
if v_0.Op != OpAMD64VPDPWSSDS512 {
break
}
z := v_0.Args[2]
x := v_0.Args[0]
y := v_0.Args[1]
mask := v_1
v.reset(OpAMD64VPDPWSSDSMasked512)
v.AddArg4(x, y, z, mask)
return true
}
// match: (VMOVDQU32Masked512 (VPDPBUSD512 x y z) mask) // match: (VMOVDQU32Masked512 (VPDPBUSD512 x y z) mask)
// result: (VPDPBUSDMasked512 x y z mask) // result: (VPDPBUSDMasked512 x y z mask)
for { for {
@ -36686,128 +36655,6 @@ func rewriteValueAMD64_OpAMD64VPDPWSSDMasked512(v *Value) bool {
} }
return false return false
} }
// rewriteValueAMD64_OpAMD64VPDPWSSDS512 applies the single rewrite rule for
// VPDPWSSDS512: when the third argument is a VMOVDQUload512 that may be merged
// into v, v is replaced by VPDPWSSDS512load carrying the load's symbol and
// offset. It reports whether v was rewritten.
func rewriteValueAMD64_OpAMD64VPDPWSSDS512(v *Value) bool {
	// match: (VPDPWSSDS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem))
	// cond: canMergeLoad(v, l) && clobber(l)
	// result: (VPDPWSSDS512load {sym} [off] x y ptr mem)

	// The highest-indexed argument is read first, as in the generated form.
	load := v.Args[2]
	arg1 := v.Args[1]
	arg0 := v.Args[0]
	if load.Op != OpAMD64VMOVDQUload512 {
		return false
	}
	offset := auxIntToInt32(load.AuxInt)
	symbol := auxToSym(load.Aux)
	memory := load.Args[1]
	pointer := load.Args[0]
	// clobber is only invoked once canMergeLoad has accepted the pair.
	if !canMergeLoad(v, load) {
		return false
	}
	if !clobber(load) {
		return false
	}
	v.reset(OpAMD64VPDPWSSDS512load)
	v.AuxInt = int32ToAuxInt(offset)
	v.Aux = symToAux(symbol)
	v.AddArg4(arg0, arg1, pointer, memory)
	return true
}
// rewriteValueAMD64_OpAMD64VPDPWSSDSMasked128 applies the single rewrite rule
// for VPDPWSSDSMasked128: when the third argument is a VMOVDQUload128 that may
// be merged into v, v is replaced by VPDPWSSDSMasked128load carrying the
// load's symbol and offset. It reports whether v was rewritten.
func rewriteValueAMD64_OpAMD64VPDPWSSDSMasked128(v *Value) bool {
	// match: (VPDPWSSDSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask)
	// cond: canMergeLoad(v, l) && clobber(l)
	// result: (VPDPWSSDSMasked128load {sym} [off] x y ptr mask mem)

	// The highest-indexed argument is read first, as in the generated form.
	maskArg := v.Args[3]
	load := v.Args[2]
	arg1 := v.Args[1]
	arg0 := v.Args[0]
	if load.Op != OpAMD64VMOVDQUload128 {
		return false
	}
	offset := auxIntToInt32(load.AuxInt)
	symbol := auxToSym(load.Aux)
	memory := load.Args[1]
	pointer := load.Args[0]
	// clobber is only invoked once canMergeLoad has accepted the pair.
	if !canMergeLoad(v, load) {
		return false
	}
	if !clobber(load) {
		return false
	}
	v.reset(OpAMD64VPDPWSSDSMasked128load)
	v.AuxInt = int32ToAuxInt(offset)
	v.Aux = symToAux(symbol)
	v.AddArg5(arg0, arg1, pointer, maskArg, memory)
	return true
}
// rewriteValueAMD64_OpAMD64VPDPWSSDSMasked256 applies the single rewrite rule
// for VPDPWSSDSMasked256: when the third argument is a VMOVDQUload256 that may
// be merged into v, v is replaced by VPDPWSSDSMasked256load carrying the
// load's symbol and offset. It reports whether v was rewritten.
func rewriteValueAMD64_OpAMD64VPDPWSSDSMasked256(v *Value) bool {
	// match: (VPDPWSSDSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask)
	// cond: canMergeLoad(v, l) && clobber(l)
	// result: (VPDPWSSDSMasked256load {sym} [off] x y ptr mask mem)

	// The highest-indexed argument is read first, as in the generated form.
	maskArg := v.Args[3]
	load := v.Args[2]
	arg1 := v.Args[1]
	arg0 := v.Args[0]
	if load.Op != OpAMD64VMOVDQUload256 {
		return false
	}
	offset := auxIntToInt32(load.AuxInt)
	symbol := auxToSym(load.Aux)
	memory := load.Args[1]
	pointer := load.Args[0]
	// clobber is only invoked once canMergeLoad has accepted the pair.
	if !canMergeLoad(v, load) {
		return false
	}
	if !clobber(load) {
		return false
	}
	v.reset(OpAMD64VPDPWSSDSMasked256load)
	v.AuxInt = int32ToAuxInt(offset)
	v.Aux = symToAux(symbol)
	v.AddArg5(arg0, arg1, pointer, maskArg, memory)
	return true
}
// rewriteValueAMD64_OpAMD64VPDPWSSDSMasked512 applies the single rewrite rule
// for VPDPWSSDSMasked512: when the third argument is a VMOVDQUload512 that may
// be merged into v, v is replaced by VPDPWSSDSMasked512load carrying the
// load's symbol and offset. It reports whether v was rewritten.
func rewriteValueAMD64_OpAMD64VPDPWSSDSMasked512(v *Value) bool {
	// match: (VPDPWSSDSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask)
	// cond: canMergeLoad(v, l) && clobber(l)
	// result: (VPDPWSSDSMasked512load {sym} [off] x y ptr mask mem)

	// The highest-indexed argument is read first, as in the generated form.
	maskArg := v.Args[3]
	load := v.Args[2]
	arg1 := v.Args[1]
	arg0 := v.Args[0]
	if load.Op != OpAMD64VMOVDQUload512 {
		return false
	}
	offset := auxIntToInt32(load.AuxInt)
	symbol := auxToSym(load.Aux)
	memory := load.Args[1]
	pointer := load.Args[0]
	// clobber is only invoked once canMergeLoad has accepted the pair.
	if !canMergeLoad(v, load) {
		return false
	}
	if !clobber(load) {
		return false
	}
	v.reset(OpAMD64VPDPWSSDSMasked512load)
	v.AuxInt = int32ToAuxInt(offset)
	v.Aux = symToAux(symbol)
	v.AddArg5(arg0, arg1, pointer, maskArg, memory)
	return true
}
func rewriteValueAMD64_OpAMD64VPERMD512(v *Value) bool { func rewriteValueAMD64_OpAMD64VPERMD512(v *Value) bool {
v_1 := v.Args[1] v_1 := v.Args[1]
v_0 := v.Args[0] v_0 := v.Args[0]

View file

@ -64,15 +64,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x2.Add", opLen2(ssa.OpAddUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x2.Add", opLen2(ssa.OpAddUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.AddDotProdPairsSaturated", opLen3(ssa.OpAddDotProdPairsSaturatedInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x16.AddDotProductQuadruple", opLen3_31(ssa.OpAddDotProductQuadrupleInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.AddDotProdPairsSaturated", opLen3(ssa.OpAddDotProdPairsSaturatedInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x32.AddDotProductQuadruple", opLen3_31(ssa.OpAddDotProductQuadrupleInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.AddDotProdPairsSaturated", opLen3(ssa.OpAddDotProdPairsSaturatedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x64.AddDotProductQuadruple", opLen3_31(ssa.OpAddDotProductQuadrupleInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.AddDotProdQuadruple", opLen3_31(ssa.OpAddDotProdQuadrupleInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x16.AddDotProductQuadrupleSaturated", opLen3_31(ssa.OpAddDotProductQuadrupleSaturatedInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.AddDotProdQuadruple", opLen3_31(ssa.OpAddDotProdQuadrupleInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x32.AddDotProductQuadrupleSaturated", opLen3_31(ssa.OpAddDotProductQuadrupleSaturatedInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.AddDotProdQuadruple", opLen3_31(ssa.OpAddDotProdQuadrupleInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x64.AddDotProductQuadrupleSaturated", opLen3_31(ssa.OpAddDotProductQuadrupleSaturatedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.AddDotProdQuadrupleSaturated", opLen3_31(ssa.OpAddDotProdQuadrupleSaturatedInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.AddDotProdQuadrupleSaturated", opLen3_31(ssa.OpAddDotProdQuadrupleSaturatedInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.AddDotProdQuadrupleSaturated", opLen3_31(ssa.OpAddDotProdQuadrupleSaturatedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.AddPairs", opLen2(ssa.OpAddPairsFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.AddPairs", opLen2(ssa.OpAddPairsFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64)
@ -365,12 +362,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float64x2.Div", opLen2(ssa.OpDivFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.Div", opLen2(ssa.OpDivFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.Div", opLen2(ssa.OpDivFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x4.Div", opLen2(ssa.OpDivFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.Div", opLen2(ssa.OpDivFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x8.Div", opLen2(ssa.OpDivFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.DotProdPairs", opLen2(ssa.OpDotProdPairsInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x8.DotProductPairs", opLen2(ssa.OpDotProductPairsInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.DotProdPairs", opLen2(ssa.OpDotProdPairsInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x16.DotProductPairs", opLen2(ssa.OpDotProductPairsInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.DotProdPairs", opLen2(ssa.OpDotProdPairsInt16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int16x32.DotProductPairs", opLen2(ssa.OpDotProductPairsInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x16.DotProdPairsSaturated", opLen2(ssa.OpDotProdPairsSaturatedUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x16.DotProductPairsSaturated", opLen2(ssa.OpDotProductPairsSaturatedUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x32.DotProdPairsSaturated", opLen2(ssa.OpDotProdPairsSaturatedUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x32.DotProductPairsSaturated", opLen2(ssa.OpDotProductPairsSaturatedUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.DotProdPairsSaturated", opLen2(ssa.OpDotProdPairsSaturatedUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x64.DotProductPairsSaturated", opLen2(ssa.OpDotProductPairsSaturatedUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x16.Equal", opLen2(ssa.OpEqualInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x16.Equal", opLen2(ssa.OpEqualInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Equal", opLen2(ssa.OpEqualInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x32.Equal", opLen2(ssa.OpEqualInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64)

View file

@ -1,38 +1,34 @@
!sum !sum
- go: DotProdPairs - go: DotProductPairs
commutative: false commutative: false
documentation: !string |- documentation: !string |-
// NAME multiplies the elements and adds the pairs together, // NAME multiplies the elements and adds the pairs together,
// yielding a vector of half as many elements with twice the input element size. // yielding a vector of half as many elements with twice the input element size.
# TODO: maybe simplify this name within the receiver-type + method-naming scheme we use. # TODO: maybe simplify this name within the receiver-type + method-naming scheme we use.
- go: DotProdPairsSaturated - go: DotProductPairsSaturated
commutative: false commutative: false
documentation: !string |- documentation: !string |-
// NAME multiplies the elements and adds the pairs together with saturation, // NAME multiplies the elements and adds the pairs together with saturation,
// yielding a vector of half as many elements with twice the input element size. // yielding a vector of half as many elements with twice the input element size.
# QuadDotProd, i.e. VPDPBUSD(S) are operations with src/dst on the same register, we are not supporting this as of now. # QuadDotProduct, i.e. VPDPBUSD(S) are operations with src/dst on the same register, we are not supporting this as of now.
# - go: DotProdBroadcast # - go: DotProductBroadcast
# commutative: true # commutative: true
# # documentation: !string |- # # documentation: !string |-
# // NAME multiplies all elements and broadcasts the sum. # // NAME multiplies all elements and broadcasts the sum.
- go: AddDotProdQuadruple - go: AddDotProductQuadruple
commutative: false commutative: false
documentation: !string |- documentation: !string |-
// NAME performs dot products on groups of 4 elements of x and y and then adds z. // NAME performs dot products on groups of 4 elements of x and y and then adds z.
- go: AddDotProdQuadrupleSaturated - go: AddDotProductQuadrupleSaturated
commutative: false commutative: false
documentation: !string |- documentation: !string |-
// NAME performs dot products on groups of 4 elements of x and y and then adds z. // NAME performs dot products on groups of 4 elements of x and y and then adds z.
- go: AddDotProdPairs - go: AddDotProductPairs
commutative: false commutative: false
noTypes: "true" noTypes: "true"
noGenericOps: "true" noGenericOps: "true"
documentation: !string |- documentation: !string |-
// NAME performs dot products on pairs of elements of y and z and then adds x. // NAME performs dot products on pairs of elements of y and z and then adds x.
- go: AddDotProdPairsSaturated
commutative: false
documentation: !string |-
// NAME performs dot products on pairs of elements of y and z and then adds x.
- go: MulAdd - go: MulAdd
commutative: false commutative: false
documentation: !string |- documentation: !string |-

View file

@ -1,5 +1,5 @@
!sum !sum
- go: DotProdPairs - go: DotProductPairs
asm: VPMADDWD asm: VPMADDWD
in: in:
- &int - &int
@ -10,7 +10,7 @@
- &int2 # The elemBits are different - &int2 # The elemBits are different
go: $t2 go: $t2
base: int base: int
- go: DotProdPairsSaturated - go: DotProductPairsSaturated
asm: VPMADDUBSW asm: VPMADDUBSW
in: in:
- &uint - &uint
@ -23,7 +23,7 @@
overwriteElementBits: 8 overwriteElementBits: 8
out: out:
- *int2 - *int2
# - go: DotProdBroadcast # - go: DotProductBroadcast
# asm: VDPP[SD] # asm: VDPP[SD]
# in: # in:
# - &dpb_src # - &dpb_src
@ -33,7 +33,7 @@
# const: 127 # const: 127
# out: # out:
# - *dpb_src # - *dpb_src
- go: AddDotProdQuadruple - go: AddDotProductQuadruple
asm: "VPDPBUSD" asm: "VPDPBUSD"
operandOrder: "31" # switch operand 3 and 1 operandOrder: "31" # switch operand 3 and 1
in: in:
@ -51,7 +51,7 @@
overwriteElementBits: 8 overwriteElementBits: 8
out: out:
- *qdpa_acc - *qdpa_acc
- go: AddDotProdQuadrupleSaturated - go: AddDotProductQuadrupleSaturated
asm: "VPDPBUSDS" asm: "VPDPBUSDS"
operandOrder: "31" # switch operand 3 and 1 operandOrder: "31" # switch operand 3 and 1
in: in:
@ -60,7 +60,7 @@
- *qdpa_src2 - *qdpa_src2
out: out:
- *qdpa_acc - *qdpa_acc
- go: AddDotProdPairs - go: AddDotProductPairs
asm: "VPDPWSSD" asm: "VPDPWSSD"
in: in:
- &pdpa_acc - &pdpa_acc
@ -77,14 +77,6 @@
overwriteElementBits: 16 overwriteElementBits: 16
out: out:
- *pdpa_acc - *pdpa_acc
- go: AddDotProdPairsSaturated
asm: "VPDPWSSDS"
in:
- *pdpa_acc
- *pdpa_src1
- *pdpa_src2
out:
- *pdpa_acc
- go: MulAdd - go: MulAdd
asm: "VFMADD213PS|VFMADD213PD" asm: "VFMADD213PS|VFMADD213PD"
in: in:

View file

@ -314,56 +314,39 @@ func (x Uint64x4) Add(y Uint64x4) Uint64x4
// Asm: VPADDQ, CPU Feature: AVX512 // Asm: VPADDQ, CPU Feature: AVX512
func (x Uint64x8) Add(y Uint64x8) Uint64x8 func (x Uint64x8) Add(y Uint64x8) Uint64x8
/* AddDotProdPairsSaturated */ /* AddDotProductQuadruple */
// AddDotProdPairsSaturated performs dot products on pairs of elements of y and z and then adds x. // AddDotProductQuadruple performs dot products on groups of 4 elements of x and y and then adds z.
//
// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
func (x Int32x4) AddDotProdPairsSaturated(y Int16x8, z Int16x8) Int32x4
// AddDotProdPairsSaturated performs dot products on pairs of elements of y and z and then adds x.
//
// Asm: VPDPWSSDS, CPU Feature: AVXVNNI
func (x Int32x8) AddDotProdPairsSaturated(y Int16x16, z Int16x16) Int32x8
// AddDotProdPairsSaturated performs dot products on pairs of elements of y and z and then adds x.
//
// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI
func (x Int32x16) AddDotProdPairsSaturated(y Int16x32, z Int16x32) Int32x16
/* AddDotProdQuadruple */
// AddDotProdQuadruple performs dot products on groups of 4 elements of x and y and then adds z.
// //
// Asm: VPDPBUSD, CPU Feature: AVXVNNI // Asm: VPDPBUSD, CPU Feature: AVXVNNI
func (x Int8x16) AddDotProdQuadruple(y Uint8x16, z Int32x4) Int32x4 func (x Int8x16) AddDotProductQuadruple(y Uint8x16, z Int32x4) Int32x4
// AddDotProdQuadruple performs dot products on groups of 4 elements of x and y and then adds z. // AddDotProductQuadruple performs dot products on groups of 4 elements of x and y and then adds z.
// //
// Asm: VPDPBUSD, CPU Feature: AVXVNNI // Asm: VPDPBUSD, CPU Feature: AVXVNNI
func (x Int8x32) AddDotProdQuadruple(y Uint8x32, z Int32x8) Int32x8 func (x Int8x32) AddDotProductQuadruple(y Uint8x32, z Int32x8) Int32x8
// AddDotProdQuadruple performs dot products on groups of 4 elements of x and y and then adds z. // AddDotProductQuadruple performs dot products on groups of 4 elements of x and y and then adds z.
// //
// Asm: VPDPBUSD, CPU Feature: AVX512VNNI // Asm: VPDPBUSD, CPU Feature: AVX512VNNI
func (x Int8x64) AddDotProdQuadruple(y Uint8x64, z Int32x16) Int32x16 func (x Int8x64) AddDotProductQuadruple(y Uint8x64, z Int32x16) Int32x16
/* AddDotProdQuadrupleSaturated */ /* AddDotProductQuadrupleSaturated */
// AddDotProdQuadrupleSaturated performs dot products on groups of 4 elements of x and y and then adds z. // AddDotProductQuadrupleSaturated performs dot products on groups of 4 elements of x and y and then adds z.
// //
// Asm: VPDPBUSDS, CPU Feature: AVXVNNI // Asm: VPDPBUSDS, CPU Feature: AVXVNNI
func (x Int8x16) AddDotProdQuadrupleSaturated(y Uint8x16, z Int32x4) Int32x4 func (x Int8x16) AddDotProductQuadrupleSaturated(y Uint8x16, z Int32x4) Int32x4
// AddDotProdQuadrupleSaturated performs dot products on groups of 4 elements of x and y and then adds z. // AddDotProductQuadrupleSaturated performs dot products on groups of 4 elements of x and y and then adds z.
// //
// Asm: VPDPBUSDS, CPU Feature: AVXVNNI // Asm: VPDPBUSDS, CPU Feature: AVXVNNI
func (x Int8x32) AddDotProdQuadrupleSaturated(y Uint8x32, z Int32x8) Int32x8 func (x Int8x32) AddDotProductQuadrupleSaturated(y Uint8x32, z Int32x8) Int32x8
// AddDotProdQuadrupleSaturated performs dot products on groups of 4 elements of x and y and then adds z. // AddDotProductQuadrupleSaturated performs dot products on groups of 4 elements of x and y and then adds z.
// //
// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI // Asm: VPDPBUSDS, CPU Feature: AVX512VNNI
func (x Int8x64) AddDotProdQuadrupleSaturated(y Uint8x64, z Int32x16) Int32x16 func (x Int8x64) AddDotProductQuadrupleSaturated(y Uint8x64, z Int32x16) Int32x16
/* AddPairs */ /* AddPairs */
@ -2143,45 +2126,45 @@ func (x Float64x4) Div(y Float64x4) Float64x4
// Asm: VDIVPD, CPU Feature: AVX512 // Asm: VDIVPD, CPU Feature: AVX512
func (x Float64x8) Div(y Float64x8) Float64x8 func (x Float64x8) Div(y Float64x8) Float64x8
/* DotProdPairs */ /* DotProductPairs */
// DotProdPairs multiplies the elements and add the pairs together, // DotProductPairs multiplies the elements and add the pairs together,
// yielding a vector of half as many elements with twice the input element size. // yielding a vector of half as many elements with twice the input element size.
// //
// Asm: VPMADDWD, CPU Feature: AVX // Asm: VPMADDWD, CPU Feature: AVX
func (x Int16x8) DotProdPairs(y Int16x8) Int32x4 func (x Int16x8) DotProductPairs(y Int16x8) Int32x4
// DotProdPairs multiplies the elements and add the pairs together, // DotProductPairs multiplies the elements and add the pairs together,
// yielding a vector of half as many elements with twice the input element size. // yielding a vector of half as many elements with twice the input element size.
// //
// Asm: VPMADDWD, CPU Feature: AVX2 // Asm: VPMADDWD, CPU Feature: AVX2
func (x Int16x16) DotProdPairs(y Int16x16) Int32x8 func (x Int16x16) DotProductPairs(y Int16x16) Int32x8
// DotProdPairs multiplies the elements and add the pairs together, // DotProductPairs multiplies the elements and add the pairs together,
// yielding a vector of half as many elements with twice the input element size. // yielding a vector of half as many elements with twice the input element size.
// //
// Asm: VPMADDWD, CPU Feature: AVX512 // Asm: VPMADDWD, CPU Feature: AVX512
func (x Int16x32) DotProdPairs(y Int16x32) Int32x16 func (x Int16x32) DotProductPairs(y Int16x32) Int32x16
/* DotProdPairsSaturated */ /* DotProductPairsSaturated */
// DotProdPairsSaturated multiplies the elements and add the pairs together with saturation, // DotProductPairsSaturated multiplies the elements and add the pairs together with saturation,
// yielding a vector of half as many elements with twice the input element size. // yielding a vector of half as many elements with twice the input element size.
// //
// Asm: VPMADDUBSW, CPU Feature: AVX // Asm: VPMADDUBSW, CPU Feature: AVX
func (x Uint8x16) DotProdPairsSaturated(y Int8x16) Int16x8 func (x Uint8x16) DotProductPairsSaturated(y Int8x16) Int16x8
// DotProdPairsSaturated multiplies the elements and add the pairs together with saturation, // DotProductPairsSaturated multiplies the elements and add the pairs together with saturation,
// yielding a vector of half as many elements with twice the input element size. // yielding a vector of half as many elements with twice the input element size.
// //
// Asm: VPMADDUBSW, CPU Feature: AVX2 // Asm: VPMADDUBSW, CPU Feature: AVX2
func (x Uint8x32) DotProdPairsSaturated(y Int8x32) Int16x16 func (x Uint8x32) DotProductPairsSaturated(y Int8x32) Int16x16
// DotProdPairsSaturated multiplies the elements and add the pairs together with saturation, // DotProductPairsSaturated multiplies the elements and add the pairs together with saturation,
// yielding a vector of half as many elements with twice the input element size. // yielding a vector of half as many elements with twice the input element size.
// //
// Asm: VPMADDUBSW, CPU Feature: AVX512 // Asm: VPMADDUBSW, CPU Feature: AVX512
func (x Uint8x64) DotProdPairsSaturated(y Int8x64) Int16x32 func (x Uint8x64) DotProductPairsSaturated(y Int8x64) Int16x32
/* Equal */ /* Equal */