From 416332dba285e45d57899eac73eb161cb2cd6bf4 Mon Sep 17 00:00:00 2001 From: Junyang Shao Date: Fri, 10 Oct 2025 19:18:01 +0000 Subject: [PATCH] [dev.simd] cmd/compile, simd: update DotProd to DotProduct API naming changes. This CL also removes AddDotProductPairsSaturated. Change-Id: I02e6d45268704f3ed4eaf62f0ecb7dc936b42124 Reviewed-on: https://go-review.googlesource.com/c/go/+/710935 LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase --- src/cmd/compile/internal/amd64/simdssa.go | 13 - .../compile/internal/ssa/_gen/simdAMD64.rules | 32 +-- .../compile/internal/ssa/_gen/simdAMD64ops.go | 10 - .../internal/ssa/_gen/simdgenericOps.go | 27 +- src/cmd/compile/internal/ssa/opGen.go | 250 ++---------------- src/cmd/compile/internal/ssa/rewriteAMD64.go | 177 +------------ .../compile/internal/ssagen/simdintrinsics.go | 27 +- .../_gen/simdgen/ops/MLOps/categories.yaml | 18 +- src/simd/_gen/simdgen/ops/MLOps/go.yaml | 20 +- src/simd/ops_amd64.go | 73 ++--- 10 files changed, 113 insertions(+), 534 deletions(-) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index de9cad8a478..fe2ae019acd 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -1142,9 +1142,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { case ssa.OpAMD64VPDPWSSD128, ssa.OpAMD64VPDPWSSD256, ssa.OpAMD64VPDPWSSD512, - ssa.OpAMD64VPDPWSSDS128, - ssa.OpAMD64VPDPWSSDS256, - ssa.OpAMD64VPDPWSSDS512, ssa.OpAMD64VPDPBUSD128, ssa.OpAMD64VPDPBUSD256, ssa.OpAMD64VPDPBUSD512, @@ -1210,9 +1207,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { case ssa.OpAMD64VPDPWSSDMasked128, ssa.OpAMD64VPDPWSSDMasked256, ssa.OpAMD64VPDPWSSDMasked512, - ssa.OpAMD64VPDPWSSDSMasked128, - ssa.OpAMD64VPDPWSSDSMasked256, - ssa.OpAMD64VPDPWSSDSMasked512, ssa.OpAMD64VPDPBUSDMasked128, ssa.OpAMD64VPDPBUSDMasked256, ssa.OpAMD64VPDPBUSDMasked512, @@ -1500,7 +1494,6 @@ func ssaGenSIMDValue(s *ssagen.State, v 
*ssa.Value) bool { p = simdV21load(s, v) case ssa.OpAMD64VPDPWSSD512load, - ssa.OpAMD64VPDPWSSDS512load, ssa.OpAMD64VPDPBUSD512load, ssa.OpAMD64VPDPBUSDS512load, ssa.OpAMD64VFMADD213PS128load, @@ -1550,9 +1543,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { case ssa.OpAMD64VPDPWSSDMasked128load, ssa.OpAMD64VPDPWSSDMasked256load, ssa.OpAMD64VPDPWSSDMasked512load, - ssa.OpAMD64VPDPWSSDSMasked128load, - ssa.OpAMD64VPDPWSSDSMasked256load, - ssa.OpAMD64VPDPWSSDSMasked512load, ssa.OpAMD64VPDPBUSDMasked128load, ssa.OpAMD64VPDPBUSDMasked256load, ssa.OpAMD64VPDPBUSDMasked512load, @@ -1971,9 +1961,6 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPDPWSSDMasked128, ssa.OpAMD64VPDPWSSDMasked256, ssa.OpAMD64VPDPWSSDMasked512, - ssa.OpAMD64VPDPWSSDSMasked128, - ssa.OpAMD64VPDPWSSDSMasked256, - ssa.OpAMD64VPDPWSSDSMasked512, ssa.OpAMD64VPDPBUSDMasked128, ssa.OpAMD64VPDPBUSDMasked256, ssa.OpAMD64VPDPBUSDMasked512, diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index d9229e958ad..9e34d4b8816 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -52,15 +52,12 @@ (AddUint64x2 ...) => (VPADDQ128 ...) (AddUint64x4 ...) => (VPADDQ256 ...) (AddUint64x8 ...) => (VPADDQ512 ...) -(AddDotProdPairsSaturatedInt32x4 ...) => (VPDPWSSDS128 ...) -(AddDotProdPairsSaturatedInt32x8 ...) => (VPDPWSSDS256 ...) -(AddDotProdPairsSaturatedInt32x16 ...) => (VPDPWSSDS512 ...) -(AddDotProdQuadrupleInt32x4 ...) => (VPDPBUSD128 ...) -(AddDotProdQuadrupleInt32x8 ...) => (VPDPBUSD256 ...) -(AddDotProdQuadrupleInt32x16 ...) => (VPDPBUSD512 ...) -(AddDotProdQuadrupleSaturatedInt32x4 ...) => (VPDPBUSDS128 ...) -(AddDotProdQuadrupleSaturatedInt32x8 ...) => (VPDPBUSDS256 ...) -(AddDotProdQuadrupleSaturatedInt32x16 ...) => (VPDPBUSDS512 ...) +(AddDotProductQuadrupleInt32x4 ...) => (VPDPBUSD128 ...) +(AddDotProductQuadrupleInt32x8 ...) 
=> (VPDPBUSD256 ...) +(AddDotProductQuadrupleInt32x16 ...) => (VPDPBUSD512 ...) +(AddDotProductQuadrupleSaturatedInt32x4 ...) => (VPDPBUSDS128 ...) +(AddDotProductQuadrupleSaturatedInt32x8 ...) => (VPDPBUSDS256 ...) +(AddDotProductQuadrupleSaturatedInt32x16 ...) => (VPDPBUSDS512 ...) (AddPairsFloat32x4 ...) => (VHADDPS128 ...) (AddPairsFloat32x8 ...) => (VHADDPS256 ...) (AddPairsFloat64x2 ...) => (VHADDPD128 ...) @@ -353,12 +350,12 @@ (DivFloat64x2 ...) => (VDIVPD128 ...) (DivFloat64x4 ...) => (VDIVPD256 ...) (DivFloat64x8 ...) => (VDIVPD512 ...) -(DotProdPairsInt16x8 ...) => (VPMADDWD128 ...) -(DotProdPairsInt16x16 ...) => (VPMADDWD256 ...) -(DotProdPairsInt16x32 ...) => (VPMADDWD512 ...) -(DotProdPairsSaturatedUint8x16 ...) => (VPMADDUBSW128 ...) -(DotProdPairsSaturatedUint8x32 ...) => (VPMADDUBSW256 ...) -(DotProdPairsSaturatedUint8x64 ...) => (VPMADDUBSW512 ...) +(DotProductPairsInt16x8 ...) => (VPMADDWD128 ...) +(DotProductPairsInt16x16 ...) => (VPMADDWD256 ...) +(DotProductPairsInt16x32 ...) => (VPMADDWD512 ...) +(DotProductPairsSaturatedUint8x16 ...) => (VPMADDUBSW128 ...) +(DotProductPairsSaturatedUint8x32 ...) => (VPMADDUBSW256 ...) +(DotProductPairsSaturatedUint8x64 ...) => (VPMADDUBSW512 ...) 
(EqualFloat32x4 x y) => (VCMPPS128 [0] x y) (EqualFloat32x8 x y) => (VCMPPS256 [0] x y) (EqualFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [0] x y)) @@ -1328,7 +1325,6 @@ (VMOVDQU32Masked512 (VPABSD512 x) mask) => (VPABSDMasked512 x mask) (VMOVDQU64Masked512 (VPABSQ512 x) mask) => (VPABSQMasked512 x mask) (VMOVDQU32Masked512 (VPDPWSSD512 x y z) mask) => (VPDPWSSDMasked512 x y z mask) -(VMOVDQU32Masked512 (VPDPWSSDS512 x y z) mask) => (VPDPWSSDSMasked512 x y z mask) (VMOVDQU32Masked512 (VPDPBUSD512 x y z) mask) => (VPDPBUSDMasked512 x y z mask) (VMOVDQU32Masked512 (VPDPBUSDS512 x y z) mask) => (VPDPBUSDSMasked512 x y z mask) (VMOVDQU32Masked512 (VADDPS512 x y) mask) => (VADDPSMasked512 x y mask) @@ -1521,10 +1517,6 @@ (VPDPWSSDMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDMasked128load {sym} [off] x y ptr mask mem) (VPDPWSSDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDMasked256load {sym} [off] x y ptr mask mem) (VPDPWSSDMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDMasked512load {sym} [off] x y ptr mask mem) -(VPDPWSSDS512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDS512load {sym} [off] x y ptr mem) -(VPDPWSSDSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDSMasked128load {sym} [off] x y ptr mask mem) -(VPDPWSSDSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDSMasked256load {sym} [off] x y ptr mask mem) -(VPDPWSSDSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPWSSDSMasked512load {sym} [off] x y ptr mask mem) (VPDPBUSD512 x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSD512load {sym} [off] x y ptr mem) (VPDPBUSDMasked128 x 
y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDMasked128load {sym} [off] x y ptr mask mem) (VPDPBUSDMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPDPBUSDMasked256load {sym} [off] x y ptr mask mem) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index 680c576bb14..2cdf80c1ba1 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -368,12 +368,6 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPDPWSSDMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPDPWSSDMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPDPWSSDMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSD", commutative: false, typ: "Vec512", resultInArg0: true}, - {name: "VPDPWSSDS128", argLength: 3, reg: v31, asm: "VPDPWSSDS", commutative: false, typ: "Vec128", resultInArg0: true}, - {name: "VPDPWSSDS256", argLength: 3, reg: v31, asm: "VPDPWSSDS", commutative: false, typ: "Vec256", resultInArg0: true}, - {name: "VPDPWSSDS512", argLength: 3, reg: w31, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", resultInArg0: true}, - {name: "VPDPWSSDSMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec128", resultInArg0: true}, - {name: "VPDPWSSDSMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec256", resultInArg0: true}, - {name: "VPDPWSSDSMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, 
typ: "Vec512", resultInArg0: false}, {name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false}, @@ -1346,10 +1340,6 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPDPWSSDMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPWSSD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPWSSDMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPWSSD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPWSSDMasked512load", argLength: 5, reg: w3kwload, asm: "VPDPWSSD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true}, - {name: "VPDPWSSDS512load", argLength: 4, reg: w31load, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true}, - {name: "VPDPWSSDSMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPWSSDS", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true}, - {name: "VPDPWSSDSMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPWSSDS", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true}, - {name: "VPDPWSSDSMasked512load", argLength: 5, reg: w3kwload, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPBUSD512load", argLength: 4, reg: w31load, asm: "VPDPBUSD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPBUSDMasked128load", argLength: 5, reg: w3kwload, asm: "VPDPBUSD", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: true}, {name: "VPDPBUSDMasked256load", argLength: 5, reg: w3kwload, asm: "VPDPBUSD", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: true}, diff --git 
a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 2e9f3ff1c49..f5eb9075d71 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -25,15 +25,12 @@ func simdGenericOps() []opData { {name: "AbsInt64x2", argLength: 1, commutative: false}, {name: "AbsInt64x4", argLength: 1, commutative: false}, {name: "AbsInt64x8", argLength: 1, commutative: false}, - {name: "AddDotProdPairsSaturatedInt32x4", argLength: 3, commutative: false}, - {name: "AddDotProdPairsSaturatedInt32x8", argLength: 3, commutative: false}, - {name: "AddDotProdPairsSaturatedInt32x16", argLength: 3, commutative: false}, - {name: "AddDotProdQuadrupleInt32x4", argLength: 3, commutative: false}, - {name: "AddDotProdQuadrupleInt32x8", argLength: 3, commutative: false}, - {name: "AddDotProdQuadrupleInt32x16", argLength: 3, commutative: false}, - {name: "AddDotProdQuadrupleSaturatedInt32x4", argLength: 3, commutative: false}, - {name: "AddDotProdQuadrupleSaturatedInt32x8", argLength: 3, commutative: false}, - {name: "AddDotProdQuadrupleSaturatedInt32x16", argLength: 3, commutative: false}, + {name: "AddDotProductQuadrupleInt32x4", argLength: 3, commutative: false}, + {name: "AddDotProductQuadrupleInt32x8", argLength: 3, commutative: false}, + {name: "AddDotProductQuadrupleInt32x16", argLength: 3, commutative: false}, + {name: "AddDotProductQuadrupleSaturatedInt32x4", argLength: 3, commutative: false}, + {name: "AddDotProductQuadrupleSaturatedInt32x8", argLength: 3, commutative: false}, + {name: "AddDotProductQuadrupleSaturatedInt32x16", argLength: 3, commutative: false}, {name: "AddFloat32x4", argLength: 2, commutative: true}, {name: "AddFloat32x8", argLength: 2, commutative: true}, {name: "AddFloat32x16", argLength: 2, commutative: true}, @@ -344,12 +341,12 @@ func simdGenericOps() []opData { {name: "DivFloat64x2", argLength: 2, commutative: false}, {name: "DivFloat64x4", 
argLength: 2, commutative: false}, {name: "DivFloat64x8", argLength: 2, commutative: false}, - {name: "DotProdPairsInt16x8", argLength: 2, commutative: false}, - {name: "DotProdPairsInt16x16", argLength: 2, commutative: false}, - {name: "DotProdPairsInt16x32", argLength: 2, commutative: false}, - {name: "DotProdPairsSaturatedUint8x16", argLength: 2, commutative: false}, - {name: "DotProdPairsSaturatedUint8x32", argLength: 2, commutative: false}, - {name: "DotProdPairsSaturatedUint8x64", argLength: 2, commutative: false}, + {name: "DotProductPairsInt16x8", argLength: 2, commutative: false}, + {name: "DotProductPairsInt16x16", argLength: 2, commutative: false}, + {name: "DotProductPairsInt16x32", argLength: 2, commutative: false}, + {name: "DotProductPairsSaturatedUint8x16", argLength: 2, commutative: false}, + {name: "DotProductPairsSaturatedUint8x32", argLength: 2, commutative: false}, + {name: "DotProductPairsSaturatedUint8x64", argLength: 2, commutative: false}, {name: "EqualFloat32x4", argLength: 2, commutative: true}, {name: "EqualFloat32x8", argLength: 2, commutative: true}, {name: "EqualFloat32x16", argLength: 2, commutative: true}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 30831e828a8..6dd7082e100 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1608,12 +1608,6 @@ const ( OpAMD64VPDPWSSDMasked128 OpAMD64VPDPWSSDMasked256 OpAMD64VPDPWSSDMasked512 - OpAMD64VPDPWSSDS128 - OpAMD64VPDPWSSDS256 - OpAMD64VPDPWSSDS512 - OpAMD64VPDPWSSDSMasked128 - OpAMD64VPDPWSSDSMasked256 - OpAMD64VPDPWSSDSMasked512 OpAMD64VPERMB256 OpAMD64VPERMB512 OpAMD64VPERMBMasked256 @@ -2586,10 +2580,6 @@ const ( OpAMD64VPDPWSSDMasked128load OpAMD64VPDPWSSDMasked256load OpAMD64VPDPWSSDMasked512load - OpAMD64VPDPWSSDS512load - OpAMD64VPDPWSSDSMasked128load - OpAMD64VPDPWSSDSMasked256load - OpAMD64VPDPWSSDSMasked512load OpAMD64VPDPBUSD512load OpAMD64VPDPBUSDMasked128load 
OpAMD64VPDPBUSDMasked256load @@ -5416,15 +5406,12 @@ const ( OpAbsInt64x2 OpAbsInt64x4 OpAbsInt64x8 - OpAddDotProdPairsSaturatedInt32x4 - OpAddDotProdPairsSaturatedInt32x8 - OpAddDotProdPairsSaturatedInt32x16 - OpAddDotProdQuadrupleInt32x4 - OpAddDotProdQuadrupleInt32x8 - OpAddDotProdQuadrupleInt32x16 - OpAddDotProdQuadrupleSaturatedInt32x4 - OpAddDotProdQuadrupleSaturatedInt32x8 - OpAddDotProdQuadrupleSaturatedInt32x16 + OpAddDotProductQuadrupleInt32x4 + OpAddDotProductQuadrupleInt32x8 + OpAddDotProductQuadrupleInt32x16 + OpAddDotProductQuadrupleSaturatedInt32x4 + OpAddDotProductQuadrupleSaturatedInt32x8 + OpAddDotProductQuadrupleSaturatedInt32x16 OpAddFloat32x4 OpAddFloat32x8 OpAddFloat32x16 @@ -5735,12 +5722,12 @@ const ( OpDivFloat64x2 OpDivFloat64x4 OpDivFloat64x8 - OpDotProdPairsInt16x8 - OpDotProdPairsInt16x16 - OpDotProdPairsInt16x32 - OpDotProdPairsSaturatedUint8x16 - OpDotProdPairsSaturatedUint8x32 - OpDotProdPairsSaturatedUint8x64 + OpDotProductPairsInt16x8 + OpDotProductPairsInt16x16 + OpDotProductPairsInt16x32 + OpDotProductPairsSaturatedUint8x16 + OpDotProductPairsSaturatedUint8x32 + OpDotProductPairsSaturatedUint8x64 OpEqualFloat32x4 OpEqualFloat32x8 OpEqualFloat32x16 @@ -25338,105 +25325,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VPDPWSSDS128", - argLen: 3, - resultInArg0: true, - asm: x86.AVPDPWSSDS, - reg: regInfo{ - inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 - {2, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, - { - name: "VPDPWSSDS256", - argLen: 3, - resultInArg0: true, - asm: x86.AVPDPWSSDS, - reg: regInfo{ - inputs: []inputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 
X14 X15 - {2, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 - }, - outputs: []outputInfo{ - {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 - }, - }, - }, - { - name: "VPDPWSSDS512", - argLen: 3, - resultInArg0: true, - asm: x86.AVPDPWSSDS, - reg: regInfo{ - inputs: []inputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPWSSDSMasked128", - argLen: 4, - resultInArg0: true, - asm: x86.AVPDPWSSDS, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPWSSDSMasked256", - argLen: 4, - resultInArg0: true, - asm: x86.AVPDPWSSDS, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 
X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPWSSDSMasked512", - argLen: 4, - resultInArg0: true, - asm: x86.AVPDPWSSDS, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, { name: "VPERMB256", argLen: 2, @@ -39773,81 +39661,6 @@ var opcodeTable = [...]opInfo{ }, }, }, - { - name: "VPDPWSSDS512load", - auxType: auxSymOff, - argLen: 4, - resultInArg0: true, - symEffect: SymRead, - asm: x86.AVPDPWSSDS, - reg: regInfo{ - inputs: []inputInfo{ - {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: 
[]outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPWSSDSMasked128load", - auxType: auxSymOff, - argLen: 5, - resultInArg0: true, - symEffect: SymRead, - asm: x86.AVPDPWSSDS, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPWSSDSMasked256load", - auxType: auxSymOff, - argLen: 5, - resultInArg0: true, - symEffect: SymRead, - asm: x86.AVPDPWSSDS, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, - { - name: "VPDPWSSDSMasked512load", - auxType: auxSymOff, - argLen: 5, - resultInArg0: true, - symEffect: SymRead, - asm: x86.AVPDPWSSDS, - reg: regInfo{ - inputs: []inputInfo{ - {3, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7 - {2, 72057594037977087}, // AX CX 
DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - outputs: []outputInfo{ - {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 - }, - }, - }, { name: "VPDPBUSD512load", auxType: auxSymOff, @@ -76268,47 +76081,32 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "AddDotProdPairsSaturatedInt32x4", + name: "AddDotProductQuadrupleInt32x4", argLen: 3, generic: true, }, { - name: "AddDotProdPairsSaturatedInt32x8", + name: "AddDotProductQuadrupleInt32x8", argLen: 3, generic: true, }, { - name: "AddDotProdPairsSaturatedInt32x16", + name: "AddDotProductQuadrupleInt32x16", argLen: 3, generic: true, }, { - name: "AddDotProdQuadrupleInt32x4", + name: "AddDotProductQuadrupleSaturatedInt32x4", argLen: 3, generic: true, }, { - name: "AddDotProdQuadrupleInt32x8", + name: "AddDotProductQuadrupleSaturatedInt32x8", argLen: 3, generic: true, }, { - name: "AddDotProdQuadrupleInt32x16", - argLen: 3, - generic: true, - }, - { - name: "AddDotProdQuadrupleSaturatedInt32x4", - argLen: 3, - generic: true, - }, - { - name: "AddDotProdQuadrupleSaturatedInt32x8", - argLen: 3, - generic: true, - }, - { - name: "AddDotProdQuadrupleSaturatedInt32x16", + name: "AddDotProductQuadrupleSaturatedInt32x16", argLen: 3, generic: true, }, @@ -77935,32 +77733,32 @@ var opcodeTable = [...]opInfo{ generic: true, }, { - name: "DotProdPairsInt16x8", + name: "DotProductPairsInt16x8", argLen: 2, generic: true, }, { - name: "DotProdPairsInt16x16", + name: "DotProductPairsInt16x16", argLen: 2, generic: true, }, { - name: "DotProdPairsInt16x32", + name: "DotProductPairsInt16x32", argLen: 2, generic: true, }, { - name: 
"DotProdPairsSaturatedUint8x16", + name: "DotProductPairsSaturatedUint8x16", argLen: 2, generic: true, }, { - name: "DotProdPairsSaturatedUint8x32", + name: "DotProductPairsSaturatedUint8x32", argLen: 2, generic: true, }, { - name: "DotProdPairsSaturatedUint8x64", + name: "DotProductPairsSaturatedUint8x64", argLen: 2, generic: true, }, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 908fd71b783..42814029144 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -949,14 +949,6 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAMD64VPDPWSSDMasked256(v) case OpAMD64VPDPWSSDMasked512: return rewriteValueAMD64_OpAMD64VPDPWSSDMasked512(v) - case OpAMD64VPDPWSSDS512: - return rewriteValueAMD64_OpAMD64VPDPWSSDS512(v) - case OpAMD64VPDPWSSDSMasked128: - return rewriteValueAMD64_OpAMD64VPDPWSSDSMasked128(v) - case OpAMD64VPDPWSSDSMasked256: - return rewriteValueAMD64_OpAMD64VPDPWSSDSMasked256(v) - case OpAMD64VPDPWSSDSMasked512: - return rewriteValueAMD64_OpAMD64VPDPWSSDSMasked512(v) case OpAMD64VPERMD512: return rewriteValueAMD64_OpAMD64VPERMD512(v) case OpAMD64VPERMDMasked256: @@ -1871,31 +1863,22 @@ func rewriteValueAMD64(v *Value) bool { case OpAdd8: v.Op = OpAMD64ADDL return true - case OpAddDotProdPairsSaturatedInt32x16: - v.Op = OpAMD64VPDPWSSDS512 - return true - case OpAddDotProdPairsSaturatedInt32x4: - v.Op = OpAMD64VPDPWSSDS128 - return true - case OpAddDotProdPairsSaturatedInt32x8: - v.Op = OpAMD64VPDPWSSDS256 - return true - case OpAddDotProdQuadrupleInt32x16: + case OpAddDotProductQuadrupleInt32x16: v.Op = OpAMD64VPDPBUSD512 return true - case OpAddDotProdQuadrupleInt32x4: + case OpAddDotProductQuadrupleInt32x4: v.Op = OpAMD64VPDPBUSD128 return true - case OpAddDotProdQuadrupleInt32x8: + case OpAddDotProductQuadrupleInt32x8: v.Op = OpAMD64VPDPBUSD256 return true - case OpAddDotProdQuadrupleSaturatedInt32x16: + case 
OpAddDotProductQuadrupleSaturatedInt32x16: v.Op = OpAMD64VPDPBUSDS512 return true - case OpAddDotProdQuadrupleSaturatedInt32x4: + case OpAddDotProductQuadrupleSaturatedInt32x4: v.Op = OpAMD64VPDPBUSDS128 return true - case OpAddDotProdQuadrupleSaturatedInt32x8: + case OpAddDotProductQuadrupleSaturatedInt32x8: v.Op = OpAMD64VPDPBUSDS256 return true case OpAddFloat32x16: @@ -3064,22 +3047,22 @@ func rewriteValueAMD64(v *Value) bool { case OpDivFloat64x8: v.Op = OpAMD64VDIVPD512 return true - case OpDotProdPairsInt16x16: + case OpDotProductPairsInt16x16: v.Op = OpAMD64VPMADDWD256 return true - case OpDotProdPairsInt16x32: + case OpDotProductPairsInt16x32: v.Op = OpAMD64VPMADDWD512 return true - case OpDotProdPairsInt16x8: + case OpDotProductPairsInt16x8: v.Op = OpAMD64VPMADDWD128 return true - case OpDotProdPairsSaturatedUint8x16: + case OpDotProductPairsSaturatedUint8x16: v.Op = OpAMD64VPMADDUBSW128 return true - case OpDotProdPairsSaturatedUint8x32: + case OpDotProductPairsSaturatedUint8x32: v.Op = OpAMD64VPMADDUBSW256 return true - case OpDotProdPairsSaturatedUint8x64: + case OpDotProductPairsSaturatedUint8x64: v.Op = OpAMD64VPMADDUBSW512 return true case OpEq16: @@ -31631,20 +31614,6 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool { v.AddArg4(x, y, z, mask) return true } - // match: (VMOVDQU32Masked512 (VPDPWSSDS512 x y z) mask) - // result: (VPDPWSSDSMasked512 x y z mask) - for { - if v_0.Op != OpAMD64VPDPWSSDS512 { - break - } - z := v_0.Args[2] - x := v_0.Args[0] - y := v_0.Args[1] - mask := v_1 - v.reset(OpAMD64VPDPWSSDSMasked512) - v.AddArg4(x, y, z, mask) - return true - } // match: (VMOVDQU32Masked512 (VPDPBUSD512 x y z) mask) // result: (VPDPBUSDMasked512 x y z mask) for { @@ -36686,128 +36655,6 @@ func rewriteValueAMD64_OpAMD64VPDPWSSDMasked512(v *Value) bool { } return false } -func rewriteValueAMD64_OpAMD64VPDPWSSDS512(v *Value) bool { - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - // match: (VPDPWSSDS512 x y 
l:(VMOVDQUload512 {sym} [off] ptr mem)) - // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPDPWSSDS512load {sym} [off] x y ptr mem) - for { - x := v_0 - y := v_1 - l := v_2 - if l.Op != OpAMD64VMOVDQUload512 { - break - } - off := auxIntToInt32(l.AuxInt) - sym := auxToSym(l.Aux) - mem := l.Args[1] - ptr := l.Args[0] - if !(canMergeLoad(v, l) && clobber(l)) { - break - } - v.reset(OpAMD64VPDPWSSDS512load) - v.AuxInt = int32ToAuxInt(off) - v.Aux = symToAux(sym) - v.AddArg4(x, y, ptr, mem) - return true - } - return false -} -func rewriteValueAMD64_OpAMD64VPDPWSSDSMasked128(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - // match: (VPDPWSSDSMasked128 x y l:(VMOVDQUload128 {sym} [off] ptr mem) mask) - // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPDPWSSDSMasked128load {sym} [off] x y ptr mask mem) - for { - x := v_0 - y := v_1 - l := v_2 - if l.Op != OpAMD64VMOVDQUload128 { - break - } - off := auxIntToInt32(l.AuxInt) - sym := auxToSym(l.Aux) - mem := l.Args[1] - ptr := l.Args[0] - mask := v_3 - if !(canMergeLoad(v, l) && clobber(l)) { - break - } - v.reset(OpAMD64VPDPWSSDSMasked128load) - v.AuxInt = int32ToAuxInt(off) - v.Aux = symToAux(sym) - v.AddArg5(x, y, ptr, mask, mem) - return true - } - return false -} -func rewriteValueAMD64_OpAMD64VPDPWSSDSMasked256(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - // match: (VPDPWSSDSMasked256 x y l:(VMOVDQUload256 {sym} [off] ptr mem) mask) - // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPDPWSSDSMasked256load {sym} [off] x y ptr mask mem) - for { - x := v_0 - y := v_1 - l := v_2 - if l.Op != OpAMD64VMOVDQUload256 { - break - } - off := auxIntToInt32(l.AuxInt) - sym := auxToSym(l.Aux) - mem := l.Args[1] - ptr := l.Args[0] - mask := v_3 - if !(canMergeLoad(v, l) && clobber(l)) { - break - } - v.reset(OpAMD64VPDPWSSDSMasked256load) - v.AuxInt = int32ToAuxInt(off) - v.Aux = symToAux(sym) - v.AddArg5(x, y, 
ptr, mask, mem) - return true - } - return false -} -func rewriteValueAMD64_OpAMD64VPDPWSSDSMasked512(v *Value) bool { - v_3 := v.Args[3] - v_2 := v.Args[2] - v_1 := v.Args[1] - v_0 := v.Args[0] - // match: (VPDPWSSDSMasked512 x y l:(VMOVDQUload512 {sym} [off] ptr mem) mask) - // cond: canMergeLoad(v, l) && clobber(l) - // result: (VPDPWSSDSMasked512load {sym} [off] x y ptr mask mem) - for { - x := v_0 - y := v_1 - l := v_2 - if l.Op != OpAMD64VMOVDQUload512 { - break - } - off := auxIntToInt32(l.AuxInt) - sym := auxToSym(l.Aux) - mem := l.Args[1] - ptr := l.Args[0] - mask := v_3 - if !(canMergeLoad(v, l) && clobber(l)) { - break - } - v.reset(OpAMD64VPDPWSSDSMasked512load) - v.AuxInt = int32ToAuxInt(off) - v.Aux = symToAux(sym) - v.AddArg5(x, y, ptr, mask, mem) - return true - } - return false -} func rewriteValueAMD64_OpAMD64VPERMD512(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 47be7d67a41..d4fb524b247 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -64,15 +64,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
addF(simdPackage, "Uint64x2.Add", opLen2(ssa.OpAddUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int32x4.AddDotProdPairsSaturated", opLen3(ssa.OpAddDotProdPairsSaturatedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int32x8.AddDotProdPairsSaturated", opLen3(ssa.OpAddDotProdPairsSaturatedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int32x16.AddDotProdPairsSaturated", opLen3(ssa.OpAddDotProdPairsSaturatedInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.AddDotProdQuadruple", opLen3_31(ssa.OpAddDotProdQuadrupleInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.AddDotProdQuadruple", opLen3_31(ssa.OpAddDotProdQuadrupleInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.AddDotProdQuadruple", opLen3_31(ssa.OpAddDotProdQuadrupleInt32x16, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int8x16.AddDotProdQuadrupleSaturated", opLen3_31(ssa.OpAddDotProdQuadrupleSaturatedInt32x4, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int8x32.AddDotProdQuadrupleSaturated", opLen3_31(ssa.OpAddDotProdQuadrupleSaturatedInt32x8, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int8x64.AddDotProdQuadrupleSaturated", opLen3_31(ssa.OpAddDotProdQuadrupleSaturatedInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.AddDotProductQuadruple", opLen3_31(ssa.OpAddDotProductQuadrupleInt32x4, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.AddDotProductQuadruple", opLen3_31(ssa.OpAddDotProductQuadrupleInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.AddDotProductQuadruple", opLen3_31(ssa.OpAddDotProductQuadrupleInt32x16, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int8x16.AddDotProductQuadrupleSaturated", opLen3_31(ssa.OpAddDotProductQuadrupleSaturatedInt32x4, 
types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int8x32.AddDotProductQuadrupleSaturated", opLen3_31(ssa.OpAddDotProductQuadrupleSaturatedInt32x8, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Int8x64.AddDotProductQuadrupleSaturated", opLen3_31(ssa.OpAddDotProductQuadrupleSaturatedInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x8.AddPairs", opLen2(ssa.OpAddPairsFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64) @@ -365,12 +362,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float64x2.Div", opLen2(ssa.OpDivFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Div", opLen2(ssa.OpDivFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.Div", opLen2(ssa.OpDivFloat64x8, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Int16x8.DotProdPairs", opLen2(ssa.OpDotProdPairsInt16x8, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Int16x16.DotProdPairs", opLen2(ssa.OpDotProdPairsInt16x16, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Int16x32.DotProdPairs", opLen2(ssa.OpDotProdPairsInt16x32, types.TypeVec512), sys.AMD64) - addF(simdPackage, "Uint8x16.DotProdPairsSaturated", opLen2(ssa.OpDotProdPairsSaturatedUint8x16, types.TypeVec128), sys.AMD64) - addF(simdPackage, "Uint8x32.DotProdPairsSaturated", opLen2(ssa.OpDotProdPairsSaturatedUint8x32, types.TypeVec256), sys.AMD64) - addF(simdPackage, "Uint8x64.DotProdPairsSaturated", opLen2(ssa.OpDotProdPairsSaturatedUint8x64, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int16x8.DotProductPairs", opLen2(ssa.OpDotProductPairsInt16x8, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Int16x16.DotProductPairs", opLen2(ssa.OpDotProductPairsInt16x16, types.TypeVec256), sys.AMD64) + addF(simdPackage, 
"Int16x32.DotProductPairs", opLen2(ssa.OpDotProductPairsInt16x32, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Uint8x16.DotProductPairsSaturated", opLen2(ssa.OpDotProductPairsSaturatedUint8x16, types.TypeVec128), sys.AMD64) + addF(simdPackage, "Uint8x32.DotProductPairsSaturated", opLen2(ssa.OpDotProductPairsSaturatedUint8x32, types.TypeVec256), sys.AMD64) + addF(simdPackage, "Uint8x64.DotProductPairsSaturated", opLen2(ssa.OpDotProductPairsSaturatedUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Int8x16.Equal", opLen2(ssa.OpEqualInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x32.Equal", opLen2(ssa.OpEqualInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64) diff --git a/src/simd/_gen/simdgen/ops/MLOps/categories.yaml b/src/simd/_gen/simdgen/ops/MLOps/categories.yaml index 772a7b3cf67..0317b42c6af 100644 --- a/src/simd/_gen/simdgen/ops/MLOps/categories.yaml +++ b/src/simd/_gen/simdgen/ops/MLOps/categories.yaml @@ -1,38 +1,34 @@ !sum -- go: DotProdPairs +- go: DotProductPairs commutative: false documentation: !string |- // NAME multiplies the elements and add the pairs together, // yielding a vector of half as many elements with twice the input element size. # TODO: maybe simplify this name within the receiver-type + method-naming scheme we use. -- go: DotProdPairsSaturated +- go: DotProductPairsSaturated commutative: false documentation: !string |- // NAME multiplies the elements and add the pairs together with saturation, // yielding a vector of half as many elements with twice the input element size. -# QuadDotProd, i.e. VPDPBUSD(S) are operations with src/dst on the same register, we are not supporting this as of now. -# - go: DotProdBroadcast +# QuadDotProduct, i.e. VPDPBUSD(S) are operations with src/dst on the same register, we are not supporting this as of now. 
+# - go: DotProductBroadcast # commutative: true # # documentation: !string |- # // NAME multiplies all elements and broadcasts the sum. -- go: AddDotProdQuadruple +- go: AddDotProductQuadruple commutative: false documentation: !string |- // NAME performs dot products on groups of 4 elements of x and y and then adds z. -- go: AddDotProdQuadrupleSaturated +- go: AddDotProductQuadrupleSaturated commutative: false documentation: !string |- // NAME multiplies performs dot products on groups of 4 elements of x and y and then adds z. -- go: AddDotProdPairs +- go: AddDotProductPairs commutative: false noTypes: "true" noGenericOps: "true" documentation: !string |- // NAME performs dot products on pairs of elements of y and z and then adds x. -- go: AddDotProdPairsSaturated - commutative: false - documentation: !string |- - // NAME performs dot products on pairs of elements of y and z and then adds x. - go: MulAdd commutative: false documentation: !string |- diff --git a/src/simd/_gen/simdgen/ops/MLOps/go.yaml b/src/simd/_gen/simdgen/ops/MLOps/go.yaml index 5c2009dcf81..162c47ea0ef 100644 --- a/src/simd/_gen/simdgen/ops/MLOps/go.yaml +++ b/src/simd/_gen/simdgen/ops/MLOps/go.yaml @@ -1,5 +1,5 @@ !sum -- go: DotProdPairs +- go: DotProductPairs asm: VPMADDWD in: - &int @@ -10,7 +10,7 @@ - &int2 # The elemBits are different go: $t2 base: int -- go: DotProdPairsSaturated +- go: DotProductPairsSaturated asm: VPMADDUBSW in: - &uint @@ -23,7 +23,7 @@ overwriteElementBits: 8 out: - *int2 -# - go: DotProdBroadcast +# - go: DotProductBroadcast # asm: VDPP[SD] # in: # - &dpb_src @@ -33,7 +33,7 @@ # const: 127 # out: # - *dpb_src -- go: AddDotProdQuadruple +- go: AddDotProductQuadruple asm: "VPDPBUSD" operandOrder: "31" # switch operand 3 and 1 in: @@ -51,7 +51,7 @@ overwriteElementBits: 8 out: - *qdpa_acc -- go: AddDotProdQuadrupleSaturated +- go: AddDotProductQuadrupleSaturated asm: "VPDPBUSDS" operandOrder: "31" # switch operand 3 and 1 in: @@ -60,7 +60,7 @@ - *qdpa_src2 out: - 
*qdpa_acc -- go: AddDotProdPairs +- go: AddDotProductPairs asm: "VPDPWSSD" in: - &pdpa_acc @@ -77,14 +77,6 @@ overwriteElementBits: 16 out: - *pdpa_acc -- go: AddDotProdPairsSaturated - asm: "VPDPWSSDS" - in: - - *pdpa_acc - - *pdpa_src1 - - *pdpa_src2 - out: - - *pdpa_acc - go: MulAdd asm: "VFMADD213PS|VFMADD213PD" in: diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index 8956c2e0772..23316223617 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -314,56 +314,39 @@ func (x Uint64x4) Add(y Uint64x4) Uint64x4 // Asm: VPADDQ, CPU Feature: AVX512 func (x Uint64x8) Add(y Uint64x8) Uint64x8 -/* AddDotProdPairsSaturated */ +/* AddDotProductQuadruple */ -// AddDotProdPairsSaturated performs dot products on pairs of elements of y and z and then adds x. -// -// Asm: VPDPWSSDS, CPU Feature: AVXVNNI -func (x Int32x4) AddDotProdPairsSaturated(y Int16x8, z Int16x8) Int32x4 - -// AddDotProdPairsSaturated performs dot products on pairs of elements of y and z and then adds x. -// -// Asm: VPDPWSSDS, CPU Feature: AVXVNNI -func (x Int32x8) AddDotProdPairsSaturated(y Int16x16, z Int16x16) Int32x8 - -// AddDotProdPairsSaturated performs dot products on pairs of elements of y and z and then adds x. -// -// Asm: VPDPWSSDS, CPU Feature: AVX512VNNI -func (x Int32x16) AddDotProdPairsSaturated(y Int16x32, z Int16x32) Int32x16 - -/* AddDotProdQuadruple */ - -// AddDotProdQuadruple performs dot products on groups of 4 elements of x and y and then adds z. +// AddDotProductQuadruple performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSD, CPU Feature: AVXVNNI -func (x Int8x16) AddDotProdQuadruple(y Uint8x16, z Int32x4) Int32x4 +func (x Int8x16) AddDotProductQuadruple(y Uint8x16, z Int32x4) Int32x4 -// AddDotProdQuadruple performs dot products on groups of 4 elements of x and y and then adds z. +// AddDotProductQuadruple performs dot products on groups of 4 elements of x and y and then adds z. 
// // Asm: VPDPBUSD, CPU Feature: AVXVNNI -func (x Int8x32) AddDotProdQuadruple(y Uint8x32, z Int32x8) Int32x8 +func (x Int8x32) AddDotProductQuadruple(y Uint8x32, z Int32x8) Int32x8 -// AddDotProdQuadruple performs dot products on groups of 4 elements of x and y and then adds z. +// AddDotProductQuadruple performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSD, CPU Feature: AVX512VNNI -func (x Int8x64) AddDotProdQuadruple(y Uint8x64, z Int32x16) Int32x16 +func (x Int8x64) AddDotProductQuadruple(y Uint8x64, z Int32x16) Int32x16 -/* AddDotProdQuadrupleSaturated */ +/* AddDotProductQuadrupleSaturated */ -// AddDotProdQuadrupleSaturated multiplies performs dot products on groups of 4 elements of x and y and then adds z. +// AddDotProductQuadrupleSaturated multiplies performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSDS, CPU Feature: AVXVNNI -func (x Int8x16) AddDotProdQuadrupleSaturated(y Uint8x16, z Int32x4) Int32x4 +func (x Int8x16) AddDotProductQuadrupleSaturated(y Uint8x16, z Int32x4) Int32x4 -// AddDotProdQuadrupleSaturated multiplies performs dot products on groups of 4 elements of x and y and then adds z. +// AddDotProductQuadrupleSaturated multiplies performs dot products on groups of 4 elements of x and y and then adds z. // // Asm: VPDPBUSDS, CPU Feature: AVXVNNI -func (x Int8x32) AddDotProdQuadrupleSaturated(y Uint8x32, z Int32x8) Int32x8 +func (x Int8x32) AddDotProductQuadrupleSaturated(y Uint8x32, z Int32x8) Int32x8 -// AddDotProdQuadrupleSaturated multiplies performs dot products on groups of 4 elements of x and y and then adds z. +// AddDotProductQuadrupleSaturated multiplies performs dot products on groups of 4 elements of x and y and then adds z. 
// // Asm: VPDPBUSDS, CPU Feature: AVX512VNNI -func (x Int8x64) AddDotProdQuadrupleSaturated(y Uint8x64, z Int32x16) Int32x16 +func (x Int8x64) AddDotProductQuadrupleSaturated(y Uint8x64, z Int32x16) Int32x16 /* AddPairs */ @@ -2143,45 +2126,45 @@ func (x Float64x4) Div(y Float64x4) Float64x4 // Asm: VDIVPD, CPU Feature: AVX512 func (x Float64x8) Div(y Float64x8) Float64x8 -/* DotProdPairs */ +/* DotProductPairs */ -// DotProdPairs multiplies the elements and add the pairs together, +// DotProductPairs multiplies the elements and add the pairs together, // yielding a vector of half as many elements with twice the input element size. // // Asm: VPMADDWD, CPU Feature: AVX -func (x Int16x8) DotProdPairs(y Int16x8) Int32x4 +func (x Int16x8) DotProductPairs(y Int16x8) Int32x4 -// DotProdPairs multiplies the elements and add the pairs together, +// DotProductPairs multiplies the elements and add the pairs together, // yielding a vector of half as many elements with twice the input element size. // // Asm: VPMADDWD, CPU Feature: AVX2 -func (x Int16x16) DotProdPairs(y Int16x16) Int32x8 +func (x Int16x16) DotProductPairs(y Int16x16) Int32x8 -// DotProdPairs multiplies the elements and add the pairs together, +// DotProductPairs multiplies the elements and add the pairs together, // yielding a vector of half as many elements with twice the input element size. // // Asm: VPMADDWD, CPU Feature: AVX512 -func (x Int16x32) DotProdPairs(y Int16x32) Int32x16 +func (x Int16x32) DotProductPairs(y Int16x32) Int32x16 -/* DotProdPairsSaturated */ +/* DotProductPairsSaturated */ -// DotProdPairsSaturated multiplies the elements and add the pairs together with saturation, +// DotProductPairsSaturated multiplies the elements and add the pairs together with saturation, // yielding a vector of half as many elements with twice the input element size. 
// // Asm: VPMADDUBSW, CPU Feature: AVX -func (x Uint8x16) DotProdPairsSaturated(y Int8x16) Int16x8 +func (x Uint8x16) DotProductPairsSaturated(y Int8x16) Int16x8 -// DotProdPairsSaturated multiplies the elements and add the pairs together with saturation, +// DotProductPairsSaturated multiplies the elements and add the pairs together with saturation, // yielding a vector of half as many elements with twice the input element size. // // Asm: VPMADDUBSW, CPU Feature: AVX2 -func (x Uint8x32) DotProdPairsSaturated(y Int8x32) Int16x16 +func (x Uint8x32) DotProductPairsSaturated(y Int8x32) Int16x16 -// DotProdPairsSaturated multiplies the elements and add the pairs together with saturation, +// DotProductPairsSaturated multiplies the elements and add the pairs together with saturation, // yielding a vector of half as many elements with twice the input element size. // // Asm: VPMADDUBSW, CPU Feature: AVX512 -func (x Uint8x64) DotProdPairsSaturated(y Int8x64) Int16x32 +func (x Uint8x64) DotProductPairsSaturated(y Int8x64) Int16x32 /* Equal */