mirror of
https://github.com/golang/go.git
synced 2026-02-06 18:00:01 +00:00
[dev.simd] cmd/compile, simd: make Permute 128-bit use AVX VPSHUFB
Change-Id: Ib89f602f797065e411eb0cbc95ccf2748b25fdec Reviewed-on: https://go-review.googlesource.com/c/go/+/698295 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
parent
bc217d4170
commit
fa1e78c9ad
9 changed files with 63 additions and 46 deletions
|
|
@ -332,7 +332,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
|||
ssa.OpAMD64VPOR256,
|
||||
ssa.OpAMD64VPORD512,
|
||||
ssa.OpAMD64VPORQ512,
|
||||
ssa.OpAMD64VPERMB128,
|
||||
ssa.OpAMD64VPSHUFB128,
|
||||
ssa.OpAMD64VPERMB256,
|
||||
ssa.OpAMD64VPERMB512,
|
||||
ssa.OpAMD64VPERMW128,
|
||||
|
|
@ -606,7 +606,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
|||
ssa.OpAMD64VPORQMasked128,
|
||||
ssa.OpAMD64VPORQMasked256,
|
||||
ssa.OpAMD64VPORQMasked512,
|
||||
ssa.OpAMD64VPERMBMasked128,
|
||||
ssa.OpAMD64VPSHUFBMasked128,
|
||||
ssa.OpAMD64VPERMBMasked256,
|
||||
ssa.OpAMD64VPERMBMasked512,
|
||||
ssa.OpAMD64VPERMWMasked128,
|
||||
|
|
@ -1682,7 +1682,7 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
|
|||
ssa.OpAMD64VPERMI2QMasked256,
|
||||
ssa.OpAMD64VPERMI2PDMasked512,
|
||||
ssa.OpAMD64VPERMI2QMasked512,
|
||||
ssa.OpAMD64VPERMBMasked128,
|
||||
ssa.OpAMD64VPSHUFBMasked128,
|
||||
ssa.OpAMD64VPERMBMasked256,
|
||||
ssa.OpAMD64VPERMBMasked512,
|
||||
ssa.OpAMD64VPERMWMasked128,
|
||||
|
|
|
|||
|
|
@ -732,7 +732,7 @@
|
|||
(PermuteFloat32x16 ...) => (VPERMPS512 ...)
|
||||
(PermuteFloat64x4 ...) => (VPERMPD256 ...)
|
||||
(PermuteFloat64x8 ...) => (VPERMPD512 ...)
|
||||
(PermuteInt8x16 ...) => (VPERMB128 ...)
|
||||
(PermuteInt8x16 ...) => (VPSHUFB128 ...)
|
||||
(PermuteInt8x32 ...) => (VPERMB256 ...)
|
||||
(PermuteInt8x64 ...) => (VPERMB512 ...)
|
||||
(PermuteInt16x8 ...) => (VPERMW128 ...)
|
||||
|
|
@ -742,7 +742,7 @@
|
|||
(PermuteInt32x16 ...) => (VPERMD512 ...)
|
||||
(PermuteInt64x4 ...) => (VPERMQ256 ...)
|
||||
(PermuteInt64x8 ...) => (VPERMQ512 ...)
|
||||
(PermuteUint8x16 ...) => (VPERMB128 ...)
|
||||
(PermuteUint8x16 ...) => (VPSHUFB128 ...)
|
||||
(PermuteUint8x32 ...) => (VPERMB256 ...)
|
||||
(PermuteUint8x64 ...) => (VPERMB512 ...)
|
||||
(PermuteUint16x8 ...) => (VPERMW128 ...)
|
||||
|
|
|
|||
|
|
@ -364,10 +364,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
|
|||
{name: "VPDPWSSDSMasked128", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec128", resultInArg0: true},
|
||||
{name: "VPDPWSSDSMasked256", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec256", resultInArg0: true},
|
||||
{name: "VPDPWSSDSMasked512", argLength: 4, reg: w3kw, asm: "VPDPWSSDS", commutative: false, typ: "Vec512", resultInArg0: true},
|
||||
{name: "VPERMB128", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||
{name: "VPERMB256", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||
{name: "VPERMB512", argLength: 2, reg: w21, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||
{name: "VPERMBMasked128", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||
{name: "VPERMBMasked256", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||
{name: "VPERMBMasked512", argLength: 3, reg: w2kw, asm: "VPERMB", commutative: false, typ: "Vec512", resultInArg0: false},
|
||||
{name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||
|
|
@ -817,6 +815,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
|
|||
{name: "VPSHRDVWMasked128", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec128", resultInArg0: true},
|
||||
{name: "VPSHRDVWMasked256", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec256", resultInArg0: true},
|
||||
{name: "VPSHRDVWMasked512", argLength: 4, reg: w3kw, asm: "VPSHRDVW", commutative: false, typ: "Vec512", resultInArg0: true},
|
||||
{name: "VPSHUFB128", argLength: 2, reg: v21, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||
{name: "VPSHUFBMasked128", argLength: 3, reg: w2kw, asm: "VPSHUFB", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||
{name: "VPSIGNB128", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||
{name: "VPSIGNB256", argLength: 2, reg: v21, asm: "VPSIGNB", commutative: false, typ: "Vec256", resultInArg0: false},
|
||||
{name: "VPSIGND128", argLength: 2, reg: v21, asm: "VPSIGND", commutative: false, typ: "Vec128", resultInArg0: false},
|
||||
|
|
|
|||
|
|
@ -1587,10 +1587,8 @@ const (
|
|||
OpAMD64VPDPWSSDSMasked128
|
||||
OpAMD64VPDPWSSDSMasked256
|
||||
OpAMD64VPDPWSSDSMasked512
|
||||
OpAMD64VPERMB128
|
||||
OpAMD64VPERMB256
|
||||
OpAMD64VPERMB512
|
||||
OpAMD64VPERMBMasked128
|
||||
OpAMD64VPERMBMasked256
|
||||
OpAMD64VPERMBMasked512
|
||||
OpAMD64VPERMD256
|
||||
|
|
@ -2040,6 +2038,8 @@ const (
|
|||
OpAMD64VPSHRDVWMasked128
|
||||
OpAMD64VPSHRDVWMasked256
|
||||
OpAMD64VPSHRDVWMasked512
|
||||
OpAMD64VPSHUFB128
|
||||
OpAMD64VPSHUFBMasked128
|
||||
OpAMD64VPSIGNB128
|
||||
OpAMD64VPSIGNB256
|
||||
OpAMD64VPSIGND128
|
||||
|
|
@ -24358,20 +24358,6 @@ var opcodeTable = [...]opInfo{
|
|||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VPERMB128",
|
||||
argLen: 2,
|
||||
asm: x86.AVPERMB,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
{1, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VPERMB256",
|
||||
argLen: 2,
|
||||
|
|
@ -24400,21 +24386,6 @@ var opcodeTable = [...]opInfo{
|
|||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VPERMBMasked128",
|
||||
argLen: 3,
|
||||
asm: x86.AVPERMB,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
|
||||
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VPERMBMasked256",
|
||||
argLen: 3,
|
||||
|
|
@ -31046,6 +31017,35 @@ var opcodeTable = [...]opInfo{
|
|||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VPSHUFB128",
|
||||
argLen: 2,
|
||||
asm: x86.AVPSHUFB,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VPSHUFBMasked128",
|
||||
argLen: 3,
|
||||
asm: x86.AVPSHUFB,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{2, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
|
||||
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "VPSIGNB128",
|
||||
argLen: 2,
|
||||
|
|
|
|||
|
|
@ -3257,7 +3257,7 @@ func rewriteValueAMD64(v *Value) bool {
|
|||
v.Op = OpAMD64VPERMQ512
|
||||
return true
|
||||
case OpPermuteInt8x16:
|
||||
v.Op = OpAMD64VPERMB128
|
||||
v.Op = OpAMD64VPSHUFB128
|
||||
return true
|
||||
case OpPermuteInt8x32:
|
||||
v.Op = OpAMD64VPERMB256
|
||||
|
|
@ -3287,7 +3287,7 @@ func rewriteValueAMD64(v *Value) bool {
|
|||
v.Op = OpAMD64VPERMQ512
|
||||
return true
|
||||
case OpPermuteUint8x16:
|
||||
v.Op = OpAMD64VPERMB128
|
||||
v.Op = OpAMD64VPSHUFB128
|
||||
return true
|
||||
case OpPermuteUint8x32:
|
||||
v.Op = OpAMD64VPERMB256
|
||||
|
|
|
|||
|
|
@ -740,8 +740,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
|
|||
addF(simdPackage, "Uint64x2.Or", opLen2(ssa.OpOrUint64x2, types.TypeVec128), sys.AMD64)
|
||||
addF(simdPackage, "Uint64x4.Or", opLen2(ssa.OpOrUint64x4, types.TypeVec256), sys.AMD64)
|
||||
addF(simdPackage, "Uint64x8.Or", opLen2(ssa.OpOrUint64x8, types.TypeVec512), sys.AMD64)
|
||||
addF(simdPackage, "Int8x16.Permute", opLen2_21(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
|
||||
addF(simdPackage, "Uint8x16.Permute", opLen2_21(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
|
||||
addF(simdPackage, "Int8x16.Permute", opLen2(ssa.OpPermuteInt8x16, types.TypeVec128), sys.AMD64)
|
||||
addF(simdPackage, "Uint8x16.Permute", opLen2(ssa.OpPermuteUint8x16, types.TypeVec128), sys.AMD64)
|
||||
addF(simdPackage, "Int8x32.Permute", opLen2_21(ssa.OpPermuteInt8x32, types.TypeVec256), sys.AMD64)
|
||||
addF(simdPackage, "Uint8x32.Permute", opLen2_21(ssa.OpPermuteUint8x32, types.TypeVec256), sys.AMD64)
|
||||
addF(simdPackage, "Int8x64.Permute", opLen2_21(ssa.OpPermuteInt8x64, types.TypeVec512), sys.AMD64)
|
||||
|
|
|
|||
|
|
@ -74,4 +74,4 @@
|
|||
commutative: false
|
||||
documentation: !string |-
|
||||
// NAME copies element zero of its (128-bit) input to all elements of
|
||||
// the 512-bit output vector.
|
||||
// the 512-bit output vector.
|
||||
|
|
@ -418,3 +418,18 @@
|
|||
bits: 512
|
||||
elemBits: $e
|
||||
base: $b
|
||||
|
||||
# VPSHUFB is picked with higher priority than VPERMB for 128-bit byte shuffles because of its lower CPU feature requirement (it only needs AVX).
|
||||
- go: Permute
|
||||
asm: VPSHUFB
|
||||
addDoc: !string |-
|
||||
// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
|
||||
in:
|
||||
- &128any
|
||||
bits: 128
|
||||
go: $t
|
||||
- bits: 128
|
||||
go: $t
|
||||
name: indices
|
||||
out:
|
||||
- *128any
|
||||
|
|
@ -4155,15 +4155,17 @@ func (x Uint64x8) Or(y Uint64x8) Uint64x8
|
|||
// Permute performs a full permutation of vector x using indices:
|
||||
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
|
||||
// Only the needed bits to represent x's index are used in indices' elements.
|
||||
// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
|
||||
//
|
||||
// Asm: VPERMB, CPU Feature: AVX512VBMI
|
||||
func (x Int8x16) Permute(indices Uint8x16) Int8x16
|
||||
// Asm: VPSHUFB, CPU Feature: AVX
|
||||
func (x Int8x16) Permute(indices Int8x16) Int8x16
|
||||
|
||||
// Permute performs a full permutation of vector x using indices:
|
||||
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
|
||||
// Only the needed bits to represent x's index are used in indices' elements.
|
||||
// However, when the top bit is set, the low bits will be disregarded and the respective element in the result vector will be zeroed.
|
||||
//
|
||||
// Asm: VPERMB, CPU Feature: AVX512VBMI
|
||||
// Asm: VPSHUFB, CPU Feature: AVX
|
||||
func (x Uint8x16) Permute(indices Uint8x16) Uint8x16
|
||||
|
||||
// Permute performs a full permutation of vector x using indices:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue