mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
cmd/compile: intrinsify swissmap match calls with SIMD on amd64
Use similar SIMD operations to the ones used in Abseil. We still use 8-slot groups (even though the XMM registers could handle 16-slot groups) to keep the implementation simpler (no changes to the memory layout of maps). Still, the implementations of matchH2 and matchEmpty are shorter than the portable version using standard arithmetic operations. They also return a packed bitset, which avoids the need to shift in bitset.first. That said, the packed bitset is a downside in cognitive complexity, as we have to think about two different possible representations. This doesn't leak out of the API, but we do need to intrinsify bitset to switch to a compatible implementation. The compiler's intrinsics don't support intrinsifying methods, so the implementations move to free functions. This makes operations between 0-3% faster on my machine. e.g., MapGetHit/impl=runtimeMap/t=Int64/len=6-12 12.34n ± 1% 11.42n ± 1% -7.46% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=12-12 15.14n ± 2% 14.88n ± 1% -1.72% (p=0.009 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=18-12 15.04n ± 6% 14.66n ± 2% -2.53% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=24-12 15.80n ± 1% 15.48n ± 3% ~ (p=0.444 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=30-12 15.55n ± 4% 14.77n ± 3% -5.02% (p=0.004 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=64-12 15.26n ± 1% 15.05n ± 1% ~ (p=0.055 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=128-12 15.34n ± 1% 15.02n ± 2% -2.09% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=256-12 15.42n ± 1% 15.15n ± 1% -1.75% (p=0.001 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=512-12 15.48n ± 1% 15.18n ± 1% -1.94% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1024-12 17.38n ± 1% 17.05n ± 1% -1.90% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=2048-12 17.96n ± 0% 17.59n ± 1% -2.06% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4096-12 18.36n ± 1% 18.18n ± 1% -0.98% (p=0.013 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=8192-12 18.75n ± 0% 18.31n ± 
1% -2.35% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=65536-12 26.25n ± 0% 25.95n ± 1% -1.14% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=262144-12 44.24n ± 1% 44.06n ± 1% ~ (p=0.181 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1048576-12 85.02n ± 0% 85.35n ± 0% +0.39% (p=0.032 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4194304-12 98.87n ± 1% 98.85n ± 1% ~ (p=0.799 n=25) For #54766. Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-amd64-goamd64v3 Change-Id: Ic1b852f02744404122cb3672900fd95f4625905e Reviewed-on: https://go-review.googlesource.com/c/go/+/626277 Reviewed-by: Keith Randall <khr@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Michael Pratt <mpratt@google.com> Reviewed-by: Keith Randall <khr@google.com>
This commit is contained in:
parent
b8ba5b440b
commit
dce30a1920
8 changed files with 1858 additions and 1287 deletions
|
|
@ -1134,6 +1134,60 @@ func init() {
|
|||
{name: "SHRXLloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHRXL", scale: 8, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32
|
||||
{name: "SHRXQloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SHRXQ", scale: 1, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+1*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64
|
||||
{name: "SHRXQloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHRXQ", scale: 8, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64
|
||||
|
||||
// Unpack bytes, low 64-bits.
|
||||
//
|
||||
// Input/output registers treated as [8]uint8.
|
||||
//
|
||||
// output = {in1[0], in2[0], in1[1], in2[1], in1[2], in2[2], in1[3], in2[3]}
|
||||
{name: "PUNPCKLBW", argLength: 2, reg: fp21, resultInArg0: true, asm: "PUNPCKLBW"},
|
||||
|
||||
// Shuffle 16-bit words, low 64-bits.
|
||||
//
|
||||
// Input/output registers treated as [4]uint16.
|
||||
// aux=source word index for each destination word, 2 bits per index.
|
||||
//
|
||||
// output[i] = input[(aux>>(2*i))&3].
|
||||
{name: "PSHUFLW", argLength: 1, reg: fp11, aux: "Int8", asm: "PSHUFLW"},
|
||||
|
||||
// Broadcast input byte.
|
||||
//
|
||||
// Input treated as uint8, output treated as [16]uint8.
|
||||
//
|
||||
// output[i] = input.
|
||||
{name: "PSHUFBbroadcast", argLength: 1, reg: fp11, resultInArg0: true, asm: "PSHUFB"}, // PSHUFB with mask zero, (GOAMD64=v1)
|
||||
{name: "VPBROADCASTB", argLength: 1, reg: gpfp, asm: "VPBROADCASTB"}, // Broadcast input byte from gp (GOAMD64=v3)
|
||||
|
||||
// Byte negate/zero/preserve (GOAMD64=v2).
|
||||
//
|
||||
// Input/output registers treated as [16]int8.
|
||||
//
|
||||
// if in2[i] > 0 {
|
||||
// output[i] = in1[i]
|
||||
// } else if in2[i] == 0 {
|
||||
// output[i] = 0
|
||||
// } else {
|
||||
// output[i] = -1 * in1[i]
|
||||
// }
|
||||
{name: "PSIGNB", argLength: 2, reg: fp21, resultInArg0: true, asm: "PSIGNB"},
|
||||
|
||||
// Byte compare.
|
||||
//
|
||||
// Input/output registers treated as [16]uint8.
|
||||
//
|
||||
// if in1[i] == in2[i] {
|
||||
// output[i] = 0xff
|
||||
// } else {
|
||||
// output[i] = 0
|
||||
// }
|
||||
{name: "PCMPEQB", argLength: 2, reg: fp21, resultInArg0: true, asm: "PCMPEQB"},
|
||||
|
||||
// Byte sign mask. Output is a bitmap of sign bits from each input byte.
|
||||
//
|
||||
// Input treated as [16]uint8. Output is [16]bit (uint16 bitmap).
|
||||
//
|
||||
// output[i] = (input[i] >> 7) & 1
|
||||
{name: "PMOVMSKB", argLength: 1, reg: fpgp, asm: "PMOVMSKB"},
|
||||
}
|
||||
|
||||
var AMD64blocks = []blockData{
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue