cmd/compile: intrinsify swissmap match calls with SIMD on amd64

Use similar SIMD operations to the ones used in Abseil. We are still
using 8-slot groups (even though the XMM registers could handle 16-slot
groups) to keep the implementation simpler (no changes to the memory
layout of maps).

Still, the implementations of matchH2 and matchEmpty are shorter than
the portable version using standard arithmetic operations. They also
return a packed bitset, which avoids the need to shift in bitset.first.

That said, the packed bitset is a downside in cognitive complexity, as
we have to think about two different possible representations. This
doesn't leak out of the API, but we do need to intrinsify bitset to
switch to a compatible implementation.

The compiler's intrinsics don't support intrinsifying methods, so the
implementations move to free functions.

This makes operations between 0-3% faster on my machine. e.g.,

MapGetHit/impl=runtimeMap/t=Int64/len=6-12                      12.34n ±  1%   11.42n ± 1%   -7.46% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=12-12                     15.14n ±  2%   14.88n ± 1%   -1.72% (p=0.009 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=18-12                     15.04n ±  6%   14.66n ± 2%   -2.53% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=24-12                     15.80n ±  1%   15.48n ± 3%        ~ (p=0.444 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=30-12                     15.55n ±  4%   14.77n ± 3%   -5.02% (p=0.004 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=64-12                     15.26n ±  1%   15.05n ± 1%        ~ (p=0.055 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=128-12                    15.34n ±  1%   15.02n ± 2%   -2.09% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=256-12                    15.42n ±  1%   15.15n ± 1%   -1.75% (p=0.001 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=512-12                    15.48n ±  1%   15.18n ± 1%   -1.94% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=1024-12                   17.38n ±  1%   17.05n ± 1%   -1.90% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=2048-12                   17.96n ±  0%   17.59n ± 1%   -2.06% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=4096-12                   18.36n ±  1%   18.18n ± 1%   -0.98% (p=0.013 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=8192-12                   18.75n ±  0%   18.31n ± 1%   -2.35% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=65536-12                  26.25n ±  0%   25.95n ± 1%   -1.14% (p=0.000 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=262144-12                 44.24n ±  1%   44.06n ± 1%        ~ (p=0.181 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=1048576-12                85.02n ±  0%   85.35n ± 0%   +0.39% (p=0.032 n=25)
MapGetHit/impl=runtimeMap/t=Int64/len=4194304-12                98.87n ±  1%   98.85n ± 1%        ~ (p=0.799 n=25)

For #54766.

Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-amd64-goamd64v3
Change-Id: Ic1b852f02744404122cb3672900fd95f4625905e
Reviewed-on: https://go-review.googlesource.com/c/go/+/626277
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Michael Pratt <mpratt@google.com>
Reviewed-by: Keith Randall <khr@google.com>
This commit is contained in:
Michael Pratt 2024-11-04 12:41:33 -05:00 committed by Gopher Robot
parent b8ba5b440b
commit dce30a1920
8 changed files with 1858 additions and 1287 deletions

View file

@ -256,9 +256,39 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
ssa.OpAMD64POR, ssa.OpAMD64PXOR, ssa.OpAMD64POR, ssa.OpAMD64PXOR,
ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ, ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ,
ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ, ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ,
ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ: ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ,
ssa.OpAMD64PCMPEQB, ssa.OpAMD64PSIGNB,
ssa.OpAMD64PUNPCKLBW:
opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg()) opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
case ssa.OpAMD64PSHUFLW:
p := s.Prog(v.Op.Asm())
imm := v.AuxInt
if imm < 0 || imm > 255 {
v.Fatalf("Invalid source selection immediate")
}
p.From.Offset = imm
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(v.Args[0].Reg())
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64PSHUFBbroadcast:
// PSHUFB with a control mask of zero copies byte 0 to all
// bytes in the register.
//
// X15 is always zero with ABIInternal.
if s.ABI != obj.ABIInternal {
// zero X15 manually
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
p.From.Reg = x86.REG_X15
case ssa.OpAMD64SHRDQ, ssa.OpAMD64SHLDQ: case ssa.OpAMD64SHRDQ, ssa.OpAMD64SHLDQ:
p := s.Prog(v.Op.Asm()) p := s.Prog(v.Op.Asm())
lo, hi, bits := v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg() lo, hi, bits := v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg()
@ -915,7 +945,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
ssagen.AddAux2(&p.To, v, sc.Off64()) ssagen.AddAux2(&p.To, v, sc.Off64())
case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX, case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ, ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS: ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS, ssa.OpAMD64VPBROADCASTB, ssa.OpAMD64PMOVMSKB:
opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg()) opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS: case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
r := v.Reg() r := v.Reg()

View file

@ -1134,6 +1134,60 @@ func init() {
{name: "SHRXLloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHRXL", scale: 8, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32 {name: "SHRXLloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHRXL", scale: 8, aux: "SymOff", typ: "Uint32", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 32
{name: "SHRXQloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SHRXQ", scale: 1, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+1*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64 {name: "SHRXQloadidx1", argLength: 4, reg: gp21shxloadidx, asm: "SHRXQ", scale: 1, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+1*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64
{name: "SHRXQloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHRXQ", scale: 8, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64 {name: "SHRXQloadidx8", argLength: 4, reg: gp21shxloadidx, asm: "SHRXQ", scale: 8, aux: "SymOff", typ: "Uint64", faultOnNilArg0: true, symEffect: "Read"}, // unsigned *(arg0+8*arg1+auxint+aux) >> arg2, arg3=mem, shift amount is mod 64
// Unpack bytes, low 64-bits.
//
// Input/output registers treated as [8]uint8.
//
// output = {in1[0], in2[0], in1[1], in2[1], in1[2], in2[2], in1[3], in2[3]}
{name: "PUNPCKLBW", argLength: 2, reg: fp21, resultInArg0: true, asm: "PUNPCKLBW"},
// Shuffle 16-bit words, low 64-bits.
//
// Input/output registers treated as [4]uint16.
// aux=source word index for each destination word, 2 bits per index.
//
// output[i] = input[(aux>>2*i)&3].
{name: "PSHUFLW", argLength: 1, reg: fp11, aux: "Int8", asm: "PSHUFLW"},
// Broadcast input byte.
//
// Input treated as uint8, output treated as [16]uint8.
//
// output[i] = input.
{name: "PSHUFBbroadcast", argLength: 1, reg: fp11, resultInArg0: true, asm: "PSHUFB"}, // PSHUFB with mask zero, (GOAMD64=v1)
{name: "VPBROADCASTB", argLength: 1, reg: gpfp, asm: "VPBROADCASTB"}, // Broadcast input byte from gp (GOAMD64=v3)
// Byte negate/zero/preserve (GOAMD64=v2).
//
// Input/output registers treated as [16]uint8.
//
// if in2[i] > 0 {
// output[i] = in1[i]
// } else if in2[i] == 0 {
// output[i] = 0
// } else {
// output[i] = -1 * in1[i]
// }
{name: "PSIGNB", argLength: 2, reg: fp21, resultInArg0: true, asm: "PSIGNB"},
// Byte compare.
//
// Input/output registers treated as [16]uint8.
//
// if in1[i] == in2[i] {
// output[i] = 0xff
// } else {
// output[i] = 0
// }
{name: "PCMPEQB", argLength: 2, reg: fp21, resultInArg0: true, asm: "PCMPEQB"},
// Byte sign mask. Output is a bitmap of sign bits from each input byte.
//
// Input treated as [16]uint8. Output is [16]bit (uint16 bitmap).
//
// output[i] = (input[i] >> 7) & 1
{name: "PMOVMSKB", argLength: 1, reg: fpgp, asm: "PMOVMSKB"},
} }
var AMD64blocks = []blockData{ var AMD64blocks = []blockData{

View file

@ -1150,6 +1150,13 @@ const (
OpAMD64SHRXLloadidx8 OpAMD64SHRXLloadidx8
OpAMD64SHRXQloadidx1 OpAMD64SHRXQloadidx1
OpAMD64SHRXQloadidx8 OpAMD64SHRXQloadidx8
OpAMD64PUNPCKLBW
OpAMD64PSHUFLW
OpAMD64PSHUFBbroadcast
OpAMD64VPBROADCASTB
OpAMD64PSIGNB
OpAMD64PCMPEQB
OpAMD64PMOVMSKB
OpARMADD OpARMADD
OpARMADDconst OpARMADDconst
@ -15333,6 +15340,105 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "PUNPCKLBW",
argLen: 2,
resultInArg0: true,
asm: x86.APUNPCKLBW,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "PSHUFLW",
auxType: auxInt8,
argLen: 1,
asm: x86.APSHUFLW,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "PSHUFBbroadcast",
argLen: 1,
resultInArg0: true,
asm: x86.APSHUFB,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPBROADCASTB",
argLen: 1,
asm: x86.AVPBROADCASTB,
reg: regInfo{
inputs: []inputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "PSIGNB",
argLen: 2,
resultInArg0: true,
asm: x86.APSIGNB,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "PCMPEQB",
argLen: 2,
resultInArg0: true,
asm: x86.APCMPEQB,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "PMOVMSKB",
argLen: 1,
asm: x86.APMOVMSKB,
reg: regInfo{
inputs: []inputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{ {
name: "ADD", name: "ADD",

View file

@ -6,6 +6,7 @@ package ssagen
import ( import (
"fmt" "fmt"
"internal/abi"
"internal/buildcfg" "internal/buildcfg"
"cmd/compile/internal/base" "cmd/compile/internal/base"
@ -1259,6 +1260,297 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
/******** math/big ********/ /******** math/big ********/
alias("math/big", "mulWW", "math/bits", "Mul64", p8...) alias("math/big", "mulWW", "math/bits", "Mul64", p8...)
/******** internal/runtime/maps ********/
// Important: The intrinsic implementations below return a packed
// bitset, while the portable Go implementation uses an unpacked
// representation (one bit set in each byte).
//
// Thus we must replace most bitset methods with implementations that
// work with the packed representation.
//
// TODO(prattmic): The bitset implementations don't use SIMD, so they
// could be handled with build tags (though that would break
// -d=ssa/intrinsics/off=1).
// With a packed representation we no longer need to shift the result
// of TrailingZeros64.
alias("internal/runtime/maps", "bitsetFirst", "internal/runtime/sys", "TrailingZeros64", sys.ArchAMD64)
addF("internal/runtime/maps", "bitsetRemoveBelow",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
b := args[0]
i := args[1]
// Clear the lower i bits in b.
//
// out = b &^ ((1 << i) - 1)
one := s.constInt64(types.Types[types.TUINT64], 1)
mask := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT64], one, i)
mask = s.newValue2(ssa.OpSub64, types.Types[types.TUINT64], mask, one)
mask = s.newValue1(ssa.OpCom64, types.Types[types.TUINT64], mask)
return s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, mask)
},
sys.AMD64)
addF("internal/runtime/maps", "bitsetLowestSet",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
b := args[0]
// Test the lowest bit in b.
//
// out = (b & 1) == 1
one := s.constInt64(types.Types[types.TUINT64], 1)
and := s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, one)
return s.newValue2(ssa.OpEq64, types.Types[types.TBOOL], and, one)
},
sys.AMD64)
addF("internal/runtime/maps", "bitsetShiftOutLowest",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
b := args[0]
// Right shift out the lowest bit in b.
//
// out = b >> 1
one := s.constInt64(types.Types[types.TUINT64], 1)
return s.newValue2(ssa.OpRsh64Ux64, types.Types[types.TUINT64], b, one)
},
sys.AMD64)
addF("internal/runtime/maps", "ctrlGroupMatchH2",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
g := args[0]
h := args[1]
// Explicit copies to fp registers. See
// https://go.dev/issue/70451.
gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
hfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, h)
// Broadcast h2 into each byte of a word.
var broadcast *ssa.Value
if buildcfg.GOAMD64 >= 4 {
// VPBROADCASTB saves 1 instruction vs PSHUFB
// because the input can come from a GP
// register, while PSHUFB requires moving into
// an FP register first.
//
// Nominally PSHUFB would require a second
// additional instruction to load the control
// mask into a FP register. But broadcast uses
// a control mask of 0, and the register ABI
// already defines X15 as a zero register.
broadcast = s.newValue1(ssa.OpAMD64VPBROADCASTB, types.TypeInt128, h) // use gp copy of h
} else if buildcfg.GOAMD64 >= 2 {
// PSHUFB performs a byte broadcast when given
// a control input of 0.
broadcast = s.newValue1(ssa.OpAMD64PSHUFBbroadcast, types.TypeInt128, hfp)
} else {
// No direct byte broadcast. First we must
// duplicate the lower byte and then do a
// 16-bit broadcast.
// "Unpack" h2 with itself. This duplicates the
// input, resulting in h2 in the lower two
// bytes.
unpack := s.newValue2(ssa.OpAMD64PUNPCKLBW, types.TypeInt128, hfp, hfp)
// Copy the lower 16-bits of unpack into every
// 16-bit slot in the lower 64-bits of the
// output register. Note that immediate 0
// selects the low word as the source for every
// destination slot.
broadcast = s.newValue1I(ssa.OpAMD64PSHUFLW, types.TypeInt128, 0, unpack)
// No need to broadcast into the upper 64-bits,
// as we don't use those.
}
// Compare each byte of the control word with h2. Each
// matching byte has every bit set.
eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, broadcast, gfp)
// Construct a "byte mask": each output bit is equal to
// the sign bit of each input byte.
//
// This results in a packed output (bit N set means
// byte N matched).
//
// NOTE: See comment above on bitsetFirst.
out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq)
// g is only 64-bits so the upper 64-bits of the
// 128-bit register will be zero. If h2 is also zero,
// then we'll get matches on those bytes. Truncate the
// upper bits to ignore such matches.
ret := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
return ret
},
sys.AMD64)
addF("internal/runtime/maps", "ctrlGroupMatchEmpty",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// An empty slot is 1000 0000
// A deleted slot is 1111 1110
// A full slot is 0??? ????
g := args[0]
// Explicit copy to fp register. See
// https://go.dev/issue/70451.
gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
if buildcfg.GOAMD64 >= 2 {
// "PSIGNB negates each data element of the
// destination operand (the first operand) if
// the signed integer value of the
// corresponding data element in the source
// operand (the second operand) is less than
// zero. If the signed integer value of a data
// element in the source operand is positive,
// the corresponding data element in the
// destination operand is unchanged. If a data
// element in the source operand is zero, the
// corresponding data element in the
// destination operand is set to zero" - Intel SDM
//
// If we pass the group control word as both
// arguments:
// - Full slots are unchanged.
// - Deleted slots are negated, becoming
// 0000 0010.
// - Empty slots are negated, becoming
// 1000 0000 (unchanged!).
//
// The result is that only empty slots have the
// sign bit set. We then use PMOVMSKB to
// extract the sign bits.
sign := s.newValue2(ssa.OpAMD64PSIGNB, types.TypeInt128, gfp, gfp)
// Construct a "byte mask": each output bit is
// equal to the sign bit of each input byte. The
// sign bit is only set for empty or deleted
// slots.
//
// This results in a packed output (bit N set
// means byte N matched).
//
// NOTE: See comment above on bitsetFirst.
ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], sign)
// g is only 64-bits so the upper 64-bits of
// the 128-bit register will be zero. PSIGNB
// will keep all of these bytes zero, so no
// need to truncate.
return ret
}
// No PSIGNB, simply do byte equality with ctrlEmpty.
// Load ctrlEmpty into each byte of a control word.
var ctrlsEmpty uint64 = abi.SwissMapCtrlEmpty
e := s.constInt64(types.Types[types.TUINT64], int64(ctrlsEmpty))
// Explicit copy to fp register. See
// https://go.dev/issue/70451.
efp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, e)
// Compare each byte of the control word with ctrlEmpty. Each
// matching byte has every bit set.
eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, efp, gfp)
// Construct a "byte mask": each output bit is equal to
// the sign bit of each input byte.
//
// This results in a packed output (bit N set means
// byte N matched).
//
// NOTE: See comment above on bitsetFirst.
out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq)
// g is only 64-bits so the upper 64-bits of the
// 128-bit register will be zero. The upper 64-bits of
// efp are also zero, so we'll get matches on those
// bytes. Truncate the upper bits to ignore such
// matches.
return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
},
sys.AMD64)
addF("internal/runtime/maps", "ctrlGroupMatchEmptyOrDeleted",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// An empty slot is 1000 0000
// A deleted slot is 1111 1110
// A full slot is 0??? ????
//
// A slot is empty or deleted iff bit 7 (sign bit) is
// set.
g := args[0]
// Explicit copy to fp register. See
// https://go.dev/issue/70451.
gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
// Construct a "byte mask": each output bit is equal to
// the sign bit of each input byte. The sign bit is only
// set for empty or deleted slots.
//
// This results in a packed output (bit N set means
// byte N matched).
//
// NOTE: See comment above on bitsetFirst.
ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)
// g is only 64-bits so the upper 64-bits of the
// 128-bit register will be zero. Zero will never match
// ctrlEmpty or ctrlDeleted, so no need to truncate.
return ret
},
sys.AMD64)
addF("internal/runtime/maps", "ctrlGroupMatchFull",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// An empty slot is 1000 0000
// A deleted slot is 1111 1110
// A full slot is 0??? ????
//
// A slot is full iff bit 7 (sign bit) is unset.
g := args[0]
// Explicit copy to fp register. See
// https://go.dev/issue/70451.
gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
// Construct a "byte mask": each output bit is equal to
// the sign bit of each input byte. The sign bit is only
// set for empty or deleted slots.
//
// This results in a packed output (bit N set means
// byte N matched).
//
// NOTE: See comment above on bitsetFirst.
mask := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)
// Invert the mask to set the bits for the full slots.
out := s.newValue1(ssa.OpCom16, types.Types[types.TUINT16], mask)
// g is only 64-bits so the upper 64-bits of the
// 128-bit register will be zero, with bit 7 unset.
// Truncate the upper bits to ignore these.
return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
},
sys.AMD64)
} }
// findIntrinsic returns a function which builds the SSA equivalent of the // findIntrinsic returns a function which builds the SSA equivalent of the

View file

@ -88,6 +88,14 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"amd64", "internal/runtime/atomic", "Xchgint32"}: struct{}{}, {"amd64", "internal/runtime/atomic", "Xchgint32"}: struct{}{},
{"amd64", "internal/runtime/atomic", "Xchgint64"}: struct{}{}, {"amd64", "internal/runtime/atomic", "Xchgint64"}: struct{}{},
{"amd64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{}, {"amd64", "internal/runtime/atomic", "Xchguintptr"}: struct{}{},
{"amd64", "internal/runtime/maps", "bitsetFirst"}: struct{}{},
{"amd64", "internal/runtime/maps", "bitsetRemoveBelow"}: struct{}{},
{"amd64", "internal/runtime/maps", "bitsetLowestSet"}: struct{}{},
{"amd64", "internal/runtime/maps", "bitsetShiftOutLowest"}: struct{}{},
{"amd64", "internal/runtime/maps", "ctrlGroupMatchH2"}: struct{}{},
{"amd64", "internal/runtime/maps", "ctrlGroupMatchEmpty"}: struct{}{},
{"amd64", "internal/runtime/maps", "ctrlGroupMatchEmptyOrDeleted"}: struct{}{},
{"amd64", "internal/runtime/maps", "ctrlGroupMatchFull"}: struct{}{},
{"amd64", "internal/runtime/math", "Add64"}: struct{}{}, {"amd64", "internal/runtime/math", "Add64"}: struct{}{},
{"amd64", "internal/runtime/math", "Mul64"}: struct{}{}, {"amd64", "internal/runtime/math", "Mul64"}: struct{}{},
{"amd64", "internal/runtime/math", "MulUintptr"}: struct{}{}, {"amd64", "internal/runtime/math", "MulUintptr"}: struct{}{},

View file

@ -30,33 +30,82 @@ const (
// bitset represents a set of slots within a group. // bitset represents a set of slots within a group.
// //
// The underlying representation uses one byte per slot, where each byte is // The underlying representation depends on GOARCH.
//
// On AMD64, bitset uses one bit per slot, where the bit is set if the slot is
// part of the set. All of the ctrlGroup.match* methods are replaced with
// intrinsics that return this packed representation.
//
// On other architectures, bitset uses one byte per slot, where each byte is
// either 0x80 if the slot is part of the set or 0x00 otherwise. This makes it // either 0x80 if the slot is part of the set or 0x00 otherwise. This makes it
// convenient to calculate for an entire group at once (e.g. see matchEmpty). // convenient to calculate for an entire group at once using standard
// arithmetic instructions.
type bitset uint64 type bitset uint64
// first assumes that only the MSB of each control byte can be set (e.g. bitset // first returns the relative index of the first control byte in the group that
// is the result of matchEmpty or similar) and returns the relative index of the // is in the set.
// first control byte in the group that has the MSB set.
// //
// Returns abi.SwissMapGroupSlots if the bitset is empty. // Preconditions: b is not 0 (empty).
func (b bitset) first() uintptr { func (b bitset) first() uintptr {
return bitsetFirst(b)
}
// Portable implementation of first.
//
// On AMD64, this is replaced with an intrinsic that simply does
// TrailingZeros64. There is no need to shift as the bitset is packed.
func bitsetFirst(b bitset) uintptr {
return uintptr(sys.TrailingZeros64(uint64(b))) >> 3 return uintptr(sys.TrailingZeros64(uint64(b))) >> 3
} }
// removeFirst removes the first set bit (that is, resets the least significant // removeFirst clears the first set bit (that is, resets the least significant
// set bit to 0). // set bit to 0).
func (b bitset) removeFirst() bitset { func (b bitset) removeFirst() bitset {
return b & (b - 1) return b & (b - 1)
} }
// removeBelow removes all set bits below slot i (non-inclusive). // removeBelow clears all set bits below slot i (non-inclusive).
func (b bitset) removeBelow(i uintptr) bitset { func (b bitset) removeBelow(i uintptr) bitset {
return bitsetRemoveBelow(b, i)
}
// Portable implementation of removeBelow.
//
// On AMD64, this is replaced with an intrinsic that clears the lower i bits.
func bitsetRemoveBelow(b bitset, i uintptr) bitset {
// Clear all bits below slot i's byte. // Clear all bits below slot i's byte.
mask := (uint64(1) << (8 * uint64(i))) - 1 mask := (uint64(1) << (8 * uint64(i))) - 1
return b &^ bitset(mask) return b &^ bitset(mask)
} }
// lowestSet returns true if the bit is set for the lowest index in the bitset.
//
// This is intended for use with shiftOutLowest to loop over all entries in the
// bitset regardless of whether they are set.
func (b bitset) lowestSet() bool {
return bitsetLowestSet(b)
}
// Portable implementation of lowestSet.
//
// On AMD64, this is replaced with an intrinsic that checks the lowest bit.
func bitsetLowestSet(b bitset) bool {
return b&(1<<7) != 0
}
// shiftOutLowest shifts the lowest entry out of the bitset. Afterwards, the
// lowest entry in the bitset corresponds to the next slot.
func (b bitset) shiftOutLowest() bitset {
return bitsetShiftOutLowest(b)
}
// Portable implementation of shiftOutLowest.
//
// On AMD64, this is replaced with an intrinsic that shifts a single bit.
func bitsetShiftOutLowest(b bitset) bitset {
return b >> 8
}
// Each slot in the hash table has a control byte which can have one of three // Each slot in the hash table has a control byte which can have one of three
// states: empty, deleted, and full. They have the following bit patterns: // states: empty, deleted, and full. They have the following bit patterns:
// //
@ -96,6 +145,14 @@ func (g *ctrlGroup) setEmpty() {
// matchH2 returns the set of slots which are full and for which the 7-bit hash // matchH2 returns the set of slots which are full and for which the 7-bit hash
// matches the given value. May return false positives. // matches the given value. May return false positives.
func (g ctrlGroup) matchH2(h uintptr) bitset { func (g ctrlGroup) matchH2(h uintptr) bitset {
return ctrlGroupMatchH2(g, h)
}
// Portable implementation of matchH2.
//
// Note: On AMD64, this is an intrinsic implemented with SIMD instructions. See
// note on bitset about the packed intrinsified return value.
func ctrlGroupMatchH2(g ctrlGroup, h uintptr) bitset {
// NB: This generic matching routine produces false positive matches when // NB: This generic matching routine produces false positive matches when
// h is 2^N and the control bytes have a seq of 2^N followed by 2^N+1. For // h is 2^N and the control bytes have a seq of 2^N followed by 2^N+1. For
// example: if ctrls==0x0302 and h=02, we'll compute v as 0x0100. When we // example: if ctrls==0x0302 and h=02, we'll compute v as 0x0100. When we
@ -110,6 +167,14 @@ func (g ctrlGroup) matchH2(h uintptr) bitset {
// matchEmpty returns the set of slots in the group that are empty. // matchEmpty returns the set of slots in the group that are empty.
func (g ctrlGroup) matchEmpty() bitset { func (g ctrlGroup) matchEmpty() bitset {
return ctrlGroupMatchEmpty(g)
}
// Portable implementation of matchEmpty.
//
// Note: On AMD64, this is an intrinsic implemented with SIMD instructions. See
// note on bitset about the packed intrinsified return value.
func ctrlGroupMatchEmpty(g ctrlGroup) bitset {
// An empty slot is 1000 0000 // An empty slot is 1000 0000
// A deleted slot is 1111 1110 // A deleted slot is 1111 1110
// A full slot is 0??? ???? // A full slot is 0??? ????
@ -123,6 +188,14 @@ func (g ctrlGroup) matchEmpty() bitset {
// matchEmptyOrDeleted returns the set of slots in the group that are empty or // matchEmptyOrDeleted returns the set of slots in the group that are empty or
// deleted. // deleted.
func (g ctrlGroup) matchEmptyOrDeleted() bitset { func (g ctrlGroup) matchEmptyOrDeleted() bitset {
return ctrlGroupMatchEmptyOrDeleted(g)
}
// Portable implementation of matchEmptyOrDeleted.
//
// Note: On AMD64, this is an intrinsic implemented with SIMD instructions. See
// note on bitset about the packed intrinsified return value.
func ctrlGroupMatchEmptyOrDeleted(g ctrlGroup) bitset {
// An empty slot is 1000 0000 // An empty slot is 1000 0000
// A deleted slot is 1111 1110 // A deleted slot is 1111 1110
// A full slot is 0??? ???? // A full slot is 0??? ????
@ -134,6 +207,14 @@ func (g ctrlGroup) matchEmptyOrDeleted() bitset {
// matchFull returns the set of slots in the group that are full. // matchFull returns the set of slots in the group that are full.
func (g ctrlGroup) matchFull() bitset { func (g ctrlGroup) matchFull() bitset {
return ctrlGroupMatchFull(g)
}
// Portable implementation of matchFull.
//
// Note: On AMD64, this is an intrinsic implemented with SIMD instructions. See
// note on bitset about the packed intrinsified return value.
func ctrlGroupMatchFull(g ctrlGroup) bitset {
// An empty slot is 1000 0000 // An empty slot is 1000 0000
// A deleted slot is 1111 1110 // A deleted slot is 1111 1110
// A full slot is 0??? ???? // A full slot is 0??? ????

View file

@ -38,12 +38,12 @@ func runtime_mapaccess1_fast32(typ *abi.SwissMapType, m *Map, key uint32) unsafe
slotKey := g.key(typ, 0) slotKey := g.key(typ, 0)
slotSize := typ.SlotSize slotSize := typ.SlotSize
for full != 0 { for full != 0 {
if key == *(*uint32)(slotKey) && full&(1<<7) != 0 { if key == *(*uint32)(slotKey) && full.lowestSet() {
slotElem := unsafe.Pointer(uintptr(slotKey) + typ.ElemOff) slotElem := unsafe.Pointer(uintptr(slotKey) + typ.ElemOff)
return slotElem return slotElem
} }
slotKey = unsafe.Pointer(uintptr(slotKey) + slotSize) slotKey = unsafe.Pointer(uintptr(slotKey) + slotSize)
full >>= 8 full = full.shiftOutLowest()
} }
return unsafe.Pointer(&zeroVal[0]) return unsafe.Pointer(&zeroVal[0])
} }
@ -107,12 +107,12 @@ func runtime_mapaccess2_fast32(typ *abi.SwissMapType, m *Map, key uint32) (unsaf
slotKey := g.key(typ, 0) slotKey := g.key(typ, 0)
slotSize := typ.SlotSize slotSize := typ.SlotSize
for full != 0 { for full != 0 {
if key == *(*uint32)(slotKey) && full&(1<<7) != 0 { if key == *(*uint32)(slotKey) && full.lowestSet() {
slotElem := unsafe.Pointer(uintptr(slotKey) + typ.ElemOff) slotElem := unsafe.Pointer(uintptr(slotKey) + typ.ElemOff)
return slotElem, true return slotElem, true
} }
slotKey = unsafe.Pointer(uintptr(slotKey) + slotSize) slotKey = unsafe.Pointer(uintptr(slotKey) + slotSize)
full >>= 8 full = full.shiftOutLowest()
} }
return unsafe.Pointer(&zeroVal[0]), false return unsafe.Pointer(&zeroVal[0]), false
} }

View file

@ -38,12 +38,12 @@ func runtime_mapaccess1_fast64(typ *abi.SwissMapType, m *Map, key uint64) unsafe
slotKey := g.key(typ, 0) slotKey := g.key(typ, 0)
slotSize := typ.SlotSize slotSize := typ.SlotSize
for full != 0 { for full != 0 {
if key == *(*uint64)(slotKey) && full&(1<<7) != 0 { if key == *(*uint64)(slotKey) && full.lowestSet() {
slotElem := unsafe.Pointer(uintptr(slotKey) + 8) slotElem := unsafe.Pointer(uintptr(slotKey) + 8)
return slotElem return slotElem
} }
slotKey = unsafe.Pointer(uintptr(slotKey) + slotSize) slotKey = unsafe.Pointer(uintptr(slotKey) + slotSize)
full >>= 8 full = full.shiftOutLowest()
} }
return unsafe.Pointer(&zeroVal[0]) return unsafe.Pointer(&zeroVal[0])
} }
@ -107,12 +107,12 @@ func runtime_mapaccess2_fast64(typ *abi.SwissMapType, m *Map, key uint64) (unsaf
slotKey := g.key(typ, 0) slotKey := g.key(typ, 0)
slotSize := typ.SlotSize slotSize := typ.SlotSize
for full != 0 { for full != 0 {
if key == *(*uint64)(slotKey) && full&(1<<7) != 0 { if key == *(*uint64)(slotKey) && full.lowestSet() {
slotElem := unsafe.Pointer(uintptr(slotKey) + 8) slotElem := unsafe.Pointer(uintptr(slotKey) + 8)
return slotElem, true return slotElem, true
} }
slotKey = unsafe.Pointer(uintptr(slotKey) + slotSize) slotKey = unsafe.Pointer(uintptr(slotKey) + slotSize)
full >>= 8 full = full.shiftOutLowest()
} }
return unsafe.Pointer(&zeroVal[0]), false return unsafe.Pointer(&zeroVal[0]), false
} }