[dev.simd] internal/runtime/gc: add simd package based greentea kernels

This CL adds a new generator to internal/runtime/gc/scan that emits expander
kernels written with the Go simd package. It also adds a Go SIMD scan kernel
and a Go SIMD filter kernel.

This CL also includes the plumbing: the Go SIMD kernels are used when
goexperiment.simd is on.
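
For reference, the dispatch this CL adds reduces to the following (condensed
from the internal/runtime/gc/scan diff below; a sketch, not the exact source):

func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
	if CanAVX512() {
		if goexperiment.SIMD {
			// Go SIMD kernel added by this CL.
			return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)
		}
		// Hand-written assembly kernel, renamed with an Asm suffix.
		return ScanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask)
	}
	panic("not implemented")
}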

Benchmark results:
...
ScanSpanPacked/cache=tiny/pages=1/sizeclass=26/pct=80-88     354.8n ±  1%   272.4n ±  0%  -23.22% (p=0.002 n=6)
ScanSpanPacked/cache=tiny/pages=1/sizeclass=26/pct=90-88     375.7n ±  0%   287.1n ±  0%  -23.58% (p=0.002 n=6)
ScanSpanPacked/cache=tiny/pages=1/sizeclass=26/pct=100-88    450.0n ±  1%   327.4n ±  0%  -27.24% (p=0.002 n=6)
geomean                                                      246.5n         199.4n        -19.10%

Scan throughput improves by about 25%.

Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
Reviewed-on: https://go-review.googlesource.com/c/go/+/719520
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Junyang Shao 2025-03-09 17:19:48 +00:00
parent 3fdd183aef
commit da92168ec8
19 changed files with 5004 additions and 2650 deletions

View file

@ -140,7 +140,7 @@ func TestStmtLines(t *testing.T) {
var m float64
switch runtime.GOARCH {
case "amd64":
m = 0.0111 // > 98.89% obtained on amd64, no backsliding
m = 0.0112 // > 98.88% obtained on amd64, no backsliding
case "riscv64":
m = 0.03 // XXX temporary update threshold to 97% for regabi
default:

View file

@ -88,6 +88,7 @@ var depsRules = `
internal/strconv,
internal/trace/tracev2,
math/bits,
simd,
structs
< internal/bytealg
< internal/stringslite
@ -835,7 +836,8 @@ var depsRules = `
os,
reflect,
strings,
sync
sync,
regexp
< internal/runtime/gc/internal/gen;
regexp, internal/txtar, internal/trace, internal/trace/raw

File diff suppressed because it is too large.

View file

@ -11,9 +11,9 @@ import (
"testing"
)
func TestExpandAVX512(t *testing.T) {
func TestExpandAVX512Asm(t *testing.T) {
if !scan.CanAVX512() {
t.Skip("no AVX512")
}
testExpand(t, scan.ExpandAVX512)
testExpand(t, scan.ExpandAVX512Asm)
}

View file

@ -0,0 +1,19 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && goexperiment.simd
package scan_test
import (
"internal/runtime/gc/scan"
"testing"
)
func TestExpandAVX512(t *testing.T) {
if !scan.CanAVX512() {
t.Skip("no AVX512")
}
testExpand(t, scan.ExpandAVX512)
}

View file

@ -23,7 +23,7 @@ func testExpand(t *testing.T, expF expandFunc) {
for i := range want {
if got[i] != want[i] {
t.Errorf("expansion differs from reference at bit %d", i*goarch.PtrSize)
t.Errorf("expansion differs from reference at bit %d, sizeClass=%d", i*goarch.PtrSize, sizeClass)
if goarch.PtrSize == 4 {
t.Logf("got: %032b", got[i])
t.Logf("want: %032b", want[i])

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View file

@ -2,9 +2,13 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64
package scan
import "internal/runtime/gc"
import (
"internal/runtime/gc"
)
// ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked,
// where f is the word size of objects in sizeClass.
@ -12,11 +16,11 @@ import "internal/runtime/gc"
// This is a testing entrypoint to the expanders used by scanSpanPacked*.
//
//go:noescape
func ExpandAVX512(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask)
func ExpandAVX512Asm(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask)
// gcExpandersAVX512 is the PCs of expander functions. These cannot be called directly
// as they don't follow the Go ABI, but you can use this to check if a given
// expander PC is 0.
//
// It is defined in assembly.
var gcExpandersAVX512 [len(gc.SizeClassToSize)]uintptr
var gcExpandersAVX512Asm [len(gc.SizeClassToSize)]uintptr

View file

@ -0,0 +1,24 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build goexperiment.simd && amd64
package scan
import (
"internal/runtime/gc"
"simd"
"unsafe"
)
// ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked,
// where f is the word size of objects in sizeClass.
//
// This is a testing entrypoint to the expanders used by scanSpanPacked*.
func ExpandAVX512(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) {
v1, v2 := gcExpandersAVX512[sizeClass](unsafe.Pointer(packed))
v1.Store((*[8]uint64)(unsafe.Pointer(unpacked)))
v2.Store((*[8]uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(unpacked)) + 64)))
simd.ClearAVXUpperBits()
}
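
For intuition, here is a hypothetical scalar reference (not part of this CL) of
what the expanders compute: unpacked bit i is a copy of packed bit i/f, so each
packed bit becomes f consecutive unpacked bits.

// expandRef is a hypothetical scalar reference for the expander contract:
// bit j of packed becomes bits [j*f, j*f+f) of unpacked, where f is the
// object size in words. packed must cover len(unpacked)*64/f bits.
func expandRef(f int, packed, unpacked []uint64) {
	for i := 0; i < len(unpacked)*64; i++ {
		j := i / f // the packed bit that feeds unpacked bit i
		if packed[j/64]>>(j%64)&1 != 0 {
			unpacked[i/64] |= uint64(1) << (i % 64)
		}
	}
}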

View file

@ -22,7 +22,7 @@ import (
const header = "// Code generated by mkasm.go. DO NOT EDIT.\n\n"
func main() {
generate("expand_amd64.s", genExpanders)
generate("expanders_amd64.s", genExpanders)
}
func generate(fileName string, genFunc func(*gen.File)) {
@ -63,7 +63,7 @@ func genExpanders(file *gen.File) {
xf := int(ob) / 8
log.Printf("size class %d bytes, expansion %dx", ob, xf)
fn := gen.NewFunc(fmt.Sprintf("expandAVX512_%d<>", xf))
fn := gen.NewFunc(fmt.Sprintf("expandAVX512Asm_%d<>", xf))
ptrObjBits := gen.Arg[gen.Ptr[gen.Uint8x64]](fn)
if xf == 1 {
@ -79,7 +79,7 @@ func genExpanders(file *gen.File) {
}
// Generate table mapping size class to expander PC
file.AddConst("·gcExpandersAVX512", gcExpandersAVX512)
file.AddConst("·gcExpandersAVX512Asm", gcExpandersAVX512)
}
// mat8x8 is an 8x8 bit matrix.

View file

@ -0,0 +1,638 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file is a fork of mkasm.go. Instead of generating
// assembly code, it generates Go code that uses
// the simd package.
//go:build ignore
package main
import (
"bytes"
"fmt"
"go/format"
"log"
"os"
"slices"
"strconv"
"strings"
"text/template"
"unsafe"
"internal/runtime/gc"
)
var simdTemplate = template.Must(template.New("template").Parse(`
{{- define "header"}}
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build goexperiment.simd && amd64
package scan
import (
"simd"
"unsafe"
)
{{- end}}
{{- define "expandersList"}}
var gcExpandersAVX512 = [{{- len .}}]func(unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8){
{{- range .}}
{{.}},
{{- end}}
}
{{- end}}
{{- define "expanderData"}}
var {{.Name}} = [8]uint64{
{{.Vals}}
}
{{- end}}
{{- define "expander"}}
func {{.Name}}(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
{{- .BodyLoadString }}
{{- .BodyString }}
}
{{- end}}
`))
// expanderData is global data used by the expanders.
// They will be generated as global arrays.
type expanderData struct {
Name string // Name of the global array
Vals string // The values of the arrays, should already be formatted.
}
// expander is the expander function; it only operates on 3 kinds of values:
//
// uint8x64, mask8x64, uint64.
//
// And a limited set of operations.
type expander struct {
Name string // The name of the expander function
BodyLoad strings.Builder
Body strings.Builder // The actual expand computations, after loads.
data []expanderData
dataByVals map[string]string
uint8x64Cnt int
mask8x64Cnt int
uint64Cnt int
}
// Used by text/template.
// This is needed because text/template cannot call pointer receiver methods.
func (e expander) BodyLoadString() string {
return e.BodyLoad.String()
}
func (e expander) BodyString() string {
return e.Body.String()
}
// mat8x8 is an 8x8 bit matrix.
type mat8x8 struct {
mat [8]uint8
}
func matGroupToVec(mats *[8]mat8x8) [8]uint64 {
var out [8]uint64
for i, mat := range mats {
for j, row := range mat.mat {
// For some reason, Intel flips the rows.
out[i] |= uint64(row) << ((7 - j) * 8)
}
}
return out
}
func (fn *expander) newVec() string {
v := fmt.Sprintf("v%d", fn.uint8x64Cnt)
fn.uint8x64Cnt++
return v
}
func (fn *expander) newMask() string {
v := fmt.Sprintf("m%d", fn.mask8x64Cnt)
fn.mask8x64Cnt++
return v
}
func (fn *expander) newU() string {
v := fmt.Sprintf("u%d", fn.uint64Cnt)
fn.uint64Cnt++
return v
}
// expandIdentity implements 1x expansion (that is, no expansion).
func (fn *expander) expandIdentity() {
fn.Body.WriteString(`
x := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
y := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(src)+64))).AsUint8x64()
return x.AsUint64x8(), y.AsUint64x8()`)
}
func (fn *expander) loadSrcAsUint8x64() string {
v := fn.newVec()
fn.BodyLoad.WriteString(fmt.Sprintf("%s := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()\n", v))
return v
}
func (fn *expander) loadGlobalArrAsUint8x64(arrName string) string {
v := fn.newVec()
fn.BodyLoad.WriteString(fmt.Sprintf("%s := simd.LoadUint64x8(&%s).AsUint8x64()\n", v, arrName))
return v
}
func (fn *expander) permuteUint8x64(data, indices string) string {
v := fn.newVec()
fn.Body.WriteString(fmt.Sprintf("%s := %s.Permute(%s)\n", v, data, indices))
return v
}
func (fn *expander) permute2Uint8x64(x, y, indices string) string {
v := fn.newVec()
fn.Body.WriteString(fmt.Sprintf("%s := %s.ConcatPermute(%s, %s)\n", v, x, y, indices))
return v
}
func (fn *expander) permuteMaskedUint8x64(data, indices, mask string) string {
v := fn.newVec()
fn.Body.WriteString(fmt.Sprintf("%s := %s.Permute(%s).Masked(%s)\n", v, data, indices, mask))
return v
}
func (fn *expander) permute2MaskedUint8x64(x, y, indices, mask string) string {
v := fn.newVec()
fn.Body.WriteString(fmt.Sprintf("%s := %s.ConcatPermute(%s, %s).Masked(%s)\n", v, x, y, indices, mask))
return v
}
func (fn *expander) galoisFieldAffineTransformUint8x64(data, matrix string) string {
v := fn.newVec()
fn.Body.WriteString(fmt.Sprintf("%s := %s.GaloisFieldAffineTransform(%s.AsUint64x8(), 0)\n", v, data, matrix))
return v
}
func (fn *expander) returns(x, y string) {
fn.Body.WriteString(fmt.Sprintf("return %s.AsUint64x8(), %s.AsUint64x8()", x, y))
}
func uint8x64Data(data [64]uint8) string {
res := ""
for i := range 8 {
ptr64 := (*uint64)(unsafe.Pointer(&data[i*8]))
res += fmt.Sprintf("%#016x,", *ptr64)
if i == 3 {
res += "\n"
}
}
return res
}
func uint64x8Data(data [8]uint64) string {
res := ""
for i := range 8 {
res += fmt.Sprintf("%#016x,", data[i])
if i == 3 {
res += "\n"
}
}
return res
}
func (fn *expander) loadGlobalUint8x64(name string, data [64]uint8) string {
val := uint8x64Data(data)
if n, ok := fn.dataByVals[val]; !ok {
fullName := fmt.Sprintf("%s_%s", fn.Name, name)
fn.data = append(fn.data, expanderData{fullName, val})
v := fn.loadGlobalArrAsUint8x64(fullName)
fn.dataByVals[val] = v
return v
} else {
return n
}
}
func (fn *expander) loadGlobalUint64x8(name string, data [8]uint64) string {
val := uint64x8Data(data)
if n, ok := fn.dataByVals[val]; !ok {
fullName := fmt.Sprintf("%s_%s", fn.Name, name)
fn.data = append(fn.data, expanderData{fullName, val})
v := fn.loadGlobalArrAsUint8x64(fullName)
fn.dataByVals[val] = v
return v
} else {
return n
}
}
func (fn *expander) mask8x64FromBits(data uint64) string {
v1 := fn.newU()
v2 := fn.newMask()
fn.Body.WriteString(fmt.Sprintf("%s := uint64(%#x)\n%s := simd.Mask8x64FromBits(%s)\n",
v1, data, v2, v1))
return v2
}
func (fn *expander) orUint8x64(x, y string) string {
v := fn.newVec()
fn.Body.WriteString(fmt.Sprintf("%s := %s.Or(%s)\n", v, x, y))
return v
}
func main() {
generate("expanders_amd64.go", genExpanders)
}
func generate(fileName string, genFunc func(*bytes.Buffer)) {
var buf bytes.Buffer
genFunc(&buf)
f, err := os.Create(fileName)
if err != nil {
log.Fatal(err)
}
defer f.Close()
b, err := format.Source(buf.Bytes())
if err != nil {
log.Print(buf.String())
log.Fatal(err)
}
_, err = f.Write(b)
if err != nil {
log.Fatal(err)
}
}
func genExpanders(buffer *bytes.Buffer) {
if err := simdTemplate.ExecuteTemplate(buffer, "header", nil); err != nil {
panic(fmt.Errorf("failed to execute header template: %w", err))
}
gcExpandersAVX512 := make([]expander, len(gc.SizeClassToSize))
for sc, ob := range gc.SizeClassToSize {
if gc.SizeClassToNPages[sc] != 1 {
// These functions all produce a bitmap that covers exactly one
// page.
continue
}
if ob > gc.MinSizeForMallocHeader {
// This size class is too big to have a packed pointer/scalar bitmap.
break
}
xf := int(ob) / 8
log.Printf("size class %d bytes, expansion %dx", ob, xf)
fn := expander{Name: fmt.Sprintf("expandAVX512_%d", xf), dataByVals: make(map[string]string)}
if xf == 1 {
fn.expandIdentity()
} else {
ok := gfExpander(xf, &fn)
if !ok {
log.Printf("failed to generate expander for size class %d", sc)
}
}
gcExpandersAVX512[sc] = fn
}
// Fill in the expanders data first
eld := make([]string, len(gcExpandersAVX512))
for i, gce := range gcExpandersAVX512 {
if gce.Name == "" {
eld[i] = "nil"
} else {
eld[i] = gce.Name
}
}
if err := simdTemplate.ExecuteTemplate(buffer, "expandersList", eld); err != nil {
panic(fmt.Errorf("failed to execute expandersList template: %w", err))
}
// List out the expander functions and their data
for _, gce := range gcExpandersAVX512 {
if gce.Name == "" {
continue
}
for _, data := range gce.data {
if err := simdTemplate.ExecuteTemplate(buffer, "expanderData", data); err != nil {
panic(fmt.Errorf("failed to execute expanderData template: %w", err))
}
}
if err := simdTemplate.ExecuteTemplate(buffer, "expander", gce); err != nil {
panic(fmt.Errorf("failed to execute expander template: %w", err))
}
}
}
// gfExpander produces a function that expands each bit in an input bitmap into
// f consecutive bits in an output bitmap.
//
// The input is
//
// *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)
//
// The output is
//
// [64]uint8 = The bottom 512 bits of the expanded bitmap
// [64]uint8 = The top 512 bits of the expanded bitmap
func gfExpander(f int, fn *expander) bool {
// TODO(austin): For powers of 2 >= 8, we can use mask expansion ops to make this much simpler.
// TODO(austin): For f >= 8, I suspect there are better ways to do this.
//
// For example, we could use a mask expansion to get a full byte for each
// input bit, and separately create the bytes that blend adjacent bits, then
// shuffle those bytes together. Certainly for f >= 16 this makes sense
// because each of those bytes will be used, possibly more than once.
objBits := fn.loadSrcAsUint8x64()
type term struct {
iByte, oByte int
mat mat8x8
}
var terms []term
// Iterate over all output bytes and construct the 8x8 GF2 matrix to compute
// the output byte from the appropriate input byte. Gather all of these into
// "terms".
for oByte := 0; oByte < 1024/8; oByte++ {
var byteMat mat8x8
iByte := -1
for oBit := oByte * 8; oBit < oByte*8+8; oBit++ {
iBit := oBit / f
if iByte == -1 {
iByte = iBit / 8
} else if iByte != iBit/8 {
log.Printf("output byte %d straddles input bytes %d and %d", oByte, iByte, iBit/8)
return false
}
// One way to view this is that the i'th row of the matrix will be
// ANDed with the input byte, and the parity of the result will set
// the i'th bit in the output. We use a simple 1 bit mask, so the
// parity is irrelevant beyond selecting out that one bit.
byteMat.mat[oBit%8] = 1 << (iBit % 8)
}
terms = append(terms, term{iByte, oByte, byteMat})
}
if false {
// Print input byte -> output byte as a matrix
maxIByte, maxOByte := 0, 0
for _, term := range terms {
maxIByte = max(maxIByte, term.iByte)
maxOByte = max(maxOByte, term.oByte)
}
iToO := make([][]rune, maxIByte+1)
for i := range iToO {
iToO[i] = make([]rune, maxOByte+1)
}
matMap := make(map[mat8x8]int)
for _, term := range terms {
i, ok := matMap[term.mat]
if !ok {
i = len(matMap)
matMap[term.mat] = i
}
iToO[term.iByte][term.oByte] = 'A' + rune(i)
}
for o := range maxOByte + 1 {
fmt.Printf("%d", o)
for i := range maxIByte + 1 {
fmt.Printf(",")
if mat := iToO[i][o]; mat != 0 {
fmt.Printf("%c", mat)
}
}
fmt.Println()
}
}
// In hardware, each (8 byte) matrix applies to 8 bytes of data in parallel,
// and we get to operate on up to 8 matrixes in parallel (or 64 values). That is:
//
// abcdefgh ijklmnop qrstuvwx yzABCDEF GHIJKLMN OPQRSTUV WXYZ0123 456789_+
// mat0 mat1 mat2 mat3 mat4 mat5 mat6 mat7
// Group the terms by matrix, but limit each group to 8 terms.
const termsPerGroup = 8 // Number of terms we can multiply by the same matrix.
const groupsPerSuperGroup = 8 // Number of matrixes we can fit in a vector.
matMap := make(map[mat8x8]int)
allMats := make(map[mat8x8]bool)
var termGroups [][]term
for _, term := range terms {
allMats[term.mat] = true
i, ok := matMap[term.mat]
if ok && f > groupsPerSuperGroup {
// The output is ultimately produced in two [64]uint8 registers.
// Getting every byte in the right place of each of these requires a
// final permutation that often requires more than one source.
//
// Up to 8x expansion, we can get a really nice grouping so we can use
// the same 8 matrix vector several times, without producing
// permutations that require more than two sources.
//
// Above 8x, however, we can't get nice matrixes anyway, so we
// instead prefer reducing the complexity of the permutations we
// need to produce the final outputs. To do this, avoid grouping
// together terms that are split across the two registers.
outRegister := termGroups[i][0].oByte / 64
if term.oByte/64 != outRegister {
ok = false
}
}
if !ok {
// Start a new term group.
i = len(termGroups)
matMap[term.mat] = i
termGroups = append(termGroups, nil)
}
termGroups[i] = append(termGroups[i], term)
if len(termGroups[i]) == termsPerGroup {
// This term group is full.
delete(matMap, term.mat)
}
}
for i, termGroup := range termGroups {
log.Printf("term group %d:", i)
for _, term := range termGroup {
log.Printf(" %+v", term)
}
}
// We can do 8 matrix multiplies in parallel, which is 8 term groups. Pack
// as many term groups as we can into each super-group to minimize the
// number of matrix multiplies.
//
// Ideally, we use the same matrix in each super-group, which might mean
// doing fewer than 8 multiplies at a time. That's fine because it never
// increases the total number of matrix multiplies.
//
// TODO: Packing the matrixes less densely may let us use more broadcast
// loads instead of general permutations, though. That replaces a load of
// the permutation with a load of the matrix, but is probably still slightly
// better.
var sgSize, nSuperGroups int
oneMatVec := f <= groupsPerSuperGroup
if oneMatVec {
// We can use the same matrix in each multiply by doing sgSize
// multiplies at a time.
sgSize = groupsPerSuperGroup / len(allMats) * len(allMats)
nSuperGroups = (len(termGroups) + sgSize - 1) / sgSize
} else {
// We can't use the same matrix for each multiply. Just do as many at a
// time as we can.
//
// TODO: This is going to produce several distinct matrixes, when we
// probably only need two. Be smarter about how we create super-groups
// in this case. Maybe we build up an array of super-groups and then the
// loop below just turns them into ops?
sgSize = 8
nSuperGroups = (len(termGroups) + groupsPerSuperGroup - 1) / groupsPerSuperGroup
}
// Construct each super-group.
var matGroup [8]mat8x8
var matMuls []string
var perm [128]int
for sgi := range nSuperGroups {
var iperm [64]uint8
for i := range iperm {
iperm[i] = 0xff // "Don't care"
}
// Pick off sgSize term groups.
superGroup := termGroups[:min(len(termGroups), sgSize)]
termGroups = termGroups[len(superGroup):]
// Build the matrix and permutations for this super-group.
var thisMatGroup [8]mat8x8
for i, termGroup := range superGroup {
// All terms in this group have the same matrix. Pick one.
thisMatGroup[i] = termGroup[0].mat
for j, term := range termGroup {
// Build the input permutation.
iperm[i*termsPerGroup+j] = uint8(term.iByte)
// Build the output permutation.
perm[term.oByte] = sgi*groupsPerSuperGroup*termsPerGroup + i*termsPerGroup + j
}
}
log.Printf("input permutation %d: %v", sgi, iperm)
// Check that we're not making more distinct matrixes than expected.
if oneMatVec {
if sgi == 0 {
matGroup = thisMatGroup
} else if matGroup != thisMatGroup {
log.Printf("super-groups have different matrixes:\n%+v\n%+v", matGroup, thisMatGroup)
return false
}
}
// Emit matrix op.
matConst :=
fn.loadGlobalUint64x8(fmt.Sprintf("mat%d", sgi),
matGroupToVec(&thisMatGroup))
inShufConst :=
fn.loadGlobalUint8x64(fmt.Sprintf("inShuf%d", sgi),
iperm)
inOp := fn.permuteUint8x64(objBits, inShufConst)
matMul := fn.galoisFieldAffineTransformUint8x64(inOp, matConst)
matMuls = append(matMuls, matMul)
}
log.Printf("output permutation: %v", perm)
outLo, ok := genShuffle(fn, "outShufLo", (*[64]int)(perm[:64]), matMuls...)
if !ok {
log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
return false
}
outHi, ok := genShuffle(fn, "outShufHi", (*[64]int)(perm[64:]), matMuls...)
if !ok {
log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
return false
}
fn.returns(outLo, outHi)
return true
}
func genShuffle(fn *expander, name string, perm *[64]int, args ...string) (string, bool) {
// Construct flattened permutation.
var vperm [64]byte
// Get the inputs used by this permutation.
var inputs []int
for i, src := range perm {
inputIdx := slices.Index(inputs, src/64)
if inputIdx == -1 {
inputIdx = len(inputs)
inputs = append(inputs, src/64)
}
vperm[i] = byte(src%64 | (inputIdx << 6))
}
// Emit instructions for easy cases.
switch len(inputs) {
case 1:
constOp := fn.loadGlobalUint8x64(name, vperm)
return fn.permuteUint8x64(args[inputs[0]], constOp), true
case 2:
constOp := fn.loadGlobalUint8x64(name, vperm)
return fn.permute2Uint8x64(args[inputs[0]], args[inputs[1]], constOp), true
}
// Harder case, we need to shuffle in from up to 2 more tables.
//
// Perform two shuffles. One shuffle will get its data from the first
// two inputs, the other shuffle will get its data from the other one
// or two inputs. Any values they don't care about will
// be zeroed.
var vperms [2][64]byte
var masks [2]uint64
for j, idx := range vperm {
for i := range vperms {
vperms[i][j] = 0xff // "Don't care"
}
if idx == 0xff {
continue
}
vperms[idx/128][j] = idx % 128
masks[idx/128] |= uint64(1) << j
}
// Validate that the masks are fully disjoint.
if masks[0]^masks[1] != ^uint64(0) {
panic("bad shuffle!")
}
// Generate constants.
constOps := make([]string, len(vperms))
for i, v := range vperms {
constOps[i] = fn.loadGlobalUint8x64(name+strconv.Itoa(i), v)
}
// Generate shuffles.
switch len(inputs) {
case 3:
r0 := fn.permute2MaskedUint8x64(args[inputs[0]], args[inputs[1]], constOps[0], fn.mask8x64FromBits(masks[0]))
r1 := fn.permuteMaskedUint8x64(args[inputs[2]], constOps[1], fn.mask8x64FromBits(masks[1]))
return fn.orUint8x64(r0, r1), true
case 4:
r0 := fn.permute2MaskedUint8x64(args[inputs[0]], args[inputs[1]], constOps[0], fn.mask8x64FromBits(masks[0]))
r1 := fn.permute2MaskedUint8x64(args[inputs[2]], args[inputs[3]], constOps[1], fn.mask8x64FromBits(masks[1]))
return fn.orUint8x64(r0, r1), true
}
// Too many inputs. To support more, we'd need to separate tables much earlier.
// Right now all the indices fit in a byte, but with >4 inputs they might not (>256 bytes).
return args[0], false
}
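
As a scalar model of the matrix step the generator leans on (hypothetical, for
illustration only): GaloisFieldAffineTransform applies an 8x8 bit matrix to each
input byte, where output bit i is the parity of matrix row i ANDed with the input
byte. With the single-bit rows built above (row oBit%8 = 1 << (iBit%8)), that
parity simply copies input bit iBit%8 into output bit oBit%8. Assumes "math/bits"
is imported.

// applyMat8x8 models one byte of the affine transform:
// output bit i = parity(matrix row i AND input byte).
func applyMat8x8(mat [8]uint8, in uint8) (out uint8) {
	for i := 0; i < 8; i++ {
		if bits.OnesCount8(mat[i]&in)&1 == 1 {
			out |= 1 << i
		}
	}
	return out
}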

View file

@ -6,13 +6,25 @@ package scan
import (
"internal/cpu"
"internal/goexperiment"
"internal/runtime/gc"
"unsafe"
)
func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
if CanAVX512() {
return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)
if goexperiment.SIMD {
return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)
} else {
return ScanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask)
}
}
panic("not implemented")
}
func ScanSpanPackedAsm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
if CanAVX512() {
return ScanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask)
}
panic("not implemented")
}
@ -27,12 +39,12 @@ func CanAVX512() bool {
return avx512ScanPackedReqsMet
}
func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
return FilterNil(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
func ScanSpanPackedAVX512Asm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
return FilterNil(bufp, scanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask))
}
//go:noescape
func scanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
func scanSpanPackedAVX512Asm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
var avx512ScanPackedReqsMet = cpu.X86.HasAVX512VL &&
cpu.X86.HasAVX512BW &&

View file

@ -6,12 +6,12 @@
#include "textflag.h"
// Test-only.
TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
TEXT ·ExpandAVX512Asm(SB), NOSPLIT, $0-24
MOVQ sizeClass+0(FP), CX
MOVQ packed+8(FP), AX
// Call the expander for this size class
LEAQ ·gcExpandersAVX512(SB), BX
LEAQ ·gcExpandersAVX512Asm(SB), BX
CALL (BX)(CX*8)
MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
@ -20,11 +20,11 @@ TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
VZEROUPPER
RET
TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
TEXT ·scanSpanPackedAVX512Asm(SB), NOSPLIT, $256-44
// Z1+Z2 = Expand the grey object mask into a grey word mask
MOVQ objMarks+16(FP), AX
MOVQ sizeClass+24(FP), CX
LEAQ ·gcExpandersAVX512(SB), BX
LEAQ ·gcExpandersAVX512Asm(SB), BX
CALL (BX)(CX*8)
// Z3+Z4 = Load the pointer mask

View file

@ -11,6 +11,13 @@ import (
"testing"
)
func TestScanSpanPackedAVX512Asm(t *testing.T) {
if !scan.CanAVX512() {
t.Skip("no AVX512")
}
testScanSpanPacked(t, scan.ScanSpanPackedAVX512Asm)
}
func TestScanSpanPackedAVX512(t *testing.T) {
if !scan.CanAVX512() {
t.Skip("no AVX512")

View file

@ -21,3 +21,6 @@ func HasFastScanSpanPacked() bool {
func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)
}
func ScanSpanPackedAsm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
panic("not implemented")
}

View file

@ -0,0 +1,16 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !goexperiment.simd
package scan
import (
"internal/runtime/gc"
"unsafe"
)
func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
panic("not implemented")
}

View file

@ -0,0 +1,92 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build goexperiment.simd && amd64
package scan
import (
"internal/abi"
"internal/runtime/gc"
"math/bits"
"simd"
"unsafe"
)
func FilterNilAVX512(bufp *uintptr, n int32) (cnt int32) {
scanned := 0
buf := unsafe.Slice((*uint64)(unsafe.Pointer(bufp)), int(n))
// Use the widest vector
var zeros simd.Uint64x8
for ; scanned+8 <= int(n); scanned += 8 {
v := simd.LoadUint64x8Slice(buf[scanned:])
m := v.NotEqual(zeros)
v.Compress(m).StoreSlice(buf[cnt:])
// Count the mask bits
mbits := uint64(m.ToBits())
mbits &= 0xFF // Only the lower 8 bits are meaningful.
nonNilCnt := bits.OnesCount64(mbits)
cnt += int32(nonNilCnt)
}
// Scalar code to clean up tails.
for i := scanned; i < int(n); i++ {
if buf[i] != 0 {
buf[cnt] = buf[i]
cnt++
}
}
return
}
func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
return FilterNilAVX512(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
}
func scanSpanPackedAVX512(mem unsafe.Pointer, buf *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
// Expand the grey object mask into a grey word mask
m1, m2 := gcExpandersAVX512[sizeClass](abi.NoEscape(unsafe.Pointer(objMarks)))
// Load the pointer mask
ptrm := unsafe.Pointer(ptrMask)
m3 := simd.LoadUint64x8((*[8]uint64)(ptrm))
m4 := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(ptrm) + 64)))
masks := [128]uint8{}
counts := [128]uint8{}
// Combine the grey word mask with the pointer mask to get the scan mask
m1m3 := m1.And(m3).AsUint8x64()
m2m4 := m2.And(m4).AsUint8x64()
m1m3.Store((*[64]uint8)(unsafe.Pointer(&masks[0])))
m2m4.Store((*[64]uint8)(unsafe.Pointer(&masks[64])))
// Now each bit of m1m3 and m2m4 represents one word of the span.
// Thus, each byte covers 64 bytes of memory, which is also how
// much we can fit in a ZMM register.
//
// We do a load/compress for each 64 byte frame.
//
// counts = Number of memory words to scan in each 64 byte frame
// TODO: Right now the type casting is done via memory; is it possible to
// work around these stores and loads and keep the values in registers?
m1m3.OnesCount().Store((*[64]uint8)(unsafe.Pointer(&counts[0])))
m2m4.OnesCount().Store((*[64]uint8)(unsafe.Pointer(&counts[64])))
// Loop over the 64 byte frames in this span.
// TODO: is there a way to PCALIGN this loop?
for i := range 128 {
mv := masks[i]
// Skip empty frames.
if mv == 0 {
continue
}
// Load the 64 byte frame.
m := simd.Mask64x8FromBits(mv)
ptrs := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(mem) + uintptr(i*64))))
// Collect just the pointers from the greyed objects into the scan buffer,
// i.e., copy the words selected by the mask into contiguous memory.
ptrs.Compress(m).Store((*[8]uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(buf)) + uintptr(count*8))))
// Advance the scan buffer position by the number of pointers.
count += int32(counts[i])
}
simd.ClearAVXUpperBits()
return
}
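
For reference, a hypothetical scalar equivalent of FilterNilAVX512 (the compress
loop above computes the same thing, eight words at a time): keep only the
non-nil pointers in buf[:n], packed to the front, and return how many remain.

// filterNilRef is a hypothetical scalar equivalent of FilterNilAVX512.
func filterNilRef(buf []uintptr, n int32) (cnt int32) {
	for i := int32(0); i < n; i++ {
		if buf[i] != 0 {
			buf[cnt] = buf[i]
			cnt++
		}
	}
	return cnt
}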

View file

@ -204,6 +204,13 @@ func benchmarkScanSpanPacked(b *testing.B, nPages int, sizeClass int) {
scan.ScanSpanPacked(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
}
})
b.Run("impl=PlatformAsm", func(b *testing.B) {
b.SetBytes(avgBytes)
for i := range b.N {
page := pageOrder[i%len(pageOrder)]
scan.ScanSpanPackedAsm(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
}
})
}
})
}