[dev.simd] internal/runtime/gc: add simd package based greentea kernels

This CL adds a new generator to internal/runtime/gc/scan that emits expander
kernels written with the Go simd package. It also adds a Go SIMD scan kernel
and a Go SIMD filter kernel.

This CL also includes the plumbing: the Go SIMD kernels are used when
goexperiment.simd is on.
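
For reference, the dispatch this CL adds reduces to the following (condensed
from the internal/runtime/gc/scan diff below; a sketch, not the exact source):

func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
	if CanAVX512() {
		if goexperiment.SIMD {
			// Go SIMD kernel added by this CL.
			return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)
		}
		// Hand-written assembly kernel, renamed with an Asm suffix.
		return ScanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask)
	}
	panic("not implemented")
}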

Benchmark results:
...
ScanSpanPacked/cache=tiny/pages=1/sizeclass=26/pct=80-88     354.8n ±  1%   272.4n ±  0%  -23.22% (p=0.002 n=6)
ScanSpanPacked/cache=tiny/pages=1/sizeclass=26/pct=90-88     375.7n ±  0%   287.1n ±  0%  -23.58% (p=0.002 n=6)
ScanSpanPacked/cache=tiny/pages=1/sizeclass=26/pct=100-88    450.0n ±  1%   327.4n ±  0%  -27.24% (p=0.002 n=6)
geomean                                                      246.5n         199.4n        -19.10%

Scan throughput improves by about 25%.

Change-Id: Ib85e01b7de18181db9e7b6026863209a993aa85f
Reviewed-on: https://go-review.googlesource.com/c/go/+/719520
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Junyang Shao 2025-03-09 17:19:48 +00:00
parent 3fdd183aef
commit da92168ec8
19 changed files with 5004 additions and 2650 deletions

View file

@ -140,7 +140,7 @@ func TestStmtLines(t *testing.T) {
var m float64
switch runtime.GOARCH {
case "amd64":
m = 0.0111 // > 98.89% obtained on amd64, no backsliding
m = 0.0112 // > 98.88% obtained on amd64, no backsliding
case "riscv64":
m = 0.03 // XXX temporary update threshold to 97% for regabi
default:

View file

@ -88,6 +88,7 @@ var depsRules = `
internal/strconv,
internal/trace/tracev2,
math/bits,
simd,
structs
< internal/bytealg
< internal/stringslite
@ -835,7 +836,8 @@ var depsRules = `
os,
reflect,
strings,
sync
sync,
regexp
< internal/runtime/gc/internal/gen;
regexp, internal/txtar, internal/trace, internal/trace/raw

File diff suppressed because it is too large.

View file

@ -11,9 +11,9 @@ import (
"testing"
)
func TestExpandAVX512(t *testing.T) {
func TestExpandAVX512Asm(t *testing.T) {
if !scan.CanAVX512() {
t.Skip("no AVX512")
}
testExpand(t, scan.ExpandAVX512)
testExpand(t, scan.ExpandAVX512Asm)
}

View file

@ -0,0 +1,19 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && goexperiment.simd
package scan_test
import (
"internal/runtime/gc/scan"
"testing"
)
func TestExpandAVX512(t *testing.T) {
if !scan.CanAVX512() {
t.Skip("no AVX512")
}
testExpand(t, scan.ExpandAVX512)
}

View file

@ -23,7 +23,7 @@ func testExpand(t *testing.T, expF expandFunc) {
for i := range want {
if got[i] != want[i] {
t.Errorf("expansion differs from reference at bit %d", i*goarch.PtrSize)
t.Errorf("expansion differs from reference at bit %d, sizeClass=%d", i*goarch.PtrSize, sizeClass)
if goarch.PtrSize == 4 {
t.Logf("got: %032b", got[i])
t.Logf("want: %032b", want[i])

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View file

@ -2,9 +2,13 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64
package scan
import "internal/runtime/gc"
import (
"internal/runtime/gc"
)
// ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked,
// where f is the word size of objects in sizeClass.
@ -12,11 +16,11 @@ import "internal/runtime/gc"
// This is a testing entrypoint to the expanders used by scanSpanPacked*.
//
//go:noescape
func ExpandAVX512(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask)
func ExpandAVX512Asm(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask)
// gcExpandersAVX512 is the PCs of expander functions. These cannot be called directly
// as they don't follow the Go ABI, but you can use this to check if a given
// expander PC is 0.
//
// It is defined in assembly.
var gcExpandersAVX512 [len(gc.SizeClassToSize)]uintptr
var gcExpandersAVX512Asm [len(gc.SizeClassToSize)]uintptr

View file

@ -0,0 +1,24 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build goexperiment.simd && amd64
package scan
import (
"internal/runtime/gc"
"simd"
"unsafe"
)
// ExpandAVX512 expands each bit in packed into f consecutive bits in unpacked,
// where f is the word size of objects in sizeClass.
//
// This is a testing entrypoint to the expanders used by scanSpanPacked*.
func ExpandAVX512(sizeClass int, packed *gc.ObjMask, unpacked *gc.PtrMask) {
v1, v2 := gcExpandersAVX512[sizeClass](unsafe.Pointer(packed))
v1.Store((*[8]uint64)(unsafe.Pointer(unpacked)))
v2.Store((*[8]uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(unpacked)) + 64)))
simd.ClearAVXUpperBits()
}
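
For intuition, here is a hypothetical scalar reference (not part of this CL) of
what the expanders compute: unpacked bit i is a copy of packed bit i/f, so each
packed bit becomes f consecutive unpacked bits.

// expandRef is a hypothetical scalar reference for the expander contract:
// bit j of packed becomes bits [j*f, j*f+f) of unpacked, where f is the
// object size in words. packed must cover len(unpacked)*64/f bits.
func expandRef(f int, packed, unpacked []uint64) {
	for i := 0; i < len(unpacked)*64; i++ {
		j := i / f // the packed bit that feeds unpacked bit i
		if packed[j/64]>>(j%64)&1 != 0 {
			unpacked[i/64] |= uint64(1) << (i % 64)
		}
	}
}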

View file

@ -22,7 +22,7 @@ import (
const header = "// Code generated by mkasm.go. DO NOT EDIT.\n\n"
func main() {
generate("expand_amd64.s", genExpanders)
generate("expanders_amd64.s", genExpanders)
}
func generate(fileName string, genFunc func(*gen.File)) {
@ -63,7 +63,7 @@ func genExpanders(file *gen.File) {
xf := int(ob) / 8
log.Printf("size class %d bytes, expansion %dx", ob, xf)
fn := gen.NewFunc(fmt.Sprintf("expandAVX512_%d<>", xf))
fn := gen.NewFunc(fmt.Sprintf("expandAVX512Asm_%d<>", xf))
ptrObjBits := gen.Arg[gen.Ptr[gen.Uint8x64]](fn)
if xf == 1 {
@ -79,7 +79,7 @@ func genExpanders(file *gen.File) {
}
// Generate table mapping size class to expander PC
file.AddConst("·gcExpandersAVX512", gcExpandersAVX512)
file.AddConst("·gcExpandersAVX512Asm", gcExpandersAVX512)
}
// mat8x8 is an 8x8 bit matrix.

View file

@ -0,0 +1,638 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file is a fork of mkasm.go. Instead of generating
// assembly code, it generates Go code that uses
// the simd package.
//go:build ignore
package main
import (
"bytes"
"fmt"
"go/format"
"log"
"os"
"slices"
"strconv"
"strings"
"text/template"
"unsafe"
"internal/runtime/gc"
)
var simdTemplate = template.Must(template.New("template").Parse(`
{{- define "header"}}
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build goexperiment.simd && amd64
package scan
import (
"simd"
"unsafe"
)
{{- end}}
{{- define "expandersList"}}
var gcExpandersAVX512 = [{{- len .}}]func(unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8){
{{- range .}}
{{.}},
{{- end}}
}
{{- end}}
{{- define "expanderData"}}
var {{.Name}} = [8]uint64{
{{.Vals}}
}
{{- end}}
{{- define "expander"}}
func {{.Name}}(src unsafe.Pointer) (simd.Uint64x8, simd.Uint64x8) {
{{- .BodyLoadString }}
{{- .BodyString }}
}
{{- end}}
`))
// expanderData is global data used by the expanders.
// They will be generated as global arrays.
type expanderData struct {
Name string // Name of the global array
Vals string // The values of the arrays, should already be formatted.
}
// expander is the expander function; it only operates on 3 kinds of values:
//
// uint8x64, mask8x64, uint64.
//
// And a limited set of operations.
type expander struct {
Name string // The name of the expander function
BodyLoad strings.Builder
Body strings.Builder // The actual expand computations, after loads.
data []expanderData
dataByVals map[string]string
uint8x64Cnt int
mask8x64Cnt int
uint64Cnt int
}
// Used by text/template.
// This is needed because text/template cannot call pointer receiver methods.
func (e expander) BodyLoadString() string {
return e.BodyLoad.String()
}
func (e expander) BodyString() string {
return e.Body.String()
}
// mat8x8 is an 8x8 bit matrix.
type mat8x8 struct {
mat [8]uint8
}
func matGroupToVec(mats *[8]mat8x8) [8]uint64 {
var out [8]uint64
for i, mat := range mats {
for j, row := range mat.mat {
// For some reason, Intel flips the rows.
out[i] |= uint64(row) << ((7 - j) * 8)
}
}
return out
}
func (fn *expander) newVec() string {
v := fmt.Sprintf("v%d", fn.uint8x64Cnt)
fn.uint8x64Cnt++
return v
}
func (fn *expander) newMask() string {
v := fmt.Sprintf("m%d", fn.mask8x64Cnt)
fn.mask8x64Cnt++
return v
}
func (fn *expander) newU() string {
v := fmt.Sprintf("u%d", fn.uint64Cnt)
fn.uint64Cnt++
return v
}
// expandIdentity implements 1x expansion (that is, no expansion).
func (fn *expander) expandIdentity() {
fn.Body.WriteString(`
x := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()
y := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(src)+64))).AsUint8x64()
return x.AsUint64x8(), y.AsUint64x8()`)
}
func (fn *expander) loadSrcAsUint8x64() string {
v := fn.newVec()
fn.BodyLoad.WriteString(fmt.Sprintf("%s := simd.LoadUint64x8((*[8]uint64)(src)).AsUint8x64()\n", v))
return v
}
func (fn *expander) loadGlobalArrAsUint8x64(arrName string) string {
v := fn.newVec()
fn.BodyLoad.WriteString(fmt.Sprintf("%s := simd.LoadUint64x8(&%s).AsUint8x64()\n", v, arrName))
return v
}
func (fn *expander) permuteUint8x64(data, indices string) string {
v := fn.newVec()
fn.Body.WriteString(fmt.Sprintf("%s := %s.Permute(%s)\n", v, data, indices))
return v
}
func (fn *expander) permute2Uint8x64(x, y, indices string) string {
v := fn.newVec()
fn.Body.WriteString(fmt.Sprintf("%s := %s.ConcatPermute(%s, %s)\n", v, x, y, indices))
return v
}
func (fn *expander) permuteMaskedUint8x64(data, indices, mask string) string {
v := fn.newVec()
fn.Body.WriteString(fmt.Sprintf("%s := %s.Permute(%s).Masked(%s)\n", v, data, indices, mask))
return v
}
func (fn *expander) permute2MaskedUint8x64(x, y, indices, mask string) string {
v := fn.newVec()
fn.Body.WriteString(fmt.Sprintf("%s := %s.ConcatPermute(%s, %s).Masked(%s)\n", v, x, y, indices, mask))
return v
}
func (fn *expander) galoisFieldAffineTransformUint8x64(data, matrix string) string {
v := fn.newVec()
fn.Body.WriteString(fmt.Sprintf("%s := %s.GaloisFieldAffineTransform(%s.AsUint64x8(), 0)\n", v, data, matrix))
return v
}
func (fn *expander) returns(x, y string) {
fn.Body.WriteString(fmt.Sprintf("return %s.AsUint64x8(), %s.AsUint64x8()", x, y))
}
func uint8x64Data(data [64]uint8) string {
res := ""
for i := range 8 {
ptr64 := (*uint64)(unsafe.Pointer(&data[i*8]))
res += fmt.Sprintf("%#016x,", *ptr64)
if i == 3 {
res += "\n"
}
}
return res
}
func uint64x8Data(data [8]uint64) string {
res := ""
for i := range 8 {
res += fmt.Sprintf("%#016x,", data[i])
if i == 3 {
res += "\n"
}
}
return res
}
func (fn *expander) loadGlobalUint8x64(name string, data [64]uint8) string {
val := uint8x64Data(data)
if n, ok := fn.dataByVals[val]; !ok {
fullName := fmt.Sprintf("%s_%s", fn.Name, name)
fn.data = append(fn.data, expanderData{fullName, val})
v := fn.loadGlobalArrAsUint8x64(fullName)
fn.dataByVals[val] = v
return v
} else {
return n
}
}
func (fn *expander) loadGlobalUint64x8(name string, data [8]uint64) string {
val := uint64x8Data(data)
if n, ok := fn.dataByVals[val]; !ok {
fullName := fmt.Sprintf("%s_%s", fn.Name, name)
fn.data = append(fn.data, expanderData{fullName, val})
v := fn.loadGlobalArrAsUint8x64(fullName)
fn.dataByVals[val] = v
return v
} else {
return n
}
}
func (fn *expander) mask8x64FromBits(data uint64) string {
v1 := fn.newU()
v2 := fn.newMask()
fn.Body.WriteString(fmt.Sprintf("%s := uint64(%#x)\n%s := simd.Mask8x64FromBits(%s)\n",
v1, data, v2, v1))
return v2
}
func (fn *expander) orUint8x64(x, y string) string {
v := fn.newVec()
fn.Body.WriteString(fmt.Sprintf("%s := %s.Or(%s)\n", v, x, y))
return v
}
func main() {
generate("expanders_amd64.go", genExpanders)
}
func generate(fileName string, genFunc func(*bytes.Buffer)) {
var buf bytes.Buffer
genFunc(&buf)
f, err := os.Create(fileName)
if err != nil {
log.Fatal(err)
}
defer f.Close()
b, err := format.Source(buf.Bytes())
if err != nil {
log.Print(buf.String())
log.Fatal(err)
}
_, err = f.Write(b)
if err != nil {
log.Fatal(err)
}
}
func genExpanders(buffer *bytes.Buffer) {
if err := simdTemplate.ExecuteTemplate(buffer, "header", nil); err != nil {
panic(fmt.Errorf("failed to execute header template: %w", err))
}
gcExpandersAVX512 := make([]expander, len(gc.SizeClassToSize))
for sc, ob := range gc.SizeClassToSize {
if gc.SizeClassToNPages[sc] != 1 {
// These functions all produce a bitmap that covers exactly one
// page.
continue
}
if ob > gc.MinSizeForMallocHeader {
// This size class is too big to have a packed pointer/scalar bitmap.
break
}
xf := int(ob) / 8
log.Printf("size class %d bytes, expansion %dx", ob, xf)
fn := expander{Name: fmt.Sprintf("expandAVX512_%d", xf), dataByVals: make(map[string]string)}
if xf == 1 {
fn.expandIdentity()
} else {
ok := gfExpander(xf, &fn)
if !ok {
log.Printf("failed to generate expander for size class %d", sc)
}
}
gcExpandersAVX512[sc] = fn
}
// Fill in the expanders data first
eld := make([]string, len(gcExpandersAVX512))
for i, gce := range gcExpandersAVX512 {
if gce.Name == "" {
eld[i] = "nil"
} else {
eld[i] = gce.Name
}
}
if err := simdTemplate.ExecuteTemplate(buffer, "expandersList", eld); err != nil {
panic(fmt.Errorf("failed to execute expandersList template: %w", err))
}
// List out the expander functions and their data
for _, gce := range gcExpandersAVX512 {
if gce.Name == "" {
continue
}
for _, data := range gce.data {
if err := simdTemplate.ExecuteTemplate(buffer, "expanderData", data); err != nil {
panic(fmt.Errorf("failed to execute expanderData template: %w", err))
}
}
if err := simdTemplate.ExecuteTemplate(buffer, "expander", gce); err != nil {
panic(fmt.Errorf("failed to execute expander template: %w", err))
}
}
}
// gfExpander produces a function that expands each bit in an input bitmap into
// f consecutive bits in an output bitmap.
//
// The input is
//
// *[8]uint64 = A pointer to floor(1024/f) bits (f >= 2, so at most 512 bits)
//
// The output is
//
// [64]uint8 = The bottom 512 bits of the expanded bitmap
// [64]uint8 = The top 512 bits of the expanded bitmap
func gfExpander(f int, fn *expander) bool {
// TODO(austin): For powers of 2 >= 8, we can use mask expansion ops to make this much simpler.
// TODO(austin): For f >= 8, I suspect there are better ways to do this.
//
// For example, we could use a mask expansion to get a full byte for each
// input bit, and separately create the bytes that blend adjacent bits, then
// shuffle those bytes together. Certainly for f >= 16 this makes sense
// because each of those bytes will be used, possibly more than once.
objBits := fn.loadSrcAsUint8x64()
type term struct {
iByte, oByte int
mat mat8x8
}
var terms []term
// Iterate over all output bytes and construct the 8x8 GF2 matrix to compute
// the output byte from the appropriate input byte. Gather all of these into
// "terms".
for oByte := 0; oByte < 1024/8; oByte++ {
var byteMat mat8x8
iByte := -1
for oBit := oByte * 8; oBit < oByte*8+8; oBit++ {
iBit := oBit / f
if iByte == -1 {
iByte = iBit / 8
} else if iByte != iBit/8 {
log.Printf("output byte %d straddles input bytes %d and %d", oByte, iByte, iBit/8)
return false
}
// One way to view this is that the i'th row of the matrix will be
// ANDed with the input byte, and the parity of the result will set
// the i'th bit in the output. We use a simple 1 bit mask, so the
// parity is irrelevant beyond selecting out that one bit.
byteMat.mat[oBit%8] = 1 << (iBit % 8)
}
terms = append(terms, term{iByte, oByte, byteMat})
}
if false {
// Print input byte -> output byte as a matrix
maxIByte, maxOByte := 0, 0
for _, term := range terms {
maxIByte = max(maxIByte, term.iByte)
maxOByte = max(maxOByte, term.oByte)
}
iToO := make([][]rune, maxIByte+1)
for i := range iToO {
iToO[i] = make([]rune, maxOByte+1)
}
matMap := make(map[mat8x8]int)
for _, term := range terms {
i, ok := matMap[term.mat]
if !ok {
i = len(matMap)
matMap[term.mat] = i
}
iToO[term.iByte][term.oByte] = 'A' + rune(i)
}
for o := range maxOByte + 1 {
fmt.Printf("%d", o)
for i := range maxIByte + 1 {
fmt.Printf(",")
if mat := iToO[i][o]; mat != 0 {
fmt.Printf("%c", mat)
}
}
fmt.Println()
}
}
// In hardware, each (8 byte) matrix applies to 8 bytes of data in parallel,
// and we get to operate on up to 8 matrixes in parallel (or 64 values). That is:
//
// abcdefgh ijklmnop qrstuvwx yzABCDEF GHIJKLMN OPQRSTUV WXYZ0123 456789_+
// mat0 mat1 mat2 mat3 mat4 mat5 mat6 mat7
// Group the terms by matrix, but limit each group to 8 terms.
const termsPerGroup = 8 // Number of terms we can multiply by the same matrix.
const groupsPerSuperGroup = 8 // Number of matrixes we can fit in a vector.
matMap := make(map[mat8x8]int)
allMats := make(map[mat8x8]bool)
var termGroups [][]term
for _, term := range terms {
allMats[term.mat] = true
i, ok := matMap[term.mat]
if ok && f > groupsPerSuperGroup {
// The output is ultimately produced in two [64]uint8 registers.
// Getting every byte in the right place of each of these requires a
// final permutation that often requires more than one source.
//
// Up to 8x expansion, we can get a really nice grouping so we can use
// the same 8 matrix vector several times, without producing
// permutations that require more than two sources.
//
// Above 8x, however, we can't get nice matrixes anyway, so we
// instead prefer reducing the complexity of the permutations we
// need to produce the final outputs. To do this, avoid grouping
// together terms that are split across the two registers.
outRegister := termGroups[i][0].oByte / 64
if term.oByte/64 != outRegister {
ok = false
}
}
if !ok {
// Start a new term group.
i = len(termGroups)
matMap[term.mat] = i
termGroups = append(termGroups, nil)
}
termGroups[i] = append(termGroups[i], term)
if len(termGroups[i]) == termsPerGroup {
// This term group is full.
delete(matMap, term.mat)
}
}
for i, termGroup := range termGroups {
log.Printf("term group %d:", i)
for _, term := range termGroup {
log.Printf(" %+v", term)
}
}
// We can do 8 matrix multiplies in parallel, which is 8 term groups. Pack
// as many term groups as we can into each super-group to minimize the
// number of matrix multiplies.
//
// Ideally, we use the same matrix in each super-group, which might mean
// doing fewer than 8 multiplies at a time. That's fine because it never
// increases the total number of matrix multiplies.
//
// TODO: Packing the matrixes less densely may let us use more broadcast
// loads instead of general permutations, though. That replaces a load of
// the permutation with a load of the matrix, but is probably still slightly
// better.
var sgSize, nSuperGroups int
oneMatVec := f <= groupsPerSuperGroup
if oneMatVec {
// We can use the same matrix in each multiply by doing sgSize
// multiplies at a time.
sgSize = groupsPerSuperGroup / len(allMats) * len(allMats)
nSuperGroups = (len(termGroups) + sgSize - 1) / sgSize
} else {
// We can't use the same matrix for each multiply. Just do as many at a
// time as we can.
//
// TODO: This is going to produce several distinct matrixes, when we
// probably only need two. Be smarter about how we create super-groups
// in this case. Maybe we build up an array of super-groups and then the
// loop below just turns them into ops?
sgSize = 8
nSuperGroups = (len(termGroups) + groupsPerSuperGroup - 1) / groupsPerSuperGroup
}
// Construct each super-group.
var matGroup [8]mat8x8
var matMuls []string
var perm [128]int
for sgi := range nSuperGroups {
var iperm [64]uint8
for i := range iperm {
iperm[i] = 0xff // "Don't care"
}
// Pick off sgSize term groups.
superGroup := termGroups[:min(len(termGroups), sgSize)]
termGroups = termGroups[len(superGroup):]
// Build the matrix and permutations for this super-group.
var thisMatGroup [8]mat8x8
for i, termGroup := range superGroup {
// All terms in this group have the same matrix. Pick one.
thisMatGroup[i] = termGroup[0].mat
for j, term := range termGroup {
// Build the input permutation.
iperm[i*termsPerGroup+j] = uint8(term.iByte)
// Build the output permutation.
perm[term.oByte] = sgi*groupsPerSuperGroup*termsPerGroup + i*termsPerGroup + j
}
}
log.Printf("input permutation %d: %v", sgi, iperm)
// Check that we're not making more distinct matrixes than expected.
if oneMatVec {
if sgi == 0 {
matGroup = thisMatGroup
} else if matGroup != thisMatGroup {
log.Printf("super-groups have different matrixes:\n%+v\n%+v", matGroup, thisMatGroup)
return false
}
}
// Emit matrix op.
matConst :=
fn.loadGlobalUint64x8(fmt.Sprintf("mat%d", sgi),
matGroupToVec(&thisMatGroup))
inShufConst :=
fn.loadGlobalUint8x64(fmt.Sprintf("inShuf%d", sgi),
iperm)
inOp := fn.permuteUint8x64(objBits, inShufConst)
matMul := fn.galoisFieldAffineTransformUint8x64(inOp, matConst)
matMuls = append(matMuls, matMul)
}
log.Printf("output permutation: %v", perm)
outLo, ok := genShuffle(fn, "outShufLo", (*[64]int)(perm[:64]), matMuls...)
if !ok {
log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
return false
}
outHi, ok := genShuffle(fn, "outShufHi", (*[64]int)(perm[64:]), matMuls...)
if !ok {
log.Printf("bad number of inputs to final shuffle: %d != 1, 2, or 4", len(matMuls))
return false
}
fn.returns(outLo, outHi)
return true
}
func genShuffle(fn *expander, name string, perm *[64]int, args ...string) (string, bool) {
// Construct flattened permutation.
var vperm [64]byte
// Get the inputs used by this permutation.
var inputs []int
for i, src := range perm {
inputIdx := slices.Index(inputs, src/64)
if inputIdx == -1 {
inputIdx = len(inputs)
inputs = append(inputs, src/64)
}
vperm[i] = byte(src%64 | (inputIdx << 6))
}
// Emit instructions for easy cases.
switch len(inputs) {
case 1:
constOp := fn.loadGlobalUint8x64(name, vperm)
return fn.permuteUint8x64(args[inputs[0]], constOp), true
case 2:
constOp := fn.loadGlobalUint8x64(name, vperm)
return fn.permute2Uint8x64(args[inputs[0]], args[inputs[1]], constOp), true
}
// Harder case, we need to shuffle in from up to 2 more tables.
//
// Perform two shuffles. One shuffle will get its data from the first
// two inputs, the other shuffle will get its data from the other one
// or two inputs. Any values they don't care about will
// be zeroed.
var vperms [2][64]byte
var masks [2]uint64
for j, idx := range vperm {
for i := range vperms {
vperms[i][j] = 0xff // "Don't care"
}
if idx == 0xff {
continue
}
vperms[idx/128][j] = idx % 128
masks[idx/128] |= uint64(1) << j
}
// Validate that the masks are fully disjoint.
if masks[0]^masks[1] != ^uint64(0) {
panic("bad shuffle!")
}
// Generate constants.
constOps := make([]string, len(vperms))
for i, v := range vperms {
constOps[i] = fn.loadGlobalUint8x64(name+strconv.Itoa(i), v)
}
// Generate shuffles.
switch len(inputs) {
case 3:
r0 := fn.permute2MaskedUint8x64(args[inputs[0]], args[inputs[1]], constOps[0], fn.mask8x64FromBits(masks[0]))
r1 := fn.permuteMaskedUint8x64(args[inputs[2]], constOps[1], fn.mask8x64FromBits(masks[1]))
return fn.orUint8x64(r0, r1), true
case 4:
r0 := fn.permute2MaskedUint8x64(args[inputs[0]], args[inputs[1]], constOps[0], fn.mask8x64FromBits(masks[0]))
r1 := fn.permute2MaskedUint8x64(args[inputs[2]], args[inputs[3]], constOps[1], fn.mask8x64FromBits(masks[1]))
return fn.orUint8x64(r0, r1), true
}
// Too many inputs. To support more, we'd need to separate tables much earlier.
// Right now all the indices fit in a byte, but with >4 inputs they might not (>256 bytes).
return args[0], false
}
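
As a scalar model of the matrix step the generator leans on (hypothetical, for
illustration only): GaloisFieldAffineTransform applies an 8x8 bit matrix to each
input byte, where output bit i is the parity of matrix row i ANDed with the input
byte. With the single-bit rows built above (row oBit%8 = 1 << (iBit%8)), that
parity simply copies input bit iBit%8 into output bit oBit%8. Assumes "math/bits"
is imported.

// applyMat8x8 models one byte of the affine transform:
// output bit i = parity(matrix row i AND input byte).
func applyMat8x8(mat [8]uint8, in uint8) (out uint8) {
	for i := 0; i < 8; i++ {
		if bits.OnesCount8(mat[i]&in)&1 == 1 {
			out |= 1 << i
		}
	}
	return out
}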

View file

@ -6,13 +6,25 @@ package scan
import (
"internal/cpu"
"internal/goexperiment"
"internal/runtime/gc"
"unsafe"
)
func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
if CanAVX512() {
return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)
if goexperiment.SIMD {
return ScanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)
} else {
return ScanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask)
}
}
panic("not implemented")
}
func ScanSpanPackedAsm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
if CanAVX512() {
return ScanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask)
}
panic("not implemented")
}
@ -27,12 +39,12 @@ func CanAVX512() bool {
return avx512ScanPackedReqsMet
}
func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
return FilterNil(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
func ScanSpanPackedAVX512Asm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
return FilterNil(bufp, scanSpanPackedAVX512Asm(mem, bufp, objMarks, sizeClass, ptrMask))
}
//go:noescape
func scanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
func scanSpanPackedAVX512Asm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
var avx512ScanPackedReqsMet = cpu.X86.HasAVX512VL &&
cpu.X86.HasAVX512BW &&

View file

@ -6,12 +6,12 @@
#include "textflag.h"
// Test-only.
TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
TEXT ·ExpandAVX512Asm(SB), NOSPLIT, $0-24
MOVQ sizeClass+0(FP), CX
MOVQ packed+8(FP), AX
// Call the expander for this size class
LEAQ ·gcExpandersAVX512(SB), BX
LEAQ ·gcExpandersAVX512Asm(SB), BX
CALL (BX)(CX*8)
MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
@ -20,11 +20,11 @@ TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
VZEROUPPER
RET
TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
TEXT ·scanSpanPackedAVX512Asm(SB), NOSPLIT, $256-44
// Z1+Z2 = Expand the grey object mask into a grey word mask
MOVQ objMarks+16(FP), AX
MOVQ sizeClass+24(FP), CX
LEAQ ·gcExpandersAVX512(SB), BX
LEAQ ·gcExpandersAVX512Asm(SB), BX
CALL (BX)(CX*8)
// Z3+Z4 = Load the pointer mask

View file

@ -11,6 +11,13 @@ import (
"testing"
)
func TestScanSpanPackedAVX512Asm(t *testing.T) {
if !scan.CanAVX512() {
t.Skip("no AVX512")
}
testScanSpanPacked(t, scan.ScanSpanPackedAVX512Asm)
}
func TestScanSpanPackedAVX512(t *testing.T) {
if !scan.CanAVX512() {
t.Skip("no AVX512")

View file

@ -21,3 +21,6 @@ func HasFastScanSpanPacked() bool {
func ScanSpanPacked(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
return ScanSpanPackedGo(mem, bufp, objMarks, sizeClass, ptrMask)
}
func ScanSpanPackedAsm(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
panic("not implemented")
}

View file

@ -0,0 +1,16 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !goexperiment.simd
package scan
import (
"internal/runtime/gc"
"unsafe"
)
func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
panic("not implemented")
}

View file

@ -0,0 +1,92 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build goexperiment.simd && amd64
package scan
import (
"internal/abi"
"internal/runtime/gc"
"math/bits"
"simd"
"unsafe"
)
func FilterNilAVX512(bufp *uintptr, n int32) (cnt int32) {
scanned := 0
buf := unsafe.Slice((*uint64)(unsafe.Pointer(bufp)), int(n))
// Use the widest vector
var zeros simd.Uint64x8
for ; scanned+8 <= int(n); scanned += 8 {
v := simd.LoadUint64x8Slice(buf[scanned:])
m := v.NotEqual(zeros)
v.Compress(m).StoreSlice(buf[cnt:])
// Count the mask bits
mbits := uint64(m.ToBits())
mbits &= 0xFF // Only the lower 8 bits are meaningful.
nonNilCnt := bits.OnesCount64(mbits)
cnt += int32(nonNilCnt)
}
// Scalar code to clean up tails.
for i := scanned; i < int(n); i++ {
if buf[i] != 0 {
buf[cnt] = buf[i]
cnt++
}
}
return
}
func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
return FilterNilAVX512(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
}
func scanSpanPackedAVX512(mem unsafe.Pointer, buf *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
// Expand the grey object mask into a grey word mask
m1, m2 := gcExpandersAVX512[sizeClass](abi.NoEscape(unsafe.Pointer(objMarks)))
// Load the pointer mask
ptrm := unsafe.Pointer(ptrMask)
m3 := simd.LoadUint64x8((*[8]uint64)(ptrm))
m4 := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(ptrm) + 64)))
masks := [128]uint8{}
counts := [128]uint8{}
// Combine the grey word mask with the pointer mask to get the scan mask
m1m3 := m1.And(m3).AsUint8x64()
m2m4 := m2.And(m4).AsUint8x64()
m1m3.Store((*[64]uint8)(unsafe.Pointer(&masks[0])))
m2m4.Store((*[64]uint8)(unsafe.Pointer(&masks[64])))
// Now each bit of m1m3 and m2m4 represents one word of the span.
// Thus, each byte covers 64 bytes of memory, which is also how
// much we can fit in a ZMM register.
//
// We do a load/compress for each 64 byte frame.
//
// counts = Number of memory words to scan in each 64 byte frame
// TODO: Right now the type casting is done via memory; is it possible to
// work around these stores and loads and keep the values in registers?
m1m3.OnesCount().Store((*[64]uint8)(unsafe.Pointer(&counts[0])))
m2m4.OnesCount().Store((*[64]uint8)(unsafe.Pointer(&counts[64])))
// Loop over the 64 byte frames in this span.
// TODO: is there a way to PCALIGN this loop?
for i := range 128 {
mv := masks[i]
// Skip empty frames.
if mv == 0 {
continue
}
// Load the 64 byte frame.
m := simd.Mask64x8FromBits(mv)
ptrs := simd.LoadUint64x8((*[8]uint64)(unsafe.Pointer(uintptr(mem) + uintptr(i*64))))
// Collect just the pointers from the greyed objects into the scan buffer,
// i.e., copy the words selected by the mask into contiguous memory.
ptrs.Compress(m).Store((*[8]uint64)(unsafe.Pointer(uintptr(unsafe.Pointer(buf)) + uintptr(count*8))))
// Advance the scan buffer position by the number of pointers.
count += int32(counts[i])
}
simd.ClearAVXUpperBits()
return
}
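
For reference, a hypothetical scalar equivalent of FilterNilAVX512 (the compress
loop above computes the same thing, eight words at a time): keep only the
non-nil pointers in buf[:n], packed to the front, and return how many remain.

// filterNilRef is a hypothetical scalar equivalent of FilterNilAVX512.
func filterNilRef(buf []uintptr, n int32) (cnt int32) {
	for i := int32(0); i < n; i++ {
		if buf[i] != 0 {
			buf[cnt] = buf[i]
			cnt++
		}
	}
	return cnt
}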

View file

@ -204,6 +204,13 @@ func benchmarkScanSpanPacked(b *testing.B, nPages int, sizeClass int) {
scan.ScanSpanPacked(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
}
})
b.Run("impl=PlatformAsm", func(b *testing.B) {
b.SetBytes(avgBytes)
for i := range b.N {
page := pageOrder[i%len(pageOrder)]
scan.ScanSpanPackedAsm(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
}
})
}
})
}