// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scan_test

import (
	"fmt"
	"internal/cpu"
	"internal/goarch"
	"internal/runtime/gc"
	"internal/runtime/gc/scan"
	"math/bits"
	"math/rand/v2"
	"slices"
	"sync"
	"testing"
	"unsafe"
)

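// scanFunc is the common signature of the packed span scanning
// implementations exercised by the tests and benchmarks below.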
type scanFunc func(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
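
// testScanSpanPacked checks scanF against the portable reference
// implementation over a page of fake memory and a fixed pseudo-random
// pointer mask, for every small size class and mark pattern produced by
// testObjs.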
func testScanSpanPacked(t *testing.T, scanF scanFunc) {
	scanR := scan.ScanSpanPackedReference

	// Construct a fake memory
	mem, free := makeMem(t, 1)
	defer free()
	for i := range mem {
		// Use values > gc.PageSize because a scan function can discard
		// pointers smaller than this.
		mem[i] = uintptr(int(gc.PageSize) + i + 1)
	}

	// Construct a random pointer mask
	rnd := rand.New(rand.NewPCG(42, 42))
	var ptrs gc.PtrMask
	for i := range ptrs {
		ptrs[i] = uintptr(rnd.Uint64())
	}

	bufF := make([]uintptr, gc.PageWords)
	bufR := make([]uintptr, gc.PageWords)
	testObjs(t, func(t *testing.T, sizeClass int, objs *gc.ObjMask) {
		nF := scanF(unsafe.Pointer(&mem[0]), &bufF[0], objs, uintptr(sizeClass), &ptrs)
		nR := scanR(unsafe.Pointer(&mem[0]), &bufR[0], objs, uintptr(sizeClass), &ptrs)

		if nR != nF {
			t.Errorf("want %d count, got %d", nR, nF)
		} else if !slices.Equal(bufF[:nF], bufR[:nR]) {
			t.Errorf("want scanned pointers %d, got %d", bufR[:nR], bufF[:nF])
		}
	})
}

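// testObjs invokes f for each small size class (those whose pointer/scalar
// metadata is packed), with a pair of marked objects (mask 0x101) slid
// across every object position to exercise boundary conditions.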
func testObjs(t *testing.T, f func(t *testing.T, sizeClass int, objMask *gc.ObjMask)) {
	for sizeClass := range gc.NumSizeClasses {
		if sizeClass == 0 {
			continue
		}
		size := uintptr(gc.SizeClassToSize[sizeClass])
		if size > gc.MinSizeForMallocHeader {
			break // Pointer/scalar metadata is not packed for larger sizes.
		}
		t.Run(fmt.Sprintf("size=%d", size), func(t *testing.T) {
			// Scan a few objects near i to test boundary conditions.
			const objMask = 0x101
			nObj := uintptr(gc.SizeClassToNPages[sizeClass]) * gc.PageSize / size
			for i := range nObj - uintptr(bits.Len(objMask)-1) {
				t.Run(fmt.Sprintf("objs=0x%x<<%d", objMask, i), func(t *testing.T) {
					var objs gc.ObjMask
					objs[i/goarch.PtrBits] = objMask << (i % goarch.PtrBits)
					f(t, sizeClass, &objs)
				})
			}
		})
	}
}

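// dataCacheSizes returns the sizes of the CPU data caches, printing them
// once so benchmark results can be interpreted against the cache hierarchy.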
var dataCacheSizes = sync.OnceValue(func() []uintptr {
	cs := cpu.DataCacheSizes()
	for i, c := range cs {
		fmt.Printf("# L%d cache: %d (%d Go pages)\n", i+1, c, c/gc.PageSize)
	}
	return cs
})

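// BenchmarkScanSpanPacked measures the packed span scan kernels for every
// small size class, with working sets sized to each level of the data cache
// hierarchy. It can be run with, for example:
//
//	go test -run=NONE -bench=ScanSpanPacked internal/runtime/gc/scan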
func BenchmarkScanSpanPacked(b *testing.B) {
	benchmarkCacheSizes(b, benchmarkScanSpanPackedAllSizeClasses)
}

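// benchmarkCacheSizes runs fn with heap sizes targeting each cache level: a
// single page, roughly 3/4 of each data cache, and a RAM-sized working set
// 1.5x the last-level cache.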
func benchmarkCacheSizes(b *testing.B, fn func(b *testing.B, heapPages int)) {
	cacheSizes := dataCacheSizes()
	b.Run("cache=tiny/pages=1", func(b *testing.B) {
		fn(b, 1)
	})
	for i, cacheBytes := range cacheSizes {
		pages := int(cacheBytes*3/4) / gc.PageSize
		b.Run(fmt.Sprintf("cache=L%d/pages=%d", i+1, pages), func(b *testing.B) {
			fn(b, pages)
		})
	}
	if len(cacheSizes) == 0 {
		return
	}
	ramPages := int(cacheSizes[len(cacheSizes)-1]*3/2) / gc.PageSize
	b.Run(fmt.Sprintf("cache=ram/pages=%d", ramPages), func(b *testing.B) {
		fn(b, ramPages)
	})
}

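// benchmarkScanSpanPackedAllSizeClasses benchmarks one heap size across all
// size classes that keep their pointer/scalar metadata packed.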
func benchmarkScanSpanPackedAllSizeClasses(b *testing.B, nPages int) {
	for sc := range gc.NumSizeClasses {
		if sc == 0 {
			continue
		}
		size := gc.SizeClassToSize[sc]
		if size >= gc.MinSizeForMallocHeader {
			break
		}
		b.Run(fmt.Sprintf("sizeclass=%d", sc), func(b *testing.B) {
			benchmarkScanSpanPacked(b, nPages, sc)
		})
	}
}

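// benchmarkScanSpanPacked benchmarks the reference, pure Go, and (when
// available) platform-specific scan kernels over nPages pages of fake heap,
// sweeping the fraction of marked objects from 0% to 100%.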
func benchmarkScanSpanPacked(b *testing.B, nPages int, sizeClass int) {
	rnd := rand.New(rand.NewPCG(42, 42))

	// Construct a fake memory
	mem, free := makeMem(b, nPages)
	defer free()
	for i := range mem {
		// Use values > gc.PageSize because a scan function can discard
		// pointers smaller than this.
		mem[i] = uintptr(int(gc.PageSize) + i + 1)
	}

	// Construct a random pointer mask
	ptrs := make([]gc.PtrMask, nPages)
	for i := range ptrs {
		for j := range ptrs[i] {
			ptrs[i][j] = uintptr(rnd.Uint64())
		}
	}

	// Visit the pages in a random order
	pageOrder := rnd.Perm(nPages)

	// Create the scan buffer.
	buf := make([]uintptr, gc.PageWords)

	// Sweep from 0 marks to all marks. We'll use the same marks for each page
	// because I don't think that predictability matters.
	objBytes := uintptr(gc.SizeClassToSize[sizeClass])
	nObj := gc.PageSize / objBytes
	markOrder := rnd.Perm(int(nObj))
	const steps = 11
	for i := 0; i < steps; i++ {
		frac := float64(i) / float64(steps-1)
		// Set frac marks.
		nMarks := int(float64(len(markOrder))*frac + 0.5)
		var objMarks gc.ObjMask
		for _, mark := range markOrder[:nMarks] {
			objMarks[mark/goarch.PtrBits] |= 1 << (mark % goarch.PtrBits)
		}
		greyClusters := 0
		for page := range ptrs {
			greyClusters += countGreyClusters(sizeClass, &objMarks, &ptrs[page])
		}

		// Report MB/s of how much memory they're actually hitting. This assumes
		// 64 byte cache lines (TODO: Should it assume 128 byte cache lines?)
		// and expands each access to the whole cache line. This is useful for
		// comparing against memory bandwidth.
		//
		// TODO: Add a benchmark that just measures single core memory bandwidth
		// for comparison. (See runtime memcpy benchmarks.)
		//
		// TODO: Should there be a separate measure where we don't expand to
		// cache lines?
		avgBytes := int64(greyClusters) * int64(cpu.CacheLineSize) / int64(len(ptrs))

		b.Run(fmt.Sprintf("pct=%d", int(100*frac)), func(b *testing.B) {
			b.Run("impl=Reference", func(b *testing.B) {
				b.SetBytes(avgBytes)
				for i := range b.N {
					page := pageOrder[i%len(pageOrder)]
					scan.ScanSpanPackedReference(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
				}
			})
			b.Run("impl=Go", func(b *testing.B) {
				b.SetBytes(avgBytes)
				for i := range b.N {
					page := pageOrder[i%len(pageOrder)]
					scan.ScanSpanPackedGo(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
				}
			})
			if scan.HasFastScanSpanPacked() {
				b.Run("impl=Platform", func(b *testing.B) {
					b.SetBytes(avgBytes)
					for i := range b.N {
						page := pageOrder[i%len(pageOrder)]
						scan.ScanSpanPacked(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
					}
				})
				b.Run("impl=PlatformAsm", func(b *testing.B) {
					b.SetBytes(avgBytes)
					for i := range b.N {
						page := pageOrder[i%len(pageOrder)]
						scan.ScanSpanPackedAsm(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
					}
				})
			}
		})
	}
}

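// countGreyClusters returns the number of distinct word clusters in a page
// that hold at least one pointer slot of a marked object. The benchmark
// above multiplies this by the cache line size to estimate how much memory
// a scan actually touches.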
func countGreyClusters(sizeClass int, objMarks *gc.ObjMask, ptrMask *gc.PtrMask) int {
	clusters := 0
	lastCluster := -1

	expandBy := uintptr(gc.SizeClassToSize[sizeClass]) / goarch.PtrSize
	for word := range gc.PageWords {
		objI := uintptr(word) / expandBy
		if objMarks[objI/goarch.PtrBits]&(1<<(objI%goarch.PtrBits)) == 0 {
			continue
		}
		if ptrMask[word/goarch.PtrBits]&(1<<(word%goarch.PtrBits)) == 0 {
			continue
		}
		c := word * 8 / goarch.PtrBits
		if c != lastCluster {
			lastCluster = c
			clusters++
		}
	}
	return clusters
}

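// BenchmarkScanMaxBandwidth provides a memory-bandwidth baseline against
// which the scan benchmarks above can be compared.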
func BenchmarkScanMaxBandwidth(b *testing.B) {
	// Measure the theoretical "maximum" bandwidth of scanning by reproducing
	// the memory access pattern of a full page scan, but using memcpy as the
	// kernel instead of scanning.
	benchmarkCacheSizes(b, func(b *testing.B, heapPages int) {
		mem, free := makeMem(b, heapPages)
		defer free()
		for i := range mem {
			mem[i] = uintptr(int(gc.PageSize) + i + 1)
		}
		buf := make([]uintptr, gc.PageWords)

		// Visit the pages in a random order
		rnd := rand.New(rand.NewPCG(42, 42))
		pageOrder := rnd.Perm(heapPages)

		b.SetBytes(int64(gc.PageSize))

		b.ResetTimer()
		for i := range b.N {
			page := pageOrder[i%len(pageOrder)]
			copy(buf, mem[gc.PageWords*page:])
		}
	})
}