// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scan_test

import (
	"fmt"
	"internal/cpu"
	"internal/goarch"
	"internal/runtime/gc"
	"internal/runtime/gc/scan"
	"math/bits"
	"math/rand/v2"
	"slices"
	"sync"
	"testing"
	"unsafe"
)

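// scanFunc is the common signature of the packed span scanning
// implementations exercised by the tests and benchmarks below.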
type scanFunc func(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32)
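
// testScanSpanPacked checks scanF against the portable reference
// implementation over a page of fake memory and a fixed pseudo-random
// pointer mask, for every small size class and mark pattern produced by
// testObjs.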
func testScanSpanPacked(t *testing.T, scanF scanFunc) {
	scanR := scan.ScanSpanPackedReference

	// Construct a fake memory
	mem, free := makeMem(t, 1)
	defer free()
	for i := range mem {
		// Use values > gc.PageSize because a scan function can discard
		// pointers smaller than this.
		mem[i] = uintptr(int(gc.PageSize) + i + 1)
	}

	// Construct a random pointer mask
	rnd := rand.New(rand.NewPCG(42, 42))
	var ptrs gc.PtrMask
	for i := range ptrs {
		ptrs[i] = uintptr(rnd.Uint64())
	}

	bufF := make([]uintptr, gc.PageWords)
	bufR := make([]uintptr, gc.PageWords)
	testObjs(t, func(t *testing.T, sizeClass int, objs *gc.ObjMask) {
		nF := scanF(unsafe.Pointer(&mem[0]), &bufF[0], objs, uintptr(sizeClass), &ptrs)
		nR := scanR(unsafe.Pointer(&mem[0]), &bufR[0], objs, uintptr(sizeClass), &ptrs)

		if nR != nF {
			t.Errorf("want %d count, got %d", nR, nF)
		} else if !slices.Equal(bufF[:nF], bufR[:nR]) {
			t.Errorf("want scanned pointers %d, got %d", bufR[:nR], bufF[:nF])
		}
	})
}

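// testObjs invokes f for each small size class (those whose pointer/scalar
// metadata is packed), with a pair of marked objects (mask 0x101) slid
// across every object position to exercise boundary conditions.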
func testObjs(t *testing.T, f func(t *testing.T, sizeClass int, objMask *gc.ObjMask)) {
	for sizeClass := range gc.NumSizeClasses {
		if sizeClass == 0 {
			continue
		}
		size := uintptr(gc.SizeClassToSize[sizeClass])
		if size > gc.MinSizeForMallocHeader {
			break // Pointer/scalar metadata is not packed for larger sizes.
		}
		t.Run(fmt.Sprintf("size=%d", size), func(t *testing.T) {
			// Scan a few objects near i to test boundary conditions.
			const objMask = 0x101
			nObj := uintptr(gc.SizeClassToNPages[sizeClass]) * gc.PageSize / size
			for i := range nObj - uintptr(bits.Len(objMask)-1) {
				t.Run(fmt.Sprintf("objs=0x%x<<%d", objMask, i), func(t *testing.T) {
					var objs gc.ObjMask
					objs[i/goarch.PtrBits] = objMask << (i % goarch.PtrBits)
					f(t, sizeClass, &objs)
				})
			}
		})
	}
}

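// dataCacheSizes returns the sizes of the CPU data caches, printing them
// once so benchmark results can be interpreted against the cache hierarchy.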
var dataCacheSizes = sync.OnceValue(func() []uintptr {
	cs := cpu.DataCacheSizes()
	for i, c := range cs {
		fmt.Printf("# L%d cache: %d (%d Go pages)\n", i+1, c, c/gc.PageSize)
	}
	return cs
})

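// BenchmarkScanSpanPacked measures the packed span scan kernels for every
// small size class, with working sets sized to each level of the data cache
// hierarchy. It can be run with, for example:
//
//	go test -run=NONE -bench=ScanSpanPacked internal/runtime/gc/scan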
func BenchmarkScanSpanPacked(b *testing.B) {
	benchmarkCacheSizes(b, benchmarkScanSpanPackedAllSizeClasses)
}

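// benchmarkCacheSizes runs fn with heap sizes targeting each cache level: a
// single page, roughly 3/4 of each data cache, and a RAM-sized working set
// 1.5x the last-level cache.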
func benchmarkCacheSizes(b *testing.B, fn func(b *testing.B, heapPages int)) {
	cacheSizes := dataCacheSizes()
	b.Run("cache=tiny/pages=1", func(b *testing.B) {
		fn(b, 1)
	})
	for i, cacheBytes := range cacheSizes {
		pages := int(cacheBytes*3/4) / gc.PageSize
		b.Run(fmt.Sprintf("cache=L%d/pages=%d", i+1, pages), func(b *testing.B) {
			fn(b, pages)
		})
	}
	if len(cacheSizes) == 0 {
		return
	}
	ramPages := int(cacheSizes[len(cacheSizes)-1]*3/2) / gc.PageSize
	b.Run(fmt.Sprintf("cache=ram/pages=%d", ramPages), func(b *testing.B) {
		fn(b, ramPages)
	})
}

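// benchmarkScanSpanPackedAllSizeClasses benchmarks one heap size across all
// size classes that keep their pointer/scalar metadata packed.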
func benchmarkScanSpanPackedAllSizeClasses(b *testing.B, nPages int) {
	for sc := range gc.NumSizeClasses {
		if sc == 0 {
			continue
		}
		size := gc.SizeClassToSize[sc]
		if size >= gc.MinSizeForMallocHeader {
			break
		}
		b.Run(fmt.Sprintf("sizeclass=%d", sc), func(b *testing.B) {
			benchmarkScanSpanPacked(b, nPages, sc)
		})
	}
}

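// benchmarkScanSpanPacked benchmarks the reference, pure Go, and (when
// available) platform-specific scan kernels over nPages pages of fake heap,
// sweeping the fraction of marked objects from 0% to 100%.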
func benchmarkScanSpanPacked(b *testing.B, nPages int, sizeClass int) {
	rnd := rand.New(rand.NewPCG(42, 42))

	// Construct a fake memory
	mem, free := makeMem(b, nPages)
	defer free()
	for i := range mem {
		// Use values > gc.PageSize because a scan function can discard
		// pointers smaller than this.
		mem[i] = uintptr(int(gc.PageSize) + i + 1)
	}

	// Construct a random pointer mask
	ptrs := make([]gc.PtrMask, nPages)
	for i := range ptrs {
		for j := range ptrs[i] {
			ptrs[i][j] = uintptr(rnd.Uint64())
		}
	}

	// Visit the pages in a random order
	pageOrder := rnd.Perm(nPages)

	// Create the scan buffer.
	buf := make([]uintptr, gc.PageWords)

	// Sweep from 0 marks to all marks. We'll use the same marks for each page
	// because I don't think that predictability matters.
	objBytes := uintptr(gc.SizeClassToSize[sizeClass])
	nObj := gc.PageSize / objBytes
	markOrder := rnd.Perm(int(nObj))
	const steps = 11
	for i := 0; i < steps; i++ {
		frac := float64(i) / float64(steps-1)
		// Set frac marks.
		nMarks := int(float64(len(markOrder))*frac + 0.5)
		var objMarks gc.ObjMask
		for _, mark := range markOrder[:nMarks] {
			objMarks[mark/goarch.PtrBits] |= 1 << (mark % goarch.PtrBits)
		}
		greyClusters := 0
		for page := range ptrs {
			greyClusters += countGreyClusters(sizeClass, &objMarks, &ptrs[page])
		}

		// Report MB/s of how much memory they're actually hitting. This assumes
		// 64 byte cache lines (TODO: Should it assume 128 byte cache lines?)
		// and expands each access to the whole cache line. This is useful for
		// comparing against memory bandwidth.
		//
		// TODO: Add a benchmark that just measures single core memory bandwidth
		// for comparison. (See runtime memcpy benchmarks.)
		//
		// TODO: Should there be a separate measure where we don't expand to
		// cache lines?
		avgBytes := int64(greyClusters) * int64(cpu.CacheLineSize) / int64(len(ptrs))

		b.Run(fmt.Sprintf("pct=%d", int(100*frac)), func(b *testing.B) {
			b.Run("impl=Reference", func(b *testing.B) {
				b.SetBytes(avgBytes)
				for i := range b.N {
					page := pageOrder[i%len(pageOrder)]
					scan.ScanSpanPackedReference(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
				}
			})
			b.Run("impl=Go", func(b *testing.B) {
				b.SetBytes(avgBytes)
				for i := range b.N {
					page := pageOrder[i%len(pageOrder)]
					scan.ScanSpanPackedGo(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
				}
			})
			if scan.HasFastScanSpanPacked() {
				b.Run("impl=Platform", func(b *testing.B) {
					b.SetBytes(avgBytes)
					for i := range b.N {
						page := pageOrder[i%len(pageOrder)]
						scan.ScanSpanPacked(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
					}
				})
				b.Run("impl=PlatformAsm", func(b *testing.B) {
					b.SetBytes(avgBytes)
					for i := range b.N {
						page := pageOrder[i%len(pageOrder)]
						scan.ScanSpanPackedAsm(unsafe.Pointer(&mem[gc.PageWords*page]), &buf[0], &objMarks, uintptr(sizeClass), &ptrs[page])
					}
				})
			}
		})
	}
}

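// countGreyClusters returns the number of distinct word clusters in a page
// that hold at least one pointer slot of a marked object. The benchmark
// above multiplies this by the cache line size to estimate how much memory
// a scan actually touches.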
func countGreyClusters(sizeClass int, objMarks *gc.ObjMask, ptrMask *gc.PtrMask) int {
	clusters := 0
	lastCluster := -1

	expandBy := uintptr(gc.SizeClassToSize[sizeClass]) / goarch.PtrSize
	for word := range gc.PageWords {
		objI := uintptr(word) / expandBy
		if objMarks[objI/goarch.PtrBits]&(1<<(objI%goarch.PtrBits)) == 0 {
			continue
		}
		if ptrMask[word/goarch.PtrBits]&(1<<(word%goarch.PtrBits)) == 0 {
			continue
		}
		c := word * 8 / goarch.PtrBits
		if c != lastCluster {
			lastCluster = c
			clusters++
		}
	}
	return clusters
}

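// BenchmarkScanMaxBandwidth provides a memory-bandwidth baseline against
// which the scan benchmarks above can be compared.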
func BenchmarkScanMaxBandwidth(b *testing.B) {
	// Measure the theoretical "maximum" bandwidth of scanning by reproducing
	// the memory access pattern of a full page scan, but using memcpy as the
	// kernel instead of scanning.
	benchmarkCacheSizes(b, func(b *testing.B, heapPages int) {
		mem, free := makeMem(b, heapPages)
		defer free()
		for i := range mem {
			mem[i] = uintptr(int(gc.PageSize) + i + 1)
		}
		buf := make([]uintptr, gc.PageWords)

		// Visit the pages in a random order
		rnd := rand.New(rand.NewPCG(42, 42))
		pageOrder := rnd.Perm(heapPages)

		b.SetBytes(int64(gc.PageSize))

		b.ResetTimer()
		for i := range b.N {
			page := pageOrder[i%len(pageOrder)]
			copy(buf, mem[gc.PageWords*page:])
		}
	})
}