runtime: put AddCleanup cleanup arguments in their own allocation

Currently, AddCleanup just creates a simple closure that calls
`cleanup(arg)` as the actual cleanup function tracked internally.
However, the argument ends up getting its own allocation. If the
argument is tiny and pointer-free, that allocation can end up sharing a
tiny allocation slot with the object we're adding the cleanup to. Since
the closure is a GC root, it keeps the shared slot, and thus the
object, reachable, so we can end up with cleanups that never fire.
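
For illustration, here is a rough, hypothetical sketch of the failure
mode (not code from this CL; the object and argument types are my own
choices, and whether it actually reproduces depends on allocator
placement):

    package main

    import "runtime"

    func main() {
        obj := new([8]byte) // tiny, pointer-free object
        arg := int64(42)    // tiny, pointer-free argument
        runtime.AddCleanup(obj, func(v int64) {
            println("cleanup ran with", v)
        }, arg)
        // Drop the last reference. The cleanup should eventually run,
        // but before this change the argument's internal storage could
        // share obj's tiny-allocator block and keep obj reachable.
        obj = nil
        runtime.GC()
    }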

This change refactors the AddCleanup machinery to make the storage for
the argument separate and explicit. With that in place, AddCleanup
explicitly allocates 16 bytes of storage for tiny, pointer-free
arguments to side-step the tiny allocator.
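
A condensed, standalone analog of the new argument-storage logic (my
sketch, not the CL's code; the real version in the diff below uses
runtime internals such as maxTinySize and the argument's type
descriptor):

    package sketch

    import "unsafe"

    const maxTinySize = 16 // mirrors the runtime's tiny-allocator threshold

    // newArgStorage copies arg into dedicated heap storage. Tiny,
    // pointer-free arguments get a full 16-byte, pointer-aligned box
    // so the allocator cannot co-locate them with the object the
    // cleanup is attached to.
    func newArgStorage[S any](arg S, hasPointers bool) *S {
        var argv *S
        if unsafe.Sizeof(arg) < maxTinySize && !hasPointers {
            box := new([maxTinySize / unsafe.Sizeof(uintptr(0))]uintptr)
            argv = (*S)(unsafe.Pointer(box))
        } else {
            argv = new(S)
        }
        *argv = arg
        return argv
    }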

One would think this would increase memory use and bytes allocated, but
it doesn't! It turns out that the current "simple closure" actually
creates _two_ closures. By making the argument passing explicit, we
eliminate one layer of closures, so this change results in a slightly
faster AddCleanup overall and 16 fewer bytes allocated per call.

goos: linux
goarch: amd64
pkg: runtime
cpu: AMD EPYC 7B13
                     │ before.bench │            after.bench             │
                     │    sec/op    │   sec/op     vs base               │
AddCleanupAndStop-64    124.5n ± 2%   103.7n ± 2%  -16.71% (p=0.002 n=6)

                     │ before.bench │            after.bench            │
                     │     B/op     │    B/op     vs base               │
AddCleanupAndStop-64     48.00 ± 0%   32.00 ± 0%  -33.33% (p=0.002 n=6)

                     │ before.bench │            after.bench            │
                     │  allocs/op   │ allocs/op   vs base               │
AddCleanupAndStop-64     3.000 ± 0%   2.000 ± 0%  -33.33% (p=0.002 n=6)
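
(For reference, the numbers above come from runtime's AddCleanupAndStop
benchmark. A standalone benchmark exercising the same pattern would
look roughly like this; the package name and object/argument sizes are
my own choices:)

    package cleanup_test

    import (
        "runtime"
        "testing"
    )

    // BenchmarkAddCleanupAndStop registers a cleanup with a small
    // argument and immediately stops it, mirroring the pattern
    // measured above.
    func BenchmarkAddCleanupAndStop(b *testing.B) {
        b.ReportAllocs()
        arg := int64(42)
        for i := 0; i < b.N; i++ {
            obj := new([16]byte)
            c := runtime.AddCleanup(obj, func(int64) {}, arg)
            c.Stop()
        }
    }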

This change does, however, add 16 bytes of overhead to the cleanup
special itself, and makes each cleanup block entry 24 bytes instead of
8 bytes. This means the overall memory overhead delta with this change
is neutral, and we just get a faster cleanup. (Cleanup block entries
are transient, so I suspect any increase in memory overhead there is
negligible.)
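
The size arithmetic, assuming a 64-bit platform (the types below are
simplified mirrors of the runtime's, written out only to make the
numbers concrete):

    package sizes

    import "unsafe"

    type funcval struct{ fn uintptr }

    // cleanupFn mirrors the struct added by this CL: an adapter, the
    // user's cleanup function, and a pointer to the argument's storage.
    type cleanupFn struct {
        call func(*funcval, unsafe.Pointer)
        fn   *funcval
        arg  unsafe.Pointer
    }

    var (
        newEntry = unsafe.Sizeof(cleanupFn{})     // 24: new cleanup block entry
        oldEntry = unsafe.Sizeof((*funcval)(nil)) // 8: old entry, a single *funcval
        growth   = newEntry - oldEntry            // 16: added to each specialCleanup
    )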

Together with CL 719960, fixes #76007.

Change-Id: I81bf3e44339e71c016c30d80bb4ee151c8263d5c
Reviewed-on: https://go-review.googlesource.com/c/go/+/720321
Reviewed-by: Cherry Mui <cherryyz@google.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Michael Anthony Knyszek 2025-11-13 18:29:23 +00:00 committed by Gopher Robot
parent 9fd2e44439
commit 1bb1f2bf0c
3 changed files with 78 additions and 37 deletions


@@ -72,8 +72,9 @@ import (
// pass the object to the [KeepAlive] function after the last point
// where the object must remain reachable.
func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup {
// Explicitly force ptr to escape to the heap.
// Explicitly force ptr and cleanup to escape to the heap.
ptr = abi.Escape(ptr)
cleanup = abi.Escape(cleanup)
// The pointer to the object must be valid.
if ptr == nil {
@@ -82,7 +83,8 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup {
usptr := uintptr(unsafe.Pointer(ptr))
// Check that arg is not equal to ptr.
if kind := abi.TypeOf(arg).Kind(); kind == abi.Pointer || kind == abi.UnsafePointer {
argType := abi.TypeOf(arg)
if kind := argType.Kind(); kind == abi.Pointer || kind == abi.UnsafePointer {
if unsafe.Pointer(ptr) == *((*unsafe.Pointer)(unsafe.Pointer(&arg))) {
panic("runtime.AddCleanup: ptr is equal to arg, cleanup will never run")
}
@@ -98,12 +100,23 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup {
return Cleanup{}
}
fn := func() {
cleanup(arg)
// Create new storage for the argument.
var argv *S
if size := unsafe.Sizeof(arg); size < maxTinySize && argType.PtrBytes == 0 {
// Side-step the tiny allocator to avoid liveness issues, since this box
// will be treated like a root by the GC. We model the box as an array of
// uintptrs to guarantee maximum allocator alignment.
//
// TODO(mknyszek): Consider just making space in cleanupFn for this. The
// unfortunate part of this is it would grow specialCleanup by 16 bytes, so
// while there wouldn't be an allocation, *every* cleanup would take the
// memory overhead hit.
box := new([maxTinySize / goarch.PtrSize]uintptr)
argv = (*S)(unsafe.Pointer(box))
} else {
argv = new(S)
}
// Closure must escape.
fv := *(**funcval)(unsafe.Pointer(&fn))
fv = abi.Escape(fv)
*argv = arg
// Find the containing object.
base, _, _ := findObject(usptr, 0, 0)
@@ -120,7 +133,16 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup {
gcCleanups.createGs()
}
id := addCleanup(unsafe.Pointer(ptr), fv)
id := addCleanup(unsafe.Pointer(ptr), cleanupFn{
// Instantiate a caller function to call the cleanup, that is cleanup(*argv).
//
// TODO(mknyszek): This allocates because the generic dictionary argument
// gets closed over, but callCleanup doesn't even use the dictionary argument,
// so theoretically that could be removed, eliminating an allocation.
call: callCleanup[S],
fn: *(**funcval)(unsafe.Pointer(&cleanup)),
arg: unsafe.Pointer(argv),
})
if debug.checkfinalizers != 0 {
cleanupFn := *(**funcval)(unsafe.Pointer(&cleanup))
setCleanupContext(unsafe.Pointer(ptr), abi.TypeFor[T](), sys.GetCallerPC(), cleanupFn.fn, id)
@@ -131,6 +153,16 @@ func AddCleanup[T, S any](ptr *T, cleanup func(S), arg S) Cleanup {
}
}
// callCleanup is a helper for calling cleanups in a polymorphic way.
//
// In practice, all it does is call fn(*arg). arg must be a *T.
//
//go:noinline
func callCleanup[T any](fn *funcval, arg unsafe.Pointer) {
cleanup := *(*func(T))(unsafe.Pointer(&fn))
cleanup(*(*T)(arg))
}
// Cleanup is a handle to a cleanup call for a specific object.
type Cleanup struct {
// id is the unique identifier for the cleanup within the arena.
@@ -216,7 +248,17 @@ const cleanupBlockSize = 512
// that the cleanup queue does not grow during marking (but it can shrink).
type cleanupBlock struct {
cleanupBlockHeader
cleanups [(cleanupBlockSize - unsafe.Sizeof(cleanupBlockHeader{})) / goarch.PtrSize]*funcval
cleanups [(cleanupBlockSize - unsafe.Sizeof(cleanupBlockHeader{})) / unsafe.Sizeof(cleanupFn{})]cleanupFn
}
var cleanupFnPtrMask = [...]uint8{0b111}
// cleanupFn represents a cleanup function with its argument, yet to be called.
type cleanupFn struct {
// call is an adapter function that understands how to safely call fn(*arg).
call func(*funcval, unsafe.Pointer)
fn *funcval // cleanup function passed to AddCleanup.
arg unsafe.Pointer // pointer to argument to pass to cleanup function.
}
var cleanupBlockPtrMask [cleanupBlockSize / goarch.PtrSize / 8]byte
@@ -245,8 +287,8 @@ type cleanupBlockHeader struct {
//
// Must only be called if the GC is in the sweep phase (gcphase == _GCoff),
// because it does not synchronize with the garbage collector.
func (b *cleanupBlock) enqueue(fn *funcval) bool {
b.cleanups[b.n] = fn
func (b *cleanupBlock) enqueue(c cleanupFn) bool {
b.cleanups[b.n] = c
b.n++
return b.full()
}
@@ -375,7 +417,7 @@ func (q *cleanupQueue) tryTakeWork() bool {
// enqueue queues a single cleanup for execution.
//
// Called by the sweeper, and only the sweeper.
func (q *cleanupQueue) enqueue(fn *funcval) {
func (q *cleanupQueue) enqueue(c cleanupFn) {
mp := acquirem()
pp := mp.p.ptr()
b := pp.cleanups
@@ -396,7 +438,7 @@ func (q *cleanupQueue) enqueue(fn *funcval) {
}
pp.cleanups = b
}
if full := b.enqueue(fn); full {
if full := b.enqueue(c); full {
q.full.push(&b.lfnode)
pp.cleanups = nil
q.addWork(1)
@@ -641,7 +683,8 @@ func runCleanups() {
gcCleanups.beginRunningCleanups()
for i := 0; i < int(b.n); i++ {
fn := b.cleanups[i]
c := b.cleanups[i]
b.cleanups[i] = cleanupFn{}
var racectx uintptr
if raceenabled {
@@ -650,20 +693,15 @@
// the same goroutine.
//
// Synchronize on fn. This would fail to find races on the
// closed-over values in fn (suppose fn is passed to multiple
// AddCleanup calls) if fn was not unique, but it is. Update
// the synchronization on fn if you intend to optimize it
// and store the cleanup function and cleanup argument on the
// queue directly.
racerelease(unsafe.Pointer(fn))
// closed-over values in fn (suppose arg is passed to multiple
// AddCleanup calls) if arg was not unique, but it is.
racerelease(unsafe.Pointer(c.arg))
racectx = raceEnterNewCtx()
raceacquire(unsafe.Pointer(fn))
raceacquire(unsafe.Pointer(c.arg))
}
// Execute the next cleanup.
cleanup := *(*func())(unsafe.Pointer(&fn))
cleanup()
b.cleanups[i] = nil
c.call(c.fn, c.arg)
if raceenabled {
// Restore the old context.


@@ -204,7 +204,7 @@ func gcMarkRootCheck() {
})
}
// ptrmask for an allocation containing a single pointer.
// oneptrmask for an allocation containing a single pointer.
var oneptrmask = [...]uint8{1}
// markroot scans the i'th root.
@@ -251,7 +251,7 @@ func markroot(gcw *gcWork, i uint32, flushBgCredit bool) int64 {
// N.B. This only needs to synchronize with cleanup execution, which only resets these blocks.
// All cleanup queueing happens during sweep.
n := uintptr(atomic.Load(&cb.n))
scanblock(uintptr(unsafe.Pointer(&cb.cleanups[0])), n*goarch.PtrSize, &cleanupBlockPtrMask[0], gcw, nil)
scanblock(uintptr(unsafe.Pointer(&cb.cleanups[0])), n*unsafe.Sizeof(cleanupFn{}), &cleanupBlockPtrMask[0], gcw, nil)
}
case work.baseSpans <= i && i < work.baseStacks:
@@ -489,7 +489,7 @@ func gcScanFinalizer(spf *specialfinalizer, s *mspan, gcw *gcWork) {
// gcScanCleanup scans the relevant parts of a cleanup special as a root.
func gcScanCleanup(spc *specialCleanup, gcw *gcWork) {
// The special itself is a root.
scanblock(uintptr(unsafe.Pointer(&spc.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil)
scanblock(uintptr(unsafe.Pointer(&spc.cleanup)), unsafe.Sizeof(cleanupFn{}), &cleanupFnPtrMask[0], gcw, nil)
}
// gcAssistAlloc performs GC work to make gp's assist debt positive.


@@ -2161,7 +2161,7 @@ func removefinalizer(p unsafe.Pointer) {
type specialCleanup struct {
_ sys.NotInHeap
special special
fn *funcval
cleanup cleanupFn
// Globally unique ID for the cleanup, obtained from mheap_.cleanupID.
id uint64
}
@@ -2170,14 +2170,18 @@ type specialCleanup struct {
// cleanups are allowed on an object, and even the same pointer.
// A cleanup id is returned which can be used to uniquely identify
// the cleanup.
func addCleanup(p unsafe.Pointer, f *funcval) uint64 {
func addCleanup(p unsafe.Pointer, c cleanupFn) uint64 {
// TODO(mknyszek): Consider pooling specialCleanups on the P
// so we don't have to take the lock every time. Just locking
// is a considerable part of the cost of AddCleanup. This
// would also require reserving some cleanup IDs on the P.
lock(&mheap_.speciallock)
s := (*specialCleanup)(mheap_.specialCleanupAlloc.alloc())
mheap_.cleanupID++ // Increment first. ID 0 is reserved.
id := mheap_.cleanupID
unlock(&mheap_.speciallock)
s.special.kind = _KindSpecialCleanup
s.fn = f
s.cleanup = c
s.id = id
mp := acquirem()
@@ -2187,17 +2191,16 @@ func addCleanup(p unsafe.Pointer, f *funcval) uint64 {
// situation where it's possible that markrootSpans
// has already run but mark termination hasn't yet.
if gcphase != _GCoff {
gcw := &mp.p.ptr().gcw
// Mark the cleanup itself, since the
// special isn't part of the GC'd heap.
scanblock(uintptr(unsafe.Pointer(&s.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil)
gcScanCleanup(s, &mp.p.ptr().gcw)
}
releasem(mp)
// Keep f alive. There's a window in this function where it's
// only reachable via the special while the special hasn't been
// added to the specials list yet. This is similar to a bug
// Keep c and its referents alive. There's a window in this function
// where it's only reachable via the special while the special hasn't
// been added to the specials list yet. This is similar to a bug
// discovered for weak handles, see #70455.
KeepAlive(f)
KeepAlive(c)
return id
}
@@ -2800,7 +2803,7 @@ func freeSpecial(s *special, p unsafe.Pointer, size uintptr) {
// Cleanups, unlike finalizers, do not resurrect the objects
// they're attached to, so we only need to pass the cleanup
// function, not the object.
gcCleanups.enqueue(sc.fn)
gcCleanups.enqueue(sc.cleanup)
lock(&mheap_.speciallock)
mheap_.specialCleanupAlloc.free(unsafe.Pointer(sc))
unlock(&mheap_.speciallock)