runtime: add valgrind instrumentation

Add build tag gated Valgrind annotations to the runtime which let it understand how the runtime manages memory. This allows for Go binaries to be run under Valgrind without emitting spurious errors. Instead of adding the Valgrind headers to the tree, and using cgo to call the various Valgrind client request macros, we just add an assembly function which emits the necessary instructions to trigger client requests. In particular we add instrumentation of the memory allocator, using a two-level mempool structure (as described in the Valgrind manual [0]). We also add annotations which allow Valgrind to track which memory we use for stacks, which seems necessary to let it properly function. We describe the memory model to Valgrind as follows: we treat heap arenas as a "pool" created with VALGRIND_CREATE_MEMPOOL_EXT (so that we can use VALGRIND_MEMPOOL_METAPOOL and VALGRIND_MEMPOOL_AUTO_FREE). Within the pool we treat spans as "superblocks", annotated with VALGRIND_MEMPOOL_ALLOC. We then allocate individual objects within spans with VALGRIND_MALLOCLIKE_BLOCK. It should be noted that running binaries under Valgrind can be _quite slow_, and certain operations, such as running the GC, can be _very slow_. It is recommended to run programs with GOGC=off. Additionally, async preemption should be turned off, since it'll cause strange behavior (GODEBUG=asyncpreemptoff=1). Running Valgrind with --leak-check=yes will result in some errors resulting from some things not being marked fully free'd. These likely need more annotations to rectify, but for now it is recommended to run with --leak-check=off. Updates #73602 [0] https://valgrind.org/docs/manual/mc-manual.html#mc-manual.mempools Change-Id: I71b26c47d7084de71ef1e03947ef6b1cc6d38301 Reviewed-on: https://go-review.googlesource.com/c/go/+/674077 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Michael Knyszek <mknyszek@google.com>
2025-10-19 11:03:18 +00:00 · 2025-03-22 00:58:55 +00:00 · 2025-03-22 00:58:55 +00:00 · 40b19b56a9
commit 40b19b56a9
parent 2a5ac1a993
15 changed files with 364 additions and 3 deletions
--- a/src/os/pidfd_linux.go
+++ b/src/os/pidfd_linux.go
@ -170,7 +170,10 @@ func checkPidfd() error {

 	// Check waitid(P_PIDFD) works.
 	err = ignoringEINTR(func() error {
-		return unix.Waitid(unix.P_PIDFD, int(fd), nil, syscall.WEXITED, nil)
+		var info unix.SiginfoChild
+		// We don't actually care about the info, but passing a nil pointer
+		// makes valgrind complain because 0x0 is unaddressable.
+		return unix.Waitid(unix.P_PIDFD, int(fd), &info, syscall.WEXITED, nil)
 	})
 	// Expect ECHILD from waitid since we're not our own parent.
 	if err != syscall.ECHILD {
--- a/src/runtime/arena.go
+++ b/src/runtime/arena.go
@ -950,6 +950,9 @@ func freeUserArenaChunk(s *mspan, x unsafe.Pointer) {
 	if asanenabled {
 		asanpoison(unsafe.Pointer(s.base()), s.elemsize)
 	}
+	if valgrindenabled {
+		valgrindFree(unsafe.Pointer(s.base()))
+	}

 	// Make ourselves non-preemptible as we manipulate state and statistics.
 	//
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@ -754,6 +754,11 @@ func (h *mheap) sysAlloc(n uintptr, hintList **arenaHint, arenaList *[]arenaIdx)
 	}

 mapped:
+	if valgrindenabled {
+		valgrindCreateMempool(v)
+		valgrindMakeMemNoAccess(v, size)
+	}
+
 	// Create arena metadata.
 	for ri := arenaIndex(uintptr(v)); ri <= arenaIndex(uintptr(v)+size-1); ri++ {
 		l2 := h.arenas[ri.l1()]
@ -1084,6 +1089,9 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 		asanpoison(unsafe.Add(x, size-asanRZ), asanRZ)
 		asanunpoison(x, size-asanRZ)
 	}
+	if valgrindenabled {
+		valgrindMalloc(x, size-asanRZ)
+	}

 	// Adjust our GC assist debt to account for internal fragmentation.
 	if gcBlackenEnabled != 0 && elemsize != 0 {
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@ -315,6 +315,10 @@ func markrootFreeGStacks() {
 		stackfree(gp.stack)
 		gp.stack.lo = 0
 		gp.stack.hi = 0
+		if valgrindenabled {
+			valgrindDeregisterStack(gp.valgrindStackID)
+			gp.valgrindStackID = 0
+		}
 	}

 	q := gQueue{list.head, tail.guintptr(), list.size}
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@ -641,6 +641,9 @@ func (sl *sweepLocked) sweep(preserve bool) bool {
 				if asanenabled && !s.isUserArenaChunk {
 					asanpoison(unsafe.Pointer(x), size)
 				}
+				if valgrindenabled && !s.isUserArenaChunk {
+					valgrindFree(unsafe.Pointer(x))
+				}
 			}
 			mbits.advance()
 			abits.advance()
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@ -1388,6 +1388,10 @@ HaveSpan:
 	// Initialize the span.
 	h.initSpan(s, typ, spanclass, base, npages)

+	if valgrindenabled {
+		valgrindMempoolMalloc(unsafe.Pointer(arenaBase(arenaIndex(base))), unsafe.Pointer(base), npages*pageSize)
+	}
+
 	// Commit and account for any scavenged memory that the span now owns.
 	nbytes := npages * pageSize
 	if scav != 0 {
@ -1643,6 +1647,10 @@ func (h *mheap) freeSpan(s *mspan) {
 			bytes := s.npages << gc.PageShift
 			asanpoison(base, bytes)
 		}
+		if valgrindenabled {
+			base := s.base()
+			valgrindMempoolFree(unsafe.Pointer(arenaBase(arenaIndex(base))), unsafe.Pointer(base))
+		}
 		h.freeSpanLocked(s, spanAllocHeap)
 		unlock(&h.lock)
 	})
@ -1671,6 +1679,10 @@ func (h *mheap) freeManual(s *mspan, typ spanAllocType) {

 	s.needzero = 1
 	lock(&h.lock)
+	if valgrindenabled {
+		base := s.base()
+		valgrindMempoolFree(unsafe.Pointer(arenaBase(arenaIndex(base))), unsafe.Pointer(base))
+	}
 	h.freeSpanLocked(s, typ)
 	unlock(&h.lock)
 }
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@ -1955,6 +1955,10 @@ func mexit(osStack bool) {
 	// Free the gsignal stack.
 	if mp.gsignal != nil {
 		stackfree(mp.gsignal.stack)
+		if valgrindenabled {
+			valgrindDeregisterStack(mp.gsignal.valgrindStackID)
+			mp.gsignal.valgrindStackID = 0
+		}
 		// On some platforms, when calling into VDSO (e.g. nanotime)
 		// we store our g on the gsignal stack, if there is one.
 		// Now the stack is freed, unlink it from the m, so we
@ -2252,6 +2256,10 @@ func allocm(pp *p, fn func(), id int64) *m {
 				// startm.
 				systemstack(func() {
 					stackfree(freem.g0.stack)
+					if valgrindenabled {
+						valgrindDeregisterStack(freem.g0.valgrindStackID)
+						freem.g0.valgrindStackID = 0
+					}
 				})
 			}
 			freem = freem.freelink
@ -5046,6 +5054,9 @@ func malg(stacksize int32) *g {
 		stacksize = round2(stackSystem + stacksize)
 		systemstack(func() {
 			newg.stack = stackalloc(uint32(stacksize))
+			if valgrindenabled {
+				newg.valgrindStackID = valgrindRegisterStack(unsafe.Pointer(newg.stack.lo), unsafe.Pointer(newg.stack.hi))
+			}
 		})
 		newg.stackguard0 = newg.stack.lo + stackGuard
 		newg.stackguard1 = ^uintptr(0)
@ -5234,6 +5245,10 @@ func gfput(pp *p, gp *g) {
 		gp.stack.lo = 0
 		gp.stack.hi = 0
 		gp.stackguard0 = 0
+		if valgrindenabled {
+			valgrindDeregisterStack(gp.valgrindStackID)
+			gp.valgrindStackID = 0
+		}
 	}

 	pp.gFree.push(gp)
@ -5291,12 +5306,19 @@ retry:
 			gp.stack.lo = 0
 			gp.stack.hi = 0
 			gp.stackguard0 = 0
+			if valgrindenabled {
+				valgrindDeregisterStack(gp.valgrindStackID)
+				gp.valgrindStackID = 0
+			}
 		})
 	}
 	if gp.stack.lo == 0 {
 		// Stack was deallocated in gfput or just above. Allocate a new one.
 		systemstack(func() {
 			gp.stack = stackalloc(startingStackSize)
+			if valgrindenabled {
+				gp.valgrindStackID = valgrindRegisterStack(unsafe.Pointer(gp.stack.lo), unsafe.Pointer(gp.stack.hi))
+			}
 		})
 		gp.stackguard0 = gp.stack.lo + stackGuard
 	} else {
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@ -504,6 +504,10 @@ type g struct {
 	// and check for debt in the malloc hot path. The assist ratio
 	// determines how this corresponds to scan work debt.
 	gcAssistBytes int64
+
+	// valgrindStackID is used to track what memory is used for stacks when a program is
+	// built with the "valgrind" build tag, otherwise it is unused.
+	valgrindStackID uintptr
 }

 // gTrackingPeriod is the number of transitions out of _Grunning between
--- a/src/runtime/sizeof_test.go
+++ b/src/runtime/sizeof_test.go
@ -20,7 +20,7 @@ func TestSizeof(t *testing.T) {
 		_32bit uintptr // size on 32bit platforms
 		_64bit uintptr // size on 64bit platforms
 	}{
-		{runtime.G{}, 276, 432},   // g, but exported for testing
+		{runtime.G{}, 280, 440},   // g, but exported for testing
 		{runtime.Sudog{}, 56, 88}, // sudog, but exported for testing
 	}

--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@ -211,6 +211,13 @@ func stackpoolalloc(order uint8) gclinkptr {
 		s.elemsize = fixedStack << order
 		for i := uintptr(0); i < _StackCacheSize; i += s.elemsize {
 			x := gclinkptr(s.base() + i)
+			if valgrindenabled {
+				// The address of x.ptr() becomes the base of stacks. We need to
+				// mark it allocated here and in stackfree and stackpoolfree, and free'd in
+				// stackalloc in order to avoid overlapping allocations and
+				// uninitialized memory errors in valgrind.
+				valgrindMalloc(unsafe.Pointer(x.ptr()), unsafe.Sizeof(x.ptr()))
+			}
 			x.ptr().next = s.manualFreeList
 			s.manualFreeList = x
 		}
@ -388,6 +395,12 @@ func stackalloc(n uint32) stack {
 			c.stackcache[order].list = x.ptr().next
 			c.stackcache[order].size -= uintptr(n)
 		}
+		if valgrindenabled {
+			// We're about to allocate the stack region starting at x.ptr().
+			// To prevent valgrind from complaining about overlapping allocations,
+			// we need to mark the (previously allocated) memory as free'd.
+			valgrindFree(unsafe.Pointer(x.ptr()))
+		}
 		v = unsafe.Pointer(x)
 	} else {
 		var s *mspan
@ -432,6 +445,9 @@ func stackalloc(n uint32) stack {
 	if asanenabled {
 		asanunpoison(v, uintptr(n))
 	}
+	if valgrindenabled {
+		valgrindMalloc(v, uintptr(n))
+	}
 	if stackDebug >= 1 {
 		print("  allocated ", v, "\n")
 	}
@ -479,6 +495,9 @@ func stackfree(stk stack) {
 	if asanenabled {
 		asanpoison(v, n)
 	}
+	if valgrindenabled {
+		valgrindFree(v)
+	}
 	if n < fixedStack<<_NumStackOrders && n < _StackCacheSize {
 		order := uint8(0)
 		n2 := n
@ -489,6 +508,11 @@ func stackfree(stk stack) {
 		x := gclinkptr(v)
 		if stackNoCache != 0 || gp.m.p == 0 || gp.m.preemptoff != "" {
 			lock(&stackpool[order].item.mu)
+			if valgrindenabled {
+				// x.ptr() is the head of the list of free stacks, and will be used
+				// when allocating a new stack, so it has to be marked allocated.
+				valgrindMalloc(unsafe.Pointer(x.ptr()), unsafe.Sizeof(x.ptr()))
+			}
 			stackpoolfree(x, order)
 			unlock(&stackpool[order].item.mu)
 		} else {
@ -496,6 +520,12 @@ func stackfree(stk stack) {
 			if c.stackcache[order].size >= _StackCacheSize {
 				stackcacherelease(c, order)
 			}
+			if valgrindenabled {
+				// x.ptr() is the head of the list of free stacks, and will
+				// be used when allocating a new stack, so it has to be
+				// marked allocated.
+				valgrindMalloc(unsafe.Pointer(x.ptr()), unsafe.Sizeof(x.ptr()))
+			}
 			x.ptr().next = c.stackcache[order].list
 			c.stackcache[order].list = x
 			c.stackcache[order].size += n
@ -583,6 +613,16 @@ func adjustpointer(adjinfo *adjustinfo, vpp unsafe.Pointer) {
 	if stackDebug >= 4 {
 		print("        ", pp, ":", hex(p), "\n")
 	}
+	if valgrindenabled {
+		// p is a pointer on a stack, it is inherently initialized, as
+		// everything on the stack is, but valgrind for _some unknown reason_
+		// sometimes thinks it's uninitialized, and flags operations on p below
+		// as uninitialized. We just unconditionally mark it as defined, in case
+		// valgrind thinks it's uninitialized.
+		//
+		// See go.dev/issues/73801.
+		valgrindMakeMemDefined(unsafe.Pointer(&p), unsafe.Sizeof(&p))
+	}
 	if adjinfo.old.lo <= p && p < adjinfo.old.hi {
 		*pp = p + adjinfo.delta
 		if stackDebug >= 3 {
@ -936,6 +976,14 @@ func copystack(gp *g, newsize uintptr) {
 		adjustframe(&u.frame, &adjinfo)
 	}

+	if valgrindenabled {
+		if gp.valgrindStackID == 0 {
+			gp.valgrindStackID = valgrindRegisterStack(unsafe.Pointer(new.lo), unsafe.Pointer(new.hi))
+		} else {
+			valgrindChangeStack(gp.valgrindStackID, unsafe.Pointer(new.lo), unsafe.Pointer(new.hi))
+		}
+	}
+
 	// free old stack
 	if stackPoisonCopy != 0 {
 		fillstack(old, 0xfc)
--- a/src/runtime/valgrind.go
+++ b/src/runtime/valgrind.go
@ -0,0 +1,138 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build valgrind && linux && (arm64 || amd64)
+
+package runtime
+
+import "unsafe"
+
+const valgrindenabled = true
+
+// Valgrind provides a mechanism to allow programs under test to modify
+// Valgrind's behavior in certain ways, referred to as client requests [0]. These
+// requests are triggered by putting the address of a series of uints in a specific
+// register and emitting a very specific sequence of assembly instructions. The
+// result of the request (if there is one) is then put in another register for
+// the program to retrieve. Each request is identified by a unique uint, which
+// is passed as the first "argument".
+//
+// Valgrind provides headers (valgrind/valgrind.h, valgrind/memcheck.h) with
+// macros that emit the correct assembly for these requests. Instead of copying
+// these headers into the tree and using cgo to call the macros, we implement
+// the client request assembly ourselves. Since both the magic instruction
+// sequences and the request uints are stable, it is safe for us to implement them.
+//
+// The client requests we add are used to describe our memory allocator to
+// Valgrind, per [1]. We describe the allocator using the two-level mempool
+// structure described there. We also add annotations which allow Valgrind to track which
+// memory we use for stacks, which seems necessary to let it properly function.
+//
+// We describe the memory model to Valgrind as follows: we treat heap arenas as
+// "pools" created with VALGRIND_CREATE_MEMPOOL_EXT (so that we can use
+// VALGRIND_MEMPOOL_METAPOOL and VALGRIND_MEMPOOL_AUTO_FREE). Within the pool we
+// treat spans as "superblocks", annotated with VALGRIND_MEMPOOL_ALLOC. We then
+// allocate individual objects within spans with VALGRIND_MALLOCLIKE_BLOCK.
+//
+// [0] https://valgrind.org/docs/manual/manual-core-adv.html#manual-core-adv.clientreq
+// [1] https://valgrind.org/docs/manual/mc-manual.html#mc-manual.mempools
+
+const (
+	// Valgrind request IDs, copied from valgrind/valgrind.h.
+	vg_userreq__malloclike_block = 0x1301
+	vg_userreq__freelike_block   = 0x1302
+	vg_userreq__create_mempool   = 0x1303
+	vg_userreq__mempool_alloc    = 0x1305
+	vg_userreq__mempool_free     = 0x1306
+	vg_userreq__stack_register   = 0x1501
+	vg_userreq__stack_deregister = 0x1502
+	vg_userreq__stack_change     = 0x1503
+)
+
+const (
+	// Memcheck request IDs are offset from ('M'&0xff) << 24 | ('C'&0xff) << 16, or 0x4d430000,
+	// copied from valgrind/memcheck.h.
+	vg_userreq__make_mem_noaccess = iota + ('M'&0xff)<<24 | ('C'&0xff)<<16
+	vg_userreq__make_mem_undefined
+	vg_userreq__make_mem_defined
+)
+
+const (
+	// VALGRIND_CREATE_MEMPOOL_EXT flags, copied from valgrind/valgrind.h.
+	valgrind_mempool_auto_free = 1
+	valgrind_mempool_metapool  = 2
+)
+
+//
+
+//go:noescape
+func valgrindClientRequest(uintptr, uintptr, uintptr, uintptr, uintptr, uintptr) uintptr
+
+//go:nosplit
+func valgrindRegisterStack(start, end unsafe.Pointer) uintptr {
+	// VALGRIND_STACK_REGISTER
+	return valgrindClientRequest(vg_userreq__stack_register, uintptr(start), uintptr(end), 0, 0, 0)
+}
+
+//go:nosplit
+func valgrindDeregisterStack(id uintptr) {
+	// VALGRIND_STACK_DEREGISTER
+	valgrindClientRequest(vg_userreq__stack_deregister, id, 0, 0, 0, 0)
+}
+
+//go:nosplit
+func valgrindChangeStack(id uintptr, start, end unsafe.Pointer) {
+	// VALGRIND_STACK_CHANGE
+	valgrindClientRequest(vg_userreq__stack_change, id, uintptr(start), uintptr(end), 0, 0)
+}
+
+//go:nosplit
+func valgrindMalloc(addr unsafe.Pointer, size uintptr) {
+	// VALGRIND_MALLOCLIKE_BLOCK
+	valgrindClientRequest(vg_userreq__malloclike_block, uintptr(addr), size, 0, 1, 0)
+}
+
+//go:nosplit
+func valgrindFree(addr unsafe.Pointer) {
+	// VALGRIND_FREELIKE_BLOCK
+	valgrindClientRequest(vg_userreq__freelike_block, uintptr(addr), 0, 0, 0, 0)
+}
+
+//go:nosplit
+func valgrindCreateMempool(addr unsafe.Pointer) {
+	// VALGRIND_CREATE_MEMPOOL_EXT
+	valgrindClientRequest(vg_userreq__create_mempool, uintptr(addr), 0, 1, valgrind_mempool_auto_free|valgrind_mempool_metapool, 0)
+}
+
+//go:nosplit
+func valgrindMempoolMalloc(pool, addr unsafe.Pointer, size uintptr) {
+	// VALGRIND_MEMPOOL_ALLOC
+	valgrindClientRequest(vg_userreq__mempool_alloc, uintptr(pool), uintptr(addr), size, 0, 0)
+}
+
+//go:nosplit
+func valgrindMempoolFree(pool, addr unsafe.Pointer) {
+	// VALGRIND_MEMPOOL_FREE
+	valgrindClientRequest(vg_userreq__mempool_free, uintptr(pool), uintptr(addr), 0, 0, 0)
+}
+
+// Memcheck client requests, copied from valgrind/memcheck.h
+
+//go:nosplit
+func valgrindMakeMemUndefined(addr unsafe.Pointer, size uintptr) {
+	// VALGRIND_MAKE_MEM_UNDEFINED
+	valgrindClientRequest(vg_userreq__make_mem_undefined, uintptr(addr), size, 0, 0, 0)
+}
+
+//go:nosplit
+func valgrindMakeMemDefined(addr unsafe.Pointer, size uintptr) {
+	// VALGRIND_MAKE_MEM_DEFINED
+	valgrindClientRequest(vg_userreq__make_mem_defined, uintptr(addr), size, 0, 0, 0)
+}
+
+//go:nosplit
+func valgrindMakeMemNoAccess(addr unsafe.Pointer, size uintptr) {
+	// VALGRIND_MAKE_MEM_NOACCESS
+	valgrindClientRequest(vg_userreq__make_mem_noaccess, uintptr(addr), size, 0, 0, 0)
+}
--- a/src/runtime/valgrind0.go
+++ b/src/runtime/valgrind0.go
@ -0,0 +1,25 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Valgrind instrumentation is only available on linux amd64 and arm64.
+
+//go:build !valgrind || !linux || (!amd64 && !arm64)
+
+package runtime
+
+import "unsafe"
+
+const valgrindenabled = false
+
+func valgrindRegisterStack(start, end unsafe.Pointer) uintptr       { return 0 }
+func valgrindDeregisterStack(id uintptr)                            {}
+func valgrindChangeStack(id uintptr, start, end unsafe.Pointer)     {}
+func valgrindMalloc(addr unsafe.Pointer, size uintptr)              {}
+func valgrindFree(addr unsafe.Pointer)                              {}
+func valgrindCreateMempool(addr unsafe.Pointer)                     {}
+func valgrindMempoolMalloc(pool, addr unsafe.Pointer, size uintptr) {}
+func valgrindMempoolFree(pool, addr unsafe.Pointer)                 {}
+func valgrindMakeMemUndefined(addr unsafe.Pointer, size uintptr)    {}
+func valgrindMakeMemDefined(addr unsafe.Pointer, size uintptr)      {}
+func valgrindMakeMemNoAccess(addr unsafe.Pointer, size uintptr)     {}
--- a/src/runtime/valgrind_amd64.s
+++ b/src/runtime/valgrind_amd64.s
@ -0,0 +1,37 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build valgrind && linux
+
+#include "textflag.h"
+
+// Instead of using cgo and using the Valgrind macros, we just emit the special client request
+// assembly ourselves. The client request mechanism is basically the same across all architectures,
+// with the notable difference being the special preamble that lets Valgrind know we want to do
+// a client request.
+//
+// The form of the VALGRIND_DO_CLIENT_REQUEST macro assembly can be found in the valgrind/valgrind.h
+// header file [0].
+//
+// [0] https://sourceware.org/git/?p=valgrind.git;a=blob;f=include/valgrind.h.in;h=f1710924aa7372e7b7e2abfbf7366a2286e33d2d;hb=HEAD
+
+// func valgrindClientRequest(uintptr, uintptr, uintptr, uintptr, uintptr, uintptr) (ret uintptr)
+TEXT runtime·valgrindClientRequest(SB), NOSPLIT, $0-56
+	// Load the address of the first of the (contiguous) arguments into AX.
+	LEAQ args+0(FP), AX
+
+	// Zero DX, since some requests may not populate it.
+	XORL DX, DX
+
+	// Emit the special preamble.
+	ROLQ $3, DI; ROLQ $13, DI
+	ROLQ $61, DI; ROLQ $51, DI
+
+	// "Execute" the client request.
+	XCHGQ BX, BX
+
+	// Copy the result out of DX.
+	MOVQ DX, ret+48(FP)
+
+	RET
--- a/src/runtime/valgrind_arm64.s
+++ b/src/runtime/valgrind_arm64.s
@ -0,0 +1,29 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build valgrind && linux
+
+#include "textflag.h"
+
+// See valgrind_amd64.s for notes about this assembly.
+
+// func valgrindClientRequest(uintptr, uintptr, uintptr, uintptr, uintptr, uintptr) (ret uintptr)
+TEXT runtime·valgrindClientRequest(SB), NOSPLIT, $0-56
+	// Load the address of the first of the (contiguous) arguments into x4.
+	MOVD $args+0(FP), R4
+
+	// Zero x3, since some requests may not populate it.
+	MOVD ZR, R3
+
+	// Emit the special preamble.
+	ROR $3, R12; ROR $13, R12
+	ROR $51, R12; ROR $61, R12
+
+	// "Execute" the client request.
+	ORR R10, R10
+
+	// Copy the result out of x3.
+	MOVD R3, ret+48(FP)
+
+	RET
--- a/src/syscall/exec_linux.go
+++ b/src/syscall/exec_linux.go
@ -800,9 +800,34 @@ func os_checkClonePidfd() error {
 	// pidfd.
 	defer Close(int(pidfd))

+	// TODO(roland): this is necessary to prevent valgrind from complaining
+	// about passing 0x0 to waitid, which it doesn't like. This is clearly not
+	// ideal. The structures are copied (mostly) verbatim from syscall/unix,
+	// which we obviously cannot import because of an import loop.
+
+	const is64bit = ^uint(0) >> 63 // 0 for 32-bit hosts, 1 for 64-bit ones.
+	type sigInfo struct {
+		Signo int32
+		_     struct {
+			Errno int32
+			Code  int32
+		} // Two int32 fields, swapped on MIPS.
+		_ [is64bit]int32 // Extra padding for 64-bit hosts only.
+
+		// End of common part. Beginning of signal-specific part.
+
+		Pid    int32
+		Uid    uint32
+		Status int32
+
+		// Pad to 128 bytes.
+		_ [128 - (6+is64bit)*4]byte
+	}
+
 	for {
 		const _P_PIDFD = 3
-		_, _, errno = Syscall6(SYS_WAITID, _P_PIDFD, uintptr(pidfd), 0, WEXITED|WCLONE, 0, 0)
+		var info sigInfo
+		_, _, errno = Syscall6(SYS_WAITID, _P_PIDFD, uintptr(pidfd), uintptr(unsafe.Pointer(&info)), WEXITED|WCLONE, 0, 0)
 		if errno != EINTR {
 			break
 		}