runtime: save scalar registers off stack in amd64 async preemption

Asynchronous preemption must save all registers that could be in use
by Go code. Currently, it saves all of these to the goroutine stack.
As a result, the stack frame requirements of asynchronous preemption
can be rather high. On amd64, this requires 368 bytes of stack space,
256 bytes of which are the 16 XMM registers. On several RISC
architectures, the frame is around 0.5 KiB.

As we add support for SIMD instructions, this is going to become a
problem. The AVX-512 register state is 2.5 KiB. That well exceeds the
nosplit limit, and even if it didn't, it could constrain when we can
asynchronously preempt goroutines on small stacks.

This CL addresses that by moving purely scalar state held in non-GP
registers off the stack and into an allocated "extended register
state" object. To reduce space overhead, we allocate these objects
only as needed: while in the theoretical limit every G could need
this register state, in practice very few do at any given time.

However, we can't allocate while we're in the middle of saving the
register state during an asynchronous preemption, so we reserve
scratch space on every P to hold the register state temporarily;
Go code then copies it out to an allocated state object.
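
Below is a minimal, self-contained sketch of that save/restore flow. The
type and field names mirror the ones this CL adds (xRegs, xRegState,
p.xRegs.scratch, p.xRegs.cache), but the allocator here is an ordinary
slice-backed pool rather than the runtime's fixalloc, and the scratch
write that the asyncPreempt assembly performs is simulated:

	package main

	import "fmt"

	// Simplified stand-ins for the types this CL adds to the runtime.
	// The real xRegs is generated per-architecture by mkpreempt.go, and
	// xRegState is allocated from a fixalloc guarded by xRegAlloc.lock.
	type xRegs [256]byte // amd64: X0..X15, 16 bytes each

	type xRegState struct{ regs xRegs }

	type gExt struct{ state *xRegState } // per-G: set while preempted
	type pExt struct {
		scratch xRegs      // written by the asyncPreempt assembly
		cache   *xRegState // 1-element allocation cache / restore slot
	}

	var pool []*xRegState // stand-in for the runtime's fixalloc

	func alloc() *xRegState {
		if n := len(pool); n > 0 {
			x := pool[n-1]
			pool = pool[:n-1]
			return x
		}
		return new(xRegState)
	}

	// save: the preemption assembly has already dumped the vector
	// registers into p.scratch; move them into a long-lived block.
	func save(g *gExt, p *pExt) {
		dest := p.cache
		if dest != nil {
			p.cache = nil // reuse the cached block
		} else {
			dest = alloc()
		}
		dest.regs = p.scratch // copy out of the temporary scratch area
		g.state = dest
	}

	// restore: park the block on p.cache, where the assembly restore
	// path reads it; it is freed or reused on the next preemption.
	func restore(g *gExt, p *pExt) {
		if p.cache != nil {
			pool = append(pool, p.cache) // free the stale cached block
		}
		p.cache = g.state
		g.state = nil
	}

	func main() {
		var g gExt
		var p pExt
		p.scratch[0] = 42 // pretend asyncPreempt saved something
		save(&g, &p)
		restore(&g, &p)
		fmt.Println(p.cache.regs[0]) // 42: ready for the assembly to reload
	}

Reusing p.xRegs.cache this way avoids an allocation on the next
preemption and delays freeing the block until after the assembly has
reloaded the registers from it.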

This commit implements the new scheme only for amd64, since that's
where we're about to add much more vector state, but it lays the
groundwork for doing the same on any architecture that could benefit.

This is a cherry-pick of CL 680898 plus bug fix CL 684836 from the
dev.simd branch.

Change-Id: I123a95e21c11d5c10942d70e27f84d2d99bbf735
Reviewed-on: https://go-review.googlesource.com/c/go/+/669195
Auto-Submit: Austin Clements <austin@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Austin Clements authored 2025-04-29 22:55:40 -04:00; committed by Gopher Robot
parent e73afaae69
commit af0c4fe2ca
13 changed files with 387 additions and 66 deletions

@@ -554,6 +554,8 @@ type G = g
type Sudog = sudog
type XRegPerG = xRegPerG
func Getg() *G {
return getg()
}

@@ -70,6 +70,7 @@ const (
lockRankHchanLeaf
// WB
lockRankWbufSpans
lockRankXRegAlloc
lockRankMheap
lockRankMheapSpecial
lockRankGlobalAlloc
@@ -143,6 +144,7 @@ var lockNames = []string{
lockRankStackLarge: "stackLarge",
lockRankHchanLeaf: "hchanLeaf",
lockRankWbufSpans: "wbufSpans",
lockRankXRegAlloc: "xRegAlloc",
lockRankMheap: "mheap",
lockRankMheapSpecial: "mheapSpecial",
lockRankGlobalAlloc: "globalAlloc",
@@ -228,9 +230,10 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankStackLarge: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan},
lockRankHchanLeaf: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankHchanLeaf},
lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan},
lockRankXRegAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankTimerSend, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched},
lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans},
lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap},
lockRankGlobalAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankMheapSpecial},
lockRankGlobalAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankXRegAlloc, lockRankMheap, lockRankMheapSpecial},
lockRankTrace: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap},
lockRankTraceStackTab: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankTrace},
lockRankPanic: {},

@@ -821,6 +821,8 @@ func (h *mheap) init() {
}
h.pages.init(&h.lock, &memstats.gcMiscSys, false)
xRegInitAlloc()
}
// reclaim sweeps and reclaims at least npage pages into the heap.

@@ -193,6 +193,9 @@ defer,
# Below WB is the write barrier implementation.
< wbufSpans;
# xRegState allocator
sched < xRegAlloc;
# Span allocator
stackLarge,
stackpool,
@@ -205,7 +208,8 @@ stackLarge,
# an mspanSpecial lock, and they're part of the malloc implementation.
# Pinner bits might be freed by the span allocator.
mheap, mspanSpecial < mheapSpecial;
mheap, mheapSpecial < globalAlloc;
# Fixallocs
mheap, mheapSpecial, xRegAlloc < globalAlloc;
# Execution tracer events (with a P)
hchan,

@@ -9,8 +9,10 @@
package main
import (
"bytes"
"flag"
"fmt"
"go/format"
"io"
"log"
"os"
@@ -122,14 +124,19 @@ type gen struct {
goarch string
}
func (g *gen) asmHeader() {
func (g *gen) commonHeader() {
fmt.Fprintf(g.w, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n")
if beLe[g.goarch] {
base := g.goarch[:len(g.goarch)-1]
fmt.Fprintf(g.w, "//go:build %s || %sle\n\n", base, base)
}
}
func (g *gen) asmHeader() {
g.commonHeader()
fmt.Fprintf(g.w, "#include \"go_asm.h\"\n")
if g.goarch == "amd64" {
fmt.Fprintf(g.w, "#include \"go_tls.h\"\n")
fmt.Fprintf(g.w, "#include \"asm_amd64.h\"\n")
}
fmt.Fprintf(g.w, "#include \"textflag.h\"\n\n")
@@ -145,6 +152,43 @@ func (g *gen) label(l string) {
fmt.Fprintf(g.w, "%s\n", l)
}
// writeXRegs writes an architecture xregs file.
func writeXRegs(arch string, l *layout) {
var code bytes.Buffer
g := gen{&code, arch}
g.commonHeader()
fmt.Fprintf(g.w, `
package runtime
type xRegs struct {
`)
pos := 0
for _, reg := range l.regs {
if reg.pos != pos {
log.Fatalf("padding not implemented")
}
typ := fmt.Sprintf("[%d]byte", reg.size)
switch {
case reg.size == 4 && reg.pos%4 == 0:
typ = "uint32"
case reg.size == 8 && reg.pos%8 == 0:
typ = "uint64"
}
fmt.Fprintf(g.w, "\t%s %s\n", reg.reg, typ)
pos += reg.size
}
fmt.Fprintf(g.w, "}\n")
path := fmt.Sprintf("preempt_%s.go", arch)
b, err := format.Source(code.Bytes())
if err != nil {
log.Fatalf("formatting %s: %s", path, err)
}
if err := os.WriteFile(path, b, 0666); err != nil {
log.Fatal(err)
}
}
type layout struct {
stack int
regs []regPos
@@ -152,7 +196,7 @@ type layout struct {
}
type regPos struct {
pos int
pos, size int
saveOp string
restoreOp string
@@ -165,17 +209,17 @@ type regPos struct {
}
func (l *layout) add(op, reg string, size int) {
l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack})
l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack, size: size})
l.stack += size
}
func (l *layout) add2(sop, rop, reg string, size int) {
l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack})
l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack, size: size})
l.stack += size
}
func (l *layout) addSpecial(save, restore string, size int) {
l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack})
l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack, size: size})
l.stack += size
}
@@ -239,6 +283,8 @@ func gen386(g *gen) {
}
func genAMD64(g *gen) {
const xReg = "AX" // *xRegState
p := g.p
// Assign stack offsets.
@@ -251,12 +297,13 @@ func genAMD64(g *gen) {
l.add("MOVQ", reg, 8)
}
}
lSSE := layout{stack: l.stack, sp: "SP"}
lXRegs := layout{sp: xReg} // Non-GP registers
for _, reg := range regNamesAMD64 {
if strings.HasPrefix(reg, "X") {
lSSE.add("MOVUPS", reg, 16)
lXRegs.add("MOVUPS", reg, 16)
}
}
writeXRegs(g.goarch, &lXRegs)
// TODO: MXCSR register?
@@ -265,17 +312,40 @@
p("// Save flags before clobbering them")
p("PUSHFQ")
p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP")
p("ADJSP $%d", lSSE.stack)
p("ADJSP $%d", l.stack)
p("// But vet doesn't know ADJSP, so suppress vet stack checking")
p("NOP SP")
p("// Save GPs")
l.save(g)
lSSE.save(g)
// In general, the limitations on asynchronous preemption mean we only
// preempt in ABIInternal code. However, there's at least one exception to
// this: when we're in an open-coded transition between an ABIInternal
// function and an ABI0 call. We could more carefully arrange unsafe points
// to avoid ever landing in ABI0, but it's easy to just make this code not
// sensitive to the ABI we're preempting. The CALL to asyncPreempt2 will
// ensure we're in ABIInternal register state.
p("// Save extended register state to p.xRegs.scratch")
p("// Don't make assumptions about ABI register state. See mkpreempt.go")
p("get_tls(CX)")
p("MOVQ g(CX), R14")
p("MOVQ g_m(R14), %s", xReg)
p("MOVQ m_p(%s), %s", xReg, xReg)
p("LEAQ (p_xRegs+xRegPerP_scratch)(%s), %s", xReg, xReg)
lXRegs.save(g)
p("CALL ·asyncPreempt2(SB)")
lSSE.restore(g)
p("// Restore non-GPs from *p.xRegs.cache")
p("MOVQ g_m(R14), %s", xReg)
p("MOVQ m_p(%s), %s", xReg, xReg)
p("MOVQ (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg)
lXRegs.restore(g)
p("// Restore GPs")
l.restore(g)
p("ADJSP $%d", -lSSE.stack)
p("ADJSP $%d", -l.stack)
p("POPFQ")
p("POPQ BP")
p("RET")

@@ -292,21 +292,52 @@ func canPreemptM(mp *m) bool {
// asyncPreempt saves all user registers and calls asyncPreempt2.
//
// When stack scanning encounters an asyncPreempt frame, it scans that
// It saves GP registers (anything that might contain a pointer) to the G stack.
// Hence, when stack scanning encounters an asyncPreempt frame, it scans that
// frame and its parent frame conservatively.
//
// On some platforms, it saves large additional scalar-only register state such
// as vector registers to an "extended register state" on the P.
//
// asyncPreempt is implemented in assembly.
func asyncPreempt()
// asyncPreempt2 is the Go continuation of asyncPreempt.
//
// It must be deeply nosplit because there's untyped data on the stack from
// asyncPreempt.
//
// It must not have any write barriers because we need to limit the amount of
// stack it uses.
//
//go:nosplit
//go:nowritebarrierrec
func asyncPreempt2() {
gp := getg()
// We can't grow the stack with untyped data from asyncPreempt, so switch to
// the system stack right away.
mcall(func(gp *g) {
gp.asyncSafePoint = true
// Move the extended register state from the P to the G. We do this now that
// we're on the system stack to avoid stack splits.
xRegSave(gp)
if gp.preemptStop {
mcall(preemptPark)
preemptPark(gp)
} else {
mcall(gopreempt_m)
gopreempt_m(gp)
}
// The above functions never return.
})
// Do not grow the stack below here!
gp := getg()
// Put the extended register state back on the M so resumption can find it.
// We can't do this in asyncPreemptM because the park calls never return.
xRegRestore(gp)
gp.asyncSafePoint = false
}
@@ -319,19 +350,13 @@ func init() {
total := funcMaxSPDelta(f)
f = findfunc(abi.FuncPCABIInternal(asyncPreempt2))
total += funcMaxSPDelta(f)
f = findfunc(abi.FuncPCABIInternal(xRegRestore))
total += funcMaxSPDelta(f)
// Add some overhead for return PCs, etc.
asyncPreemptStack = uintptr(total) + 8*goarch.PtrSize
if asyncPreemptStack > stackNosplit {
// We need more than the nosplit limit. This isn't
// unsafe, but it may limit asynchronous preemption.
//
// This may be a problem if we start using more
// registers. In that case, we should store registers
// in a context object. If we pre-allocate one per P,
// asyncPreempt can spill just a few registers to the
// stack, then grab its context object and spill into
// it. When it enters the runtime, it would allocate a
// new context for the P.
// We need more than the nosplit limit. This isn't unsafe, but it may
// limit asynchronous preemption. Consider moving state into xRegState.
print("runtime: asyncPreemptStack=", asyncPreemptStack, "\n")
throw("async stack too large")
}

@@ -0,0 +1,22 @@
// Code generated by mkpreempt.go; DO NOT EDIT.
package runtime
type xRegs struct {
X0 [16]byte
X1 [16]byte
X2 [16]byte
X3 [16]byte
X4 [16]byte
X5 [16]byte
X6 [16]byte
X7 [16]byte
X8 [16]byte
X9 [16]byte
X10 [16]byte
X11 [16]byte
X12 [16]byte
X13 [16]byte
X14 [16]byte
X15 [16]byte
}

@@ -1,6 +1,7 @@
// Code generated by mkpreempt.go; DO NOT EDIT.
#include "go_asm.h"
#include "go_tls.h"
#include "asm_amd64.h"
#include "textflag.h"
@@ -10,9 +11,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
// Save flags before clobbering them
PUSHFQ
// obj doesn't understand ADD/SUB on SP, but does understand ADJSP
ADJSP $368
ADJSP $112
// But vet doesn't know ADJSP, so suppress vet stack checking
NOP SP
// Save GPs
MOVQ AX, 0(SP)
MOVQ CX, 8(SP)
MOVQ DX, 16(SP)
@@ -27,39 +29,51 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVQ R13, 88(SP)
MOVQ R14, 96(SP)
MOVQ R15, 104(SP)
MOVUPS X0, 112(SP)
MOVUPS X1, 128(SP)
MOVUPS X2, 144(SP)
MOVUPS X3, 160(SP)
MOVUPS X4, 176(SP)
MOVUPS X5, 192(SP)
MOVUPS X6, 208(SP)
MOVUPS X7, 224(SP)
MOVUPS X8, 240(SP)
MOVUPS X9, 256(SP)
MOVUPS X10, 272(SP)
MOVUPS X11, 288(SP)
MOVUPS X12, 304(SP)
MOVUPS X13, 320(SP)
MOVUPS X14, 336(SP)
MOVUPS X15, 352(SP)
// Save extended register state to p.xRegs.scratch
// Don't make assumptions about ABI register state. See mkpreempt.go
get_tls(CX)
MOVQ g(CX), R14
MOVQ g_m(R14), AX
MOVQ m_p(AX), AX
LEAQ (p_xRegs+xRegPerP_scratch)(AX), AX
MOVUPS X0, 0(AX)
MOVUPS X1, 16(AX)
MOVUPS X2, 32(AX)
MOVUPS X3, 48(AX)
MOVUPS X4, 64(AX)
MOVUPS X5, 80(AX)
MOVUPS X6, 96(AX)
MOVUPS X7, 112(AX)
MOVUPS X8, 128(AX)
MOVUPS X9, 144(AX)
MOVUPS X10, 160(AX)
MOVUPS X11, 176(AX)
MOVUPS X12, 192(AX)
MOVUPS X13, 208(AX)
MOVUPS X14, 224(AX)
MOVUPS X15, 240(AX)
CALL ·asyncPreempt2(SB)
MOVUPS 352(SP), X15
MOVUPS 336(SP), X14
MOVUPS 320(SP), X13
MOVUPS 304(SP), X12
MOVUPS 288(SP), X11
MOVUPS 272(SP), X10
MOVUPS 256(SP), X9
MOVUPS 240(SP), X8
MOVUPS 224(SP), X7
MOVUPS 208(SP), X6
MOVUPS 192(SP), X5
MOVUPS 176(SP), X4
MOVUPS 160(SP), X3
MOVUPS 144(SP), X2
MOVUPS 128(SP), X1
MOVUPS 112(SP), X0
// Restore non-GPs from *p.xRegs.cache
MOVQ g_m(R14), AX
MOVQ m_p(AX), AX
MOVQ (p_xRegs+xRegPerP_cache)(AX), AX
MOVUPS 240(AX), X15
MOVUPS 224(AX), X14
MOVUPS 208(AX), X13
MOVUPS 192(AX), X12
MOVUPS 176(AX), X11
MOVUPS 160(AX), X10
MOVUPS 144(AX), X9
MOVUPS 128(AX), X8
MOVUPS 112(AX), X7
MOVUPS 96(AX), X6
MOVUPS 80(AX), X5
MOVUPS 64(AX), X4
MOVUPS 48(AX), X3
MOVUPS 32(AX), X2
MOVUPS 16(AX), X1
MOVUPS 0(AX), X0
// Restore GPs
MOVQ 104(SP), R15
MOVQ 96(SP), R14
MOVQ 88(SP), R13
@@ -74,7 +88,7 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVQ 16(SP), DX
MOVQ 8(SP), CX
MOVQ 0(SP), AX
ADJSP $-368
ADJSP $-112
POPFQ
POPQ BP
RET

@@ -0,0 +1,27 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64
// This provides common support for architectures that DO NOT use extended
// register state in asynchronous preemption.
package runtime
type xRegPerG struct{}
type xRegPerP struct{}
// xRegState is defined only so the build fails if we try to define a real
// xRegState on a noxreg architecture.
type xRegState struct{}
func xRegInitAlloc() {}
func xRegSave(gp *g) {}
//go:nosplit
func xRegRestore(gp *g) {}
func (*xRegPerP) free() {}

src/runtime/preempt_xreg.go (new file, 137 lines)

@@ -0,0 +1,137 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64
// This provides common support for architectures that use extended register
// state in asynchronous preemption.
//
// While asynchronous preemption stores general-purpose (GP) registers on the
// preempted goroutine's own stack, extended register state can be used to save
// non-GP state off the stack. In particular, this is meant for large vector
// register files. Currently, we assume this contains only scalar data, though
// we could change this constraint by conservatively scanning this memory.
//
// For an architecture to support extended register state, it must provide a Go
// definition of an xRegState type for storing the state, and its asyncPreempt
// implementation must write this register state to p.xRegs.scratch.
package runtime
import (
"internal/runtime/sys"
"unsafe"
)
// xRegState is long-lived extended register state. It is allocated off-heap and
// manually managed.
type xRegState struct {
_ sys.NotInHeap // Allocated from xRegAlloc
regs xRegs
}
// xRegPerG stores extended register state while a goroutine is asynchronously
// preempted. This is nil otherwise, so we can reuse a (likely small) pool of
// xRegState objects.
type xRegPerG struct {
state *xRegState
}
type xRegPerP struct {
// scratch temporary per-P space where [asyncPreempt] saves the register
// state before entering Go. It's quickly copied to per-G state.
scratch xRegs
// cache is a 1-element allocation cache of extended register state used by
// asynchronous preemption. On entry to preemption, this is used as a simple
// allocation cache. On exit from preemption, the G's xRegState is always
// stored here where it can be restored, and later either freed or reused
// for another preemption. On exit, this serves the dual purpose of
// delay-freeing the allocated xRegState until after we've definitely
// restored it.
cache *xRegState
}
// xRegAlloc allocates xRegState objects.
var xRegAlloc struct {
lock mutex
alloc fixalloc
}
func xRegInitAlloc() {
lockInit(&xRegAlloc.lock, lockRankXRegAlloc)
xRegAlloc.alloc.init(unsafe.Sizeof(xRegState{}), nil, nil, &memstats.other_sys)
}
// xRegSave saves the extended register state on this P to gp.
//
// This must run on the system stack because it assumes the P won't change.
//
//go:systemstack
func xRegSave(gp *g) {
if gp.xRegs.state != nil {
// Double preempt?
throw("gp.xRegState.p != nil on async preempt")
}
// Get the place to save the register state.
var dest *xRegState
pp := gp.m.p.ptr()
if pp.xRegs.cache != nil {
// Use the cached allocation.
dest = pp.xRegs.cache
pp.xRegs.cache = nil
} else {
// Allocate a new save block.
lock(&xRegAlloc.lock)
dest = (*xRegState)(xRegAlloc.alloc.alloc())
unlock(&xRegAlloc.lock)
}
// Copy state saved in the scratchpad to dest.
//
// If we ever need to save less state (e.g., avoid saving vector registers
// that aren't in use), we could have multiple allocation pools for
// different size states and copy only the registers we need.
dest.regs = pp.xRegs.scratch
// Save on the G.
gp.xRegs.state = dest
}
// xRegRestore prepares the extended register state on gp to be restored.
//
// It moves the state to gp.m.p.xRegs.cache where [asyncPreempt] expects to find
// it. This means nothing else may use the cache between this call and the
// return to asyncPreempt. This is not quite symmetric with [xRegSave], which
// uses gp.m.p.xRegs.scratch. By using cache instead, we save a block copy.
//
// This is called with asyncPreempt on the stack and thus must not grow the
// stack.
//
//go:nosplit
func xRegRestore(gp *g) {
if gp.xRegs.state == nil {
throw("gp.xRegState.p == nil on return from async preempt")
}
// If the P has a block cached on it, free that so we can replace it.
pp := gp.m.p.ptr()
if pp.xRegs.cache != nil {
// Don't grow the G stack.
systemstack(func() {
pp.xRegs.free()
})
}
pp.xRegs.cache = gp.xRegs.state
gp.xRegs.state = nil
}
func (xRegs *xRegPerP) free() {
if xRegs.cache != nil {
lock(&xRegAlloc.lock)
xRegAlloc.alloc.free(unsafe.Pointer(xRegs.cache))
xRegs.cache = nil
unlock(&xRegAlloc.lock)
}
}

@@ -5838,6 +5838,7 @@ func (pp *p) destroy() {
pp.gcAssistTime = 0
gcCleanups.queued += pp.cleanupsQueued
pp.cleanupsQueued = 0
pp.xRegs.free()
pp.status = _Pdead
}

@@ -492,6 +492,10 @@ type g struct {
coroarg *coro // argument during coroutine transfers
bubble *synctestBubble
// xRegs stores the extended register state if this G has been
// asynchronously preempted.
xRegs xRegPerG
// Per-G tracer state.
trace gTraceState
@@ -760,6 +764,11 @@ type p struct {
// gcStopTime is the nanotime timestamp that this P last entered _Pgcstop.
gcStopTime int64
// xRegs is the per-P extended register state used by asynchronous
// preemption. This is an empty struct on platforms that don't use extended
// register state.
xRegs xRegPerP
// Padding is no longer needed. False sharing is now not a worry because p is large enough
// that its size class is an integer multiple of the cache line size (for any of our architectures).
}

@@ -15,15 +15,20 @@ import (
func TestSizeof(t *testing.T) {
const _64bit = unsafe.Sizeof(uintptr(0)) == 8
const xreg = unsafe.Sizeof(runtime.XRegPerG{}) // Varies per architecture
var tests = []struct {
val any // type as a value
_32bit uintptr // size on 32bit platforms
_64bit uintptr // size on 64bit platforms
}{
{runtime.G{}, 280, 440}, // g, but exported for testing
{runtime.G{}, 280 + xreg, 440 + xreg}, // g, but exported for testing
{runtime.Sudog{}, 56, 88}, // sudog, but exported for testing
}
if xreg > runtime.PtrSize {
t.Errorf("unsafe.Sizeof(xRegPerG) = %d, want <= %d", xreg, runtime.PtrSize)
}
for _, tt := range tests {
want := tt._32bit
if _64bit {