runtime: save scalar registers off stack in amd64 async preemption

Asynchronous preemption must save all registers that could be in use
by Go code. Currently, it saves all of these to the goroutine stack.
As a result, the stack frame requirements of asynchronous preemption
can be rather high. On amd64, this requires 368 bytes of stack space,
256 bytes of which are the 16 XMM registers. On several RISC
architectures, the frame is around 0.5 KiB.

As we add support for SIMD instructions, this is going to become a
problem. The AVX-512 register state is 2.5 KiB. That well exceeds the
nosplit limit, and even if it didn't, it could constrain when we can
asynchronously preempt goroutines on small stacks.

This CL addresses that by moving purely scalar state held in non-GP
registers off the stack and into an allocated "extended register
state" object. To reduce space overhead, we allocate these objects
only as needed: while in the theoretical limit every G could need
this register state, in practice very few do at any given time.

However, we can't allocate while we're in the middle of saving the
register state during an asynchronous preemption, so we reserve
scratch space on every P to hold the register state temporarily;
Go code then copies it out to an allocated state object.
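
Below is a minimal, self-contained sketch of that save/restore flow. The
type and field names mirror the ones this CL adds (xRegs, xRegState,
p.xRegs.scratch, p.xRegs.cache), but the allocator here is an ordinary
slice-backed pool rather than the runtime's fixalloc, and the scratch
write that the asyncPreempt assembly performs is simulated:

	package main

	import "fmt"

	// Simplified stand-ins for the types this CL adds to the runtime.
	// The real xRegs is generated per-architecture by mkpreempt.go, and
	// xRegState is allocated from a fixalloc guarded by xRegAlloc.lock.
	type xRegs [256]byte // amd64: X0..X15, 16 bytes each

	type xRegState struct{ regs xRegs }

	type gExt struct{ state *xRegState } // per-G: set while preempted
	type pExt struct {
		scratch xRegs      // written by the asyncPreempt assembly
		cache   *xRegState // 1-element allocation cache / restore slot
	}

	var pool []*xRegState // stand-in for the runtime's fixalloc

	func alloc() *xRegState {
		if n := len(pool); n > 0 {
			x := pool[n-1]
			pool = pool[:n-1]
			return x
		}
		return new(xRegState)
	}

	// save: the preemption assembly has already dumped the vector
	// registers into p.scratch; move them into a long-lived block.
	func save(g *gExt, p *pExt) {
		dest := p.cache
		if dest != nil {
			p.cache = nil // reuse the cached block
		} else {
			dest = alloc()
		}
		dest.regs = p.scratch // copy out of the temporary scratch area
		g.state = dest
	}

	// restore: park the block on p.cache, where the assembly restore
	// path reads it; it is freed or reused on the next preemption.
	func restore(g *gExt, p *pExt) {
		if p.cache != nil {
			pool = append(pool, p.cache) // free the stale cached block
		}
		p.cache = g.state
		g.state = nil
	}

	func main() {
		var g gExt
		var p pExt
		p.scratch[0] = 42 // pretend asyncPreempt saved something
		save(&g, &p)
		restore(&g, &p)
		fmt.Println(p.cache.regs[0]) // 42: ready for the assembly to reload
	}

Reusing p.xRegs.cache this way avoids an allocation on the next
preemption and delays freeing the block until after the assembly has
reloaded the registers from it.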

This commit implements the new scheme only for amd64, since that's
where we're about to add much more vector state, but it lays the
groundwork for doing the same on any architecture that could benefit.

This is a cherry-pick of CL 680898 plus bug fix CL 684836 from the
dev.simd branch.

Change-Id: I123a95e21c11d5c10942d70e27f84d2d99bbf735
Reviewed-on: https://go-review.googlesource.com/c/go/+/669195
Auto-Submit: Austin Clements <austin@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Austin Clements authored 2025-04-29 22:55:40 -04:00; committed by Gopher Robot
parent e73afaae69
commit af0c4fe2ca
13 changed files with 387 additions and 66 deletions

@@ -554,6 +554,8 @@ type G = g
type Sudog = sudog
type XRegPerG = xRegPerG
func Getg() *G {
return getg()
}

@@ -70,6 +70,7 @@ const (
lockRankHchanLeaf
// WB
lockRankWbufSpans
lockRankXRegAlloc
lockRankMheap
lockRankMheapSpecial
lockRankGlobalAlloc
@@ -143,6 +144,7 @@ var lockNames = []string{
lockRankStackLarge: "stackLarge",
lockRankHchanLeaf: "hchanLeaf",
lockRankWbufSpans: "wbufSpans",
lockRankXRegAlloc: "xRegAlloc",
lockRankMheap: "mheap",
lockRankMheapSpecial: "mheapSpecial",
lockRankGlobalAlloc: "globalAlloc",
@@ -228,9 +230,10 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankStackLarge: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan},
lockRankHchanLeaf: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankHchanLeaf},
lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan},
lockRankXRegAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankTimerSend, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched},
lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans},
lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap},
lockRankGlobalAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankMheapSpecial},
lockRankGlobalAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankXRegAlloc, lockRankMheap, lockRankMheapSpecial},
lockRankTrace: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap},
lockRankTraceStackTab: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankTrace},
lockRankPanic: {},

@@ -821,6 +821,8 @@ func (h *mheap) init() {
}
h.pages.init(&h.lock, &memstats.gcMiscSys, false)
xRegInitAlloc()
}
// reclaim sweeps and reclaims at least npage pages into the heap.

@@ -193,6 +193,9 @@ defer,
# Below WB is the write barrier implementation.
< wbufSpans;
# xRegState allocator
sched < xRegAlloc;
# Span allocator
stackLarge,
stackpool,
@@ -205,7 +208,8 @@ stackLarge,
# an mspanSpecial lock, and they're part of the malloc implementation.
# Pinner bits might be freed by the span allocator.
mheap, mspanSpecial < mheapSpecial;
mheap, mheapSpecial < globalAlloc;
# Fixallocs
mheap, mheapSpecial, xRegAlloc < globalAlloc;
# Execution tracer events (with a P)
hchan,

@@ -9,8 +9,10 @@
package main
import (
"bytes"
"flag"
"fmt"
"go/format"
"io"
"log"
"os"
@@ -122,14 +124,19 @@ type gen struct {
goarch string
}
func (g *gen) asmHeader() {
func (g *gen) commonHeader() {
fmt.Fprintf(g.w, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n")
if beLe[g.goarch] {
base := g.goarch[:len(g.goarch)-1]
fmt.Fprintf(g.w, "//go:build %s || %sle\n\n", base, base)
}
}
func (g *gen) asmHeader() {
g.commonHeader()
fmt.Fprintf(g.w, "#include \"go_asm.h\"\n")
if g.goarch == "amd64" {
fmt.Fprintf(g.w, "#include \"go_tls.h\"\n")
fmt.Fprintf(g.w, "#include \"asm_amd64.h\"\n")
}
fmt.Fprintf(g.w, "#include \"textflag.h\"\n\n")
@@ -145,6 +152,43 @@ func (g *gen) label(l string) {
fmt.Fprintf(g.w, "%s\n", l)
}
// writeXRegs writes an architecture xregs file.
func writeXRegs(arch string, l *layout) {
var code bytes.Buffer
g := gen{&code, arch}
g.commonHeader()
fmt.Fprintf(g.w, `
package runtime
type xRegs struct {
`)
pos := 0
for _, reg := range l.regs {
if reg.pos != pos {
log.Fatalf("padding not implemented")
}
typ := fmt.Sprintf("[%d]byte", reg.size)
switch {
case reg.size == 4 && reg.pos%4 == 0:
typ = "uint32"
case reg.size == 8 && reg.pos%8 == 0:
typ = "uint64"
}
fmt.Fprintf(g.w, "\t%s %s\n", reg.reg, typ)
pos += reg.size
}
fmt.Fprintf(g.w, "}\n")
path := fmt.Sprintf("preempt_%s.go", arch)
b, err := format.Source(code.Bytes())
if err != nil {
log.Fatalf("formatting %s: %s", path, err)
}
if err := os.WriteFile(path, b, 0666); err != nil {
log.Fatal(err)
}
}
type layout struct {
stack int
regs []regPos
@@ -152,7 +196,7 @@ type layout struct {
}
type regPos struct {
pos int
pos, size int
saveOp string
restoreOp string
@@ -165,17 +209,17 @@ type regPos struct {
}
func (l *layout) add(op, reg string, size int) {
l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack})
l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack, size: size})
l.stack += size
}
func (l *layout) add2(sop, rop, reg string, size int) {
l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack})
l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack, size: size})
l.stack += size
}
func (l *layout) addSpecial(save, restore string, size int) {
l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack})
l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack, size: size})
l.stack += size
}
@@ -239,6 +283,8 @@ func gen386(g *gen) {
}
func genAMD64(g *gen) {
const xReg = "AX" // *xRegState
p := g.p
// Assign stack offsets.
@@ -251,12 +297,13 @@ func genAMD64(g *gen) {
l.add("MOVQ", reg, 8)
}
}
lSSE := layout{stack: l.stack, sp: "SP"}
lXRegs := layout{sp: xReg} // Non-GP registers
for _, reg := range regNamesAMD64 {
if strings.HasPrefix(reg, "X") {
lSSE.add("MOVUPS", reg, 16)
lXRegs.add("MOVUPS", reg, 16)
}
}
writeXRegs(g.goarch, &lXRegs)
// TODO: MXCSR register?
@@ -265,17 +312,40 @@
p("// Save flags before clobbering them")
p("PUSHFQ")
p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP")
p("ADJSP $%d", lSSE.stack)
p("ADJSP $%d", l.stack)
p("// But vet doesn't know ADJSP, so suppress vet stack checking")
p("NOP SP")
p("// Save GPs")
l.save(g)
lSSE.save(g)
// In general, the limitations on asynchronous preemption mean we only
// preempt in ABIInternal code. However, there's at least one exception to
// this: when we're in an open-coded transition between an ABIInternal
// function and an ABI0 call. We could more carefully arrange unsafe points
// to avoid ever landing in ABI0, but it's easy to just make this code not
// sensitive to the ABI we're preempting. The CALL to asyncPreempt2 will
// ensure we're in ABIInternal register state.
p("// Save extended register state to p.xRegs.scratch")
p("// Don't make assumptions about ABI register state. See mkpreempt.go")
p("get_tls(CX)")
p("MOVQ g(CX), R14")
p("MOVQ g_m(R14), %s", xReg)
p("MOVQ m_p(%s), %s", xReg, xReg)
p("LEAQ (p_xRegs+xRegPerP_scratch)(%s), %s", xReg, xReg)
lXRegs.save(g)
p("CALL ·asyncPreempt2(SB)")
lSSE.restore(g)
p("// Restore non-GPs from *p.xRegs.cache")
p("MOVQ g_m(R14), %s", xReg)
p("MOVQ m_p(%s), %s", xReg, xReg)
p("MOVQ (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg)
lXRegs.restore(g)
p("// Restore GPs")
l.restore(g)
p("ADJSP $%d", -lSSE.stack)
p("ADJSP $%d", -l.stack)
p("POPFQ")
p("POPQ BP")
p("RET")

@@ -292,21 +292,52 @@ func canPreemptM(mp *m) bool {
// asyncPreempt saves all user registers and calls asyncPreempt2.
//
// When stack scanning encounters an asyncPreempt frame, it scans that
// It saves GP registers (anything that might contain a pointer) to the G stack.
// Hence, when stack scanning encounters an asyncPreempt frame, it scans that
// frame and its parent frame conservatively.
//
// On some platforms, it saves large additional scalar-only register state such
// as vector registers to an "extended register state" on the P.
//
// asyncPreempt is implemented in assembly.
func asyncPreempt()
// asyncPreempt2 is the Go continuation of asyncPreempt.
//
// It must be deeply nosplit because there's untyped data on the stack from
// asyncPreempt.
//
// It must not have any write barriers because we need to limit the amount of
// stack it uses.
//
//go:nosplit
//go:nowritebarrierrec
func asyncPreempt2() {
gp := getg()
// We can't grow the stack with untyped data from asyncPreempt, so switch to
// the system stack right away.
mcall(func(gp *g) {
gp.asyncSafePoint = true
// Move the extended register state from the P to the G. We do this now that
// we're on the system stack to avoid stack splits.
xRegSave(gp)
if gp.preemptStop {
mcall(preemptPark)
preemptPark(gp)
} else {
mcall(gopreempt_m)
gopreempt_m(gp)
}
// The above functions never return.
})
// Do not grow the stack below here!
gp := getg()
// Put the extended register state back on the M so resumption can find it.
// We can't do this in asyncPreemptM because the park calls never return.
xRegRestore(gp)
gp.asyncSafePoint = false
}
@@ -319,19 +350,13 @@ func init() {
total := funcMaxSPDelta(f)
f = findfunc(abi.FuncPCABIInternal(asyncPreempt2))
total += funcMaxSPDelta(f)
f = findfunc(abi.FuncPCABIInternal(xRegRestore))
total += funcMaxSPDelta(f)
// Add some overhead for return PCs, etc.
asyncPreemptStack = uintptr(total) + 8*goarch.PtrSize
if asyncPreemptStack > stackNosplit {
// We need more than the nosplit limit. This isn't
// unsafe, but it may limit asynchronous preemption.
//
// This may be a problem if we start using more
// registers. In that case, we should store registers
// in a context object. If we pre-allocate one per P,
// asyncPreempt can spill just a few registers to the
// stack, then grab its context object and spill into
// it. When it enters the runtime, it would allocate a
// new context for the P.
// We need more than the nosplit limit. This isn't unsafe, but it may
// limit asynchronous preemption. Consider moving state into xRegState.
print("runtime: asyncPreemptStack=", asyncPreemptStack, "\n")
throw("async stack too large")
}

@@ -0,0 +1,22 @@
// Code generated by mkpreempt.go; DO NOT EDIT.
package runtime
type xRegs struct {
X0 [16]byte
X1 [16]byte
X2 [16]byte
X3 [16]byte
X4 [16]byte
X5 [16]byte
X6 [16]byte
X7 [16]byte
X8 [16]byte
X9 [16]byte
X10 [16]byte
X11 [16]byte
X12 [16]byte
X13 [16]byte
X14 [16]byte
X15 [16]byte
}

@@ -1,6 +1,7 @@
// Code generated by mkpreempt.go; DO NOT EDIT.
#include "go_asm.h"
#include "go_tls.h"
#include "asm_amd64.h"
#include "textflag.h"
@@ -10,9 +11,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
// Save flags before clobbering them
PUSHFQ
// obj doesn't understand ADD/SUB on SP, but does understand ADJSP
ADJSP $368
ADJSP $112
// But vet doesn't know ADJSP, so suppress vet stack checking
NOP SP
// Save GPs
MOVQ AX, 0(SP)
MOVQ CX, 8(SP)
MOVQ DX, 16(SP)
@@ -27,39 +29,51 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVQ R13, 88(SP)
MOVQ R14, 96(SP)
MOVQ R15, 104(SP)
MOVUPS X0, 112(SP)
MOVUPS X1, 128(SP)
MOVUPS X2, 144(SP)
MOVUPS X3, 160(SP)
MOVUPS X4, 176(SP)
MOVUPS X5, 192(SP)
MOVUPS X6, 208(SP)
MOVUPS X7, 224(SP)
MOVUPS X8, 240(SP)
MOVUPS X9, 256(SP)
MOVUPS X10, 272(SP)
MOVUPS X11, 288(SP)
MOVUPS X12, 304(SP)
MOVUPS X13, 320(SP)
MOVUPS X14, 336(SP)
MOVUPS X15, 352(SP)
// Save extended register state to p.xRegs.scratch
// Don't make assumptions about ABI register state. See mkpreempt.go
get_tls(CX)
MOVQ g(CX), R14
MOVQ g_m(R14), AX
MOVQ m_p(AX), AX
LEAQ (p_xRegs+xRegPerP_scratch)(AX), AX
MOVUPS X0, 0(AX)
MOVUPS X1, 16(AX)
MOVUPS X2, 32(AX)
MOVUPS X3, 48(AX)
MOVUPS X4, 64(AX)
MOVUPS X5, 80(AX)
MOVUPS X6, 96(AX)
MOVUPS X7, 112(AX)
MOVUPS X8, 128(AX)
MOVUPS X9, 144(AX)
MOVUPS X10, 160(AX)
MOVUPS X11, 176(AX)
MOVUPS X12, 192(AX)
MOVUPS X13, 208(AX)
MOVUPS X14, 224(AX)
MOVUPS X15, 240(AX)
CALL ·asyncPreempt2(SB)
MOVUPS 352(SP), X15
MOVUPS 336(SP), X14
MOVUPS 320(SP), X13
MOVUPS 304(SP), X12
MOVUPS 288(SP), X11
MOVUPS 272(SP), X10
MOVUPS 256(SP), X9
MOVUPS 240(SP), X8
MOVUPS 224(SP), X7
MOVUPS 208(SP), X6
MOVUPS 192(SP), X5
MOVUPS 176(SP), X4
MOVUPS 160(SP), X3
MOVUPS 144(SP), X2
MOVUPS 128(SP), X1
MOVUPS 112(SP), X0
// Restore non-GPs from *p.xRegs.cache
MOVQ g_m(R14), AX
MOVQ m_p(AX), AX
MOVQ (p_xRegs+xRegPerP_cache)(AX), AX
MOVUPS 240(AX), X15
MOVUPS 224(AX), X14
MOVUPS 208(AX), X13
MOVUPS 192(AX), X12
MOVUPS 176(AX), X11
MOVUPS 160(AX), X10
MOVUPS 144(AX), X9
MOVUPS 128(AX), X8
MOVUPS 112(AX), X7
MOVUPS 96(AX), X6
MOVUPS 80(AX), X5
MOVUPS 64(AX), X4
MOVUPS 48(AX), X3
MOVUPS 32(AX), X2
MOVUPS 16(AX), X1
MOVUPS 0(AX), X0
// Restore GPs
MOVQ 104(SP), R15
MOVQ 96(SP), R14
MOVQ 88(SP), R13
@@ -74,7 +88,7 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVQ 16(SP), DX
MOVQ 8(SP), CX
MOVQ 0(SP), AX
ADJSP $-368
ADJSP $-112
POPFQ
POPQ BP
RET

@@ -0,0 +1,27 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64
// This provides common support for architectures that DO NOT use extended
// register state in asynchronous preemption.
package runtime
type xRegPerG struct{}
type xRegPerP struct{}
// xRegState is defined only so the build fails if we try to define a real
// xRegState on a noxreg architecture.
type xRegState struct{}
func xRegInitAlloc() {}
func xRegSave(gp *g) {}
//go:nosplit
func xRegRestore(gp *g) {}
func (*xRegPerP) free() {}

src/runtime/preempt_xreg.go (new file, 137 lines)

@@ -0,0 +1,137 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64
// This provides common support for architectures that use extended register
// state in asynchronous preemption.
//
// While asynchronous preemption stores general-purpose (GP) registers on the
// preempted goroutine's own stack, extended register state can be used to save
// non-GP state off the stack. In particular, this is meant for large vector
// register files. Currently, we assume this contains only scalar data, though
// we could change this constraint by conservatively scanning this memory.
//
// For an architecture to support extended register state, it must provide a Go
// definition of an xRegState type for storing the state, and its asyncPreempt
// implementation must write this register state to p.xRegs.scratch.
package runtime
import (
"internal/runtime/sys"
"unsafe"
)
// xRegState is long-lived extended register state. It is allocated off-heap and
// manually managed.
type xRegState struct {
_ sys.NotInHeap // Allocated from xRegAlloc
regs xRegs
}
// xRegPerG stores extended register state while a goroutine is asynchronously
// preempted. This is nil otherwise, so we can reuse a (likely small) pool of
// xRegState objects.
type xRegPerG struct {
state *xRegState
}
type xRegPerP struct {
// scratch temporary per-P space where [asyncPreempt] saves the register
// state before entering Go. It's quickly copied to per-G state.
scratch xRegs
// cache is a 1-element allocation cache of extended register state used by
// asynchronous preemption. On entry to preemption, this is used as a simple
// allocation cache. On exit from preemption, the G's xRegState is always
// stored here where it can be restored, and later either freed or reused
// for another preemption. On exit, this serves the dual purpose of
// delay-freeing the allocated xRegState until after we've definitely
// restored it.
cache *xRegState
}
// xRegAlloc allocates xRegState objects.
var xRegAlloc struct {
lock mutex
alloc fixalloc
}
func xRegInitAlloc() {
lockInit(&xRegAlloc.lock, lockRankXRegAlloc)
xRegAlloc.alloc.init(unsafe.Sizeof(xRegState{}), nil, nil, &memstats.other_sys)
}
// xRegSave saves the extended register state on this P to gp.
//
// This must run on the system stack because it assumes the P won't change.
//
//go:systemstack
func xRegSave(gp *g) {
if gp.xRegs.state != nil {
// Double preempt?
throw("gp.xRegState.p != nil on async preempt")
}
// Get the place to save the register state.
var dest *xRegState
pp := gp.m.p.ptr()
if pp.xRegs.cache != nil {
// Use the cached allocation.
dest = pp.xRegs.cache
pp.xRegs.cache = nil
} else {
// Allocate a new save block.
lock(&xRegAlloc.lock)
dest = (*xRegState)(xRegAlloc.alloc.alloc())
unlock(&xRegAlloc.lock)
}
// Copy state saved in the scratchpad to dest.
//
// If we ever need to save less state (e.g., avoid saving vector registers
// that aren't in use), we could have multiple allocation pools for
// different size states and copy only the registers we need.
dest.regs = pp.xRegs.scratch
// Save on the G.
gp.xRegs.state = dest
}
// xRegRestore prepares the extended register state on gp to be restored.
//
// It moves the state to gp.m.p.xRegs.cache where [asyncPreempt] expects to find
// it. This means nothing else may use the cache between this call and the
// return to asyncPreempt. This is not quite symmetric with [xRegSave], which
// uses gp.m.p.xRegs.scratch. By using cache instead, we save a block copy.
//
// This is called with asyncPreempt on the stack and thus must not grow the
// stack.
//
//go:nosplit
func xRegRestore(gp *g) {
if gp.xRegs.state == nil {
throw("gp.xRegState.p == nil on return from async preempt")
}
// If the P has a block cached on it, free that so we can replace it.
pp := gp.m.p.ptr()
if pp.xRegs.cache != nil {
// Don't grow the G stack.
systemstack(func() {
pp.xRegs.free()
})
}
pp.xRegs.cache = gp.xRegs.state
gp.xRegs.state = nil
}
func (xRegs *xRegPerP) free() {
if xRegs.cache != nil {
lock(&xRegAlloc.lock)
xRegAlloc.alloc.free(unsafe.Pointer(xRegs.cache))
xRegs.cache = nil
unlock(&xRegAlloc.lock)
}
}

@@ -5838,6 +5838,7 @@ func (pp *p) destroy() {
pp.gcAssistTime = 0
gcCleanups.queued += pp.cleanupsQueued
pp.cleanupsQueued = 0
pp.xRegs.free()
pp.status = _Pdead
}

@@ -492,6 +492,10 @@ type g struct {
coroarg *coro // argument during coroutine transfers
bubble *synctestBubble
// xRegs stores the extended register state if this G has been
// asynchronously preempted.
xRegs xRegPerG
// Per-G tracer state.
trace gTraceState
@@ -760,6 +764,11 @@ type p struct {
// gcStopTime is the nanotime timestamp that this P last entered _Pgcstop.
gcStopTime int64
// xRegs is the per-P extended register state used by asynchronous
// preemption. This is an empty struct on platforms that don't use extended
// register state.
xRegs xRegPerP
// Padding is no longer needed. False sharing is now not a worry because p is large enough
// that its size class is an integer multiple of the cache line size (for any of our architectures).
}

@@ -15,15 +15,20 @@ import (
func TestSizeof(t *testing.T) {
const _64bit = unsafe.Sizeof(uintptr(0)) == 8
const xreg = unsafe.Sizeof(runtime.XRegPerG{}) // Varies per architecture
var tests = []struct {
val any // type as a value
_32bit uintptr // size on 32bit platforms
_64bit uintptr // size on 64bit platforms
}{
{runtime.G{}, 280, 440}, // g, but exported for testing
{runtime.G{}, 280 + xreg, 440 + xreg}, // g, but exported for testing
{runtime.Sudog{}, 56, 88}, // sudog, but exported for testing
}
if xreg > runtime.PtrSize {
t.Errorf("unsafe.Sizeof(xRegPerG) = %d, want <= %d", xreg, runtime.PtrSize)
}
for _, tt := range tests {
want := tt._32bit
if _64bit {