diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index fa30efccb1e..1f55717f0a1 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -554,6 +554,8 @@ type G = g
 
 type Sudog = sudog
 
+type XRegPerG = xRegPerG
+
 func Getg() *G {
 	return getg()
 }
diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go
index 44015ce862d..9821e499989 100644
--- a/src/runtime/lockrank.go
+++ b/src/runtime/lockrank.go
@@ -70,6 +70,7 @@ const (
 	lockRankHchanLeaf
 	// WB
 	lockRankWbufSpans
+	lockRankXRegAlloc
 	lockRankMheap
 	lockRankMheapSpecial
 	lockRankGlobalAlloc
@@ -143,6 +144,7 @@ var lockNames = []string{
 	lockRankStackLarge:   "stackLarge",
 	lockRankHchanLeaf:    "hchanLeaf",
 	lockRankWbufSpans:    "wbufSpans",
+	lockRankXRegAlloc:    "xRegAlloc",
 	lockRankMheap:        "mheap",
 	lockRankMheapSpecial: "mheapSpecial",
 	lockRankGlobalAlloc:  "globalAlloc",
@@ -228,9 +230,10 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
 	lockRankStackLarge: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan},
 	lockRankHchanLeaf: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankHchanLeaf},
 	lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan},
+	lockRankXRegAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankTimerSend, lockRankCpuprof, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched},
 	lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans},
 	lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap},
-	lockRankGlobalAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankMheapSpecial},
+	lockRankGlobalAlloc: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankXRegAlloc, lockRankMheap, lockRankMheapSpecial},
 	lockRankTrace: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap},
 	lockRankTraceStackTab: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankComputeMaxProcs, lockRankUpdateMaxProcsG, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankStrongFromWeakQueue, lockRankCleanupQueue, lockRankSweep, lockRankTestR, lockRankVgetrandom, lockRankTimerSend, lockRankExecW, lockRankCpuprof, lockRankPollCache, lockRankPollDesc, lockRankWakeableSleep, lockRankHchan, lockRankAllocmR, lockRankExecR, lockRankSched, lockRankAllg, lockRankAllp, lockRankNotifyList, lockRankSudog, lockRankTimers, lockRankTimer, lockRankNetpollInit, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankSynctest, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankSpanSetSpine, lockRankMspanSpecial, lockRankGcBitsArenas, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankTrace},
 	lockRankPanic: {},
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index cb0d3400489..17762065738 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -821,6 +821,8 @@ func (h *mheap) init() {
 	}
 
 	h.pages.init(&h.lock, &memstats.gcMiscSys, false)
+
+	xRegInitAlloc()
 }
 
 // reclaim sweeps and reclaims at least npage pages into the heap.
diff --git a/src/runtime/mklockrank.go b/src/runtime/mklockrank.go
index 46a063fdce5..9c503369a35 100644
--- a/src/runtime/mklockrank.go
+++ b/src/runtime/mklockrank.go
@@ -193,6 +193,9 @@ defer,
 # Below WB is the write barrier implementation.
 < wbufSpans;
 
+# xRegState allocator
+sched < xRegAlloc;
+
 # Span allocator
 stackLarge,
   stackpool,
@@ -205,7 +208,8 @@ stackLarge,
 # an mspanSpecial lock, and they're part of the malloc implementation.
 # Pinner bits might be freed by the span allocator.
 mheap, mspanSpecial < mheapSpecial;
-mheap, mheapSpecial < globalAlloc;
+# Fixallocs
+mheap, mheapSpecial, xRegAlloc < globalAlloc;
 
 # Execution tracer events (with a P)
 hchan,
diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go
index ec900a23d25..8fce40ccd8d 100644
--- a/src/runtime/mkpreempt.go
+++ b/src/runtime/mkpreempt.go
@@ -9,8 +9,10 @@ package main
 
 import (
+	"bytes"
 	"flag"
 	"fmt"
+	"go/format"
 	"io"
 	"log"
 	"os"
@@ -122,14 +124,19 @@ type gen struct {
 	goarch string
 }
 
-func (g *gen) asmHeader() {
+func (g *gen) commonHeader() {
 	fmt.Fprintf(g.w, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n")
 	if beLe[g.goarch] {
 		base := g.goarch[:len(g.goarch)-1]
 		fmt.Fprintf(g.w, "//go:build %s || %sle\n\n", base, base)
 	}
+}
+
+func (g *gen) asmHeader() {
+	g.commonHeader()
 	fmt.Fprintf(g.w, "#include \"go_asm.h\"\n")
 	if g.goarch == "amd64" {
+		fmt.Fprintf(g.w, "#include \"go_tls.h\"\n")
 		fmt.Fprintf(g.w, "#include \"asm_amd64.h\"\n")
 	}
 	fmt.Fprintf(g.w, "#include \"textflag.h\"\n\n")
@@ -145,6 +152,43 @@ func (g *gen) label(l string) {
 	fmt.Fprintf(g.w, "%s\n", l)
 }
 
+// writeXRegs writes an architecture xregs file.
+func writeXRegs(arch string, l *layout) {
+	var code bytes.Buffer
+	g := gen{&code, arch}
+	g.commonHeader()
+	fmt.Fprintf(g.w, `
+package runtime
+
+type xRegs struct {
+`)
+	pos := 0
+	for _, reg := range l.regs {
+		if reg.pos != pos {
+			log.Fatalf("padding not implemented")
+		}
+		typ := fmt.Sprintf("[%d]byte", reg.size)
+		switch {
+		case reg.size == 4 && reg.pos%4 == 0:
+			typ = "uint32"
+		case reg.size == 8 && reg.pos%8 == 0:
+			typ = "uint64"
+		}
+		fmt.Fprintf(g.w, "\t%s %s\n", reg.reg, typ)
+		pos += reg.size
+	}
+	fmt.Fprintf(g.w, "}\n")
+
+	path := fmt.Sprintf("preempt_%s.go", arch)
+	b, err := format.Source(code.Bytes())
+	if err != nil {
+		log.Fatalf("formatting %s: %s", path, err)
+	}
+	if err := os.WriteFile(path, b, 0666); err != nil {
+		log.Fatal(err)
+	}
+}
+
 type layout struct {
 	stack int
 	regs  []regPos
@@ -152,7 +196,7 @@ type layout struct {
 }
 
 type regPos struct {
-	pos int
+	pos, size int
 
 	saveOp    string
 	restoreOp string
@@ -165,17 +209,17 @@ type regPos struct {
 }
 
 func (l *layout) add(op, reg string, size int) {
-	l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack})
+	l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack, size: size})
 	l.stack += size
 }
 
 func (l *layout) add2(sop, rop, reg string, size int) {
-	l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack})
+	l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack, size: size})
 	l.stack += size
 }
 
 func (l *layout) addSpecial(save, restore string, size int) {
-	l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack})
+	l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack, size: size})
 	l.stack += size
 }
 
@@ -239,6 +283,8 @@ func gen386(g *gen) {
 }
 
 func genAMD64(g *gen) {
+	const xReg = "AX" // *xRegState
+
 	p := g.p
 
 	// Assign stack offsets.
@@ -251,12 +297,13 @@ func genAMD64(g *gen) {
 			l.add("MOVQ", reg, 8)
 		}
 	}
-	lSSE := layout{stack: l.stack, sp: "SP"}
+	lXRegs := layout{sp: xReg} // Non-GP registers
 	for _, reg := range regNamesAMD64 {
 		if strings.HasPrefix(reg, "X") {
-			lSSE.add("MOVUPS", reg, 16)
+			lXRegs.add("MOVUPS", reg, 16)
 		}
 	}
+	writeXRegs(g.goarch, &lXRegs)
 
 	// TODO: MXCSR register?
 
@@ -265,17 +312,40 @@ func genAMD64(g *gen) {
 
 	p("// Save flags before clobbering them")
 	p("PUSHFQ")
 	p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP")
-	p("ADJSP $%d", lSSE.stack)
+	p("ADJSP $%d", l.stack)
 	p("// But vet doesn't know ADJSP, so suppress vet stack checking")
 	p("NOP SP")
 
+	p("// Save GPs")
 	l.save(g)
-	lSSE.save(g)
+
+	// In general, the limitations on asynchronous preemption mean we only
+	// preempt in ABIInternal code. However, there's at least one exception to
+	// this: when we're in an open-coded transition between an ABIInternal
+	// function and an ABI0 call. We could more carefully arrange unsafe points
+	// to avoid ever landing in ABI0, but it's easy to just make this code not
+	// sensitive to the ABI we're preempting. The CALL to asyncPreempt2 will
+	// ensure we're in ABIInternal register state.
+	p("// Save extended register state to p.xRegs.scratch")
+	p("// Don't make assumptions about ABI register state. See mkpreempt.go")
+	p("get_tls(CX)")
+	p("MOVQ g(CX), R14")
+	p("MOVQ g_m(R14), %s", xReg)
+	p("MOVQ m_p(%s), %s", xReg, xReg)
+	p("LEAQ (p_xRegs+xRegPerP_scratch)(%s), %s", xReg, xReg)
+	lXRegs.save(g)
+
 	p("CALL ·asyncPreempt2(SB)")
-	lSSE.restore(g)
+
+	p("// Restore non-GPs from *p.xRegs.cache")
+	p("MOVQ g_m(R14), %s", xReg)
+	p("MOVQ m_p(%s), %s", xReg, xReg)
+	p("MOVQ (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg)
+	lXRegs.restore(g)
+
+	p("// Restore GPs")
 	l.restore(g)
-	p("ADJSP $%d", -lSSE.stack)
+	p("ADJSP $%d", -l.stack)
 	p("POPFQ")
 	p("POPQ BP")
 	p("RET")
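The frame-size arithmetic explains the ADJSP change in the generated assembly further down: the 14 general-purpose registers other than SP and BP still go to the goroutine stack (14 x 8 = 112 bytes), while the 16 XMM registers (16 x 16 = 256 bytes, which previously made the frame 112 + 256 = 368 bytes) are now written through the TLS -> g -> m -> p chain into p.xRegs.scratch. The following is a minimal Go-level sketch of that addressing chain, using stand-in types rather than the runtime's real g, m, and p definitions; the authoritative version is the generated code in preempt_amd64.s below.

package main

import "fmt"

// Stand-in types only; the real definitions are in runtime2.go and
// preempt_xreg.go later in this patch.
type xRegs struct{ X [16][16]byte }

type xRegPerP struct {
	scratch xRegs  // the prologue spills X0-X15 here
	cache   *xRegs // the epilogue reloads X0-X15 through this pointer
}

type p struct{ xRegs xRegPerP }
type m struct{ p *p }
type g struct{ m *m }

// scratchAddr computes what the emitted sequence computes:
//
//	get_tls(CX); MOVQ g(CX), R14; MOVQ g_m(R14), AX
//	MOVQ m_p(AX), AX; LEAQ (p_xRegs+xRegPerP_scratch)(AX), AX
func scratchAddr(gp *g) *xRegs {
	return &gp.m.p.xRegs.scratch
}

func main() {
	gp := &g{m: &m{p: &p{}}}
	fmt.Printf("spill target %p holds %d bytes\n", scratchAddr(gp), 16*16)
}

The cache field is the return path: after asyncPreempt2 runs, the epilogue reloads the vector registers from p.xRegs.cache, which preempt_xreg.go below fills in.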
diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go
index c41c3558359..22727df74ee 100644
--- a/src/runtime/preempt.go
+++ b/src/runtime/preempt.go
@@ -292,21 +292,52 @@ func canPreemptM(mp *m) bool {
 
 // asyncPreempt saves all user registers and calls asyncPreempt2.
 //
-// When stack scanning encounters an asyncPreempt frame, it scans that
+// It saves GP registers (anything that might contain a pointer) to the G stack.
+// Hence, when stack scanning encounters an asyncPreempt frame, it scans that
 // frame and its parent frame conservatively.
 //
+// On some platforms, it saves large additional scalar-only register state such
+// as vector registers to an "extended register state" on the P.
+//
 // asyncPreempt is implemented in assembly.
 func asyncPreempt()
 
+// asyncPreempt2 is the Go continuation of asyncPreempt.
+//
+// It must be deeply nosplit because there's untyped data on the stack from
+// asyncPreempt.
+//
+// It must not have any write barriers because we need to limit the amount of
+// stack it uses.
+//
 //go:nosplit
+//go:nowritebarrierrec
 func asyncPreempt2() {
+	// We can't grow the stack with untyped data from asyncPreempt, so switch to
+	// the system stack right away.
+	mcall(func(gp *g) {
+		gp.asyncSafePoint = true
+
+		// Move the extended register state from the P to the G. We do this now that
+		// we're on the system stack to avoid stack splits.
+		xRegSave(gp)
+
+		if gp.preemptStop {
+			preemptPark(gp)
+		} else {
+			gopreempt_m(gp)
+		}
+		// The above functions never return.
+	})
+
+	// Do not grow the stack below here!
+
 	gp := getg()
-	gp.asyncSafePoint = true
-	if gp.preemptStop {
-		mcall(preemptPark)
-	} else {
-		mcall(gopreempt_m)
-	}
+
+	// Put the extended register state back on the M so resumption can find it.
+	// We can't do this in asyncPreemptM because the park calls never return.
+	xRegRestore(gp)
+
 	gp.asyncSafePoint = false
 }
 
@@ -319,19 +350,13 @@ func init() {
 		total := funcMaxSPDelta(f)
 		f = findfunc(abi.FuncPCABIInternal(asyncPreempt2))
 		total += funcMaxSPDelta(f)
+		f = findfunc(abi.FuncPCABIInternal(xRegRestore))
+		total += funcMaxSPDelta(f)
 		// Add some overhead for return PCs, etc.
 		asyncPreemptStack = uintptr(total) + 8*goarch.PtrSize
 		if asyncPreemptStack > stackNosplit {
-			// We need more than the nosplit limit. This isn't
-			// unsafe, but it may limit asynchronous preemption.
-			//
-			// This may be a problem if we start using more
-			// registers. In that case, we should store registers
-			// in a context object. If we pre-allocate one per P,
-			// asyncPreempt can spill just a few registers to the
-			// stack, then grab its context object and spill into
-			// it. When it enters the runtime, it would allocate a
-			// new context for the P.
+			// We need more than the nosplit limit. This isn't unsafe, but it may
+			// limit asynchronous preemption. Consider moving state into xRegState.
 			print("runtime: asyncPreemptStack=", asyncPreemptStack, "\n")
 			throw("async stack too large")
 		}
diff --git a/src/runtime/preempt_amd64.go b/src/runtime/preempt_amd64.go
new file mode 100644
index 00000000000..4a3ced79df3
--- /dev/null
+++ b/src/runtime/preempt_amd64.go
@@ -0,0 +1,22 @@
+// Code generated by mkpreempt.go; DO NOT EDIT.
+
+package runtime
+
+type xRegs struct {
+	X0  [16]byte
+	X1  [16]byte
+	X2  [16]byte
+	X3  [16]byte
+	X4  [16]byte
+	X5  [16]byte
+	X6  [16]byte
+	X7  [16]byte
+	X8  [16]byte
+	X9  [16]byte
+	X10 [16]byte
+	X11 [16]byte
+	X12 [16]byte
+	X13 [16]byte
+	X14 [16]byte
+	X15 [16]byte
+}
diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s
index 8e3ed0d7c59..0a33ce7f3e7 100644
--- a/src/runtime/preempt_amd64.s
+++ b/src/runtime/preempt_amd64.s
@@ -1,6 +1,7 @@
 // Code generated by mkpreempt.go; DO NOT EDIT.
 
 #include "go_asm.h"
+#include "go_tls.h"
 #include "asm_amd64.h"
 #include "textflag.h"
 
@@ -10,9 +11,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
 	// Save flags before clobbering them
 	PUSHFQ
 	// obj doesn't understand ADD/SUB on SP, but does understand ADJSP
-	ADJSP $368
+	ADJSP $112
 	// But vet doesn't know ADJSP, so suppress vet stack checking
 	NOP SP
+	// Save GPs
 	MOVQ AX, 0(SP)
 	MOVQ CX, 8(SP)
 	MOVQ DX, 16(SP)
@@ -27,39 +29,51 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
 	MOVQ R13, 88(SP)
 	MOVQ R14, 96(SP)
 	MOVQ R15, 104(SP)
-	MOVUPS X0, 112(SP)
-	MOVUPS X1, 128(SP)
-	MOVUPS X2, 144(SP)
-	MOVUPS X3, 160(SP)
-	MOVUPS X4, 176(SP)
-	MOVUPS X5, 192(SP)
-	MOVUPS X6, 208(SP)
-	MOVUPS X7, 224(SP)
-	MOVUPS X8, 240(SP)
-	MOVUPS X9, 256(SP)
-	MOVUPS X10, 272(SP)
-	MOVUPS X11, 288(SP)
-	MOVUPS X12, 304(SP)
-	MOVUPS X13, 320(SP)
-	MOVUPS X14, 336(SP)
-	MOVUPS X15, 352(SP)
+	// Save extended register state to p.xRegs.scratch
+	// Don't make assumptions about ABI register state. See mkpreempt.go
+	get_tls(CX)
+	MOVQ g(CX), R14
+	MOVQ g_m(R14), AX
+	MOVQ m_p(AX), AX
+	LEAQ (p_xRegs+xRegPerP_scratch)(AX), AX
+	MOVUPS X0, 0(AX)
+	MOVUPS X1, 16(AX)
+	MOVUPS X2, 32(AX)
+	MOVUPS X3, 48(AX)
+	MOVUPS X4, 64(AX)
+	MOVUPS X5, 80(AX)
+	MOVUPS X6, 96(AX)
+	MOVUPS X7, 112(AX)
+	MOVUPS X8, 128(AX)
+	MOVUPS X9, 144(AX)
+	MOVUPS X10, 160(AX)
+	MOVUPS X11, 176(AX)
+	MOVUPS X12, 192(AX)
+	MOVUPS X13, 208(AX)
+	MOVUPS X14, 224(AX)
+	MOVUPS X15, 240(AX)
 	CALL ·asyncPreempt2(SB)
-	MOVUPS 352(SP), X15
-	MOVUPS 336(SP), X14
-	MOVUPS 320(SP), X13
-	MOVUPS 304(SP), X12
-	MOVUPS 288(SP), X11
-	MOVUPS 272(SP), X10
-	MOVUPS 256(SP), X9
-	MOVUPS 240(SP), X8
-	MOVUPS 224(SP), X7
-	MOVUPS 208(SP), X6
-	MOVUPS 192(SP), X5
-	MOVUPS 176(SP), X4
-	MOVUPS 160(SP), X3
-	MOVUPS 144(SP), X2
-	MOVUPS 128(SP), X1
-	MOVUPS 112(SP), X0
+	// Restore non-GPs from *p.xRegs.cache
+	MOVQ g_m(R14), AX
+	MOVQ m_p(AX), AX
+	MOVQ (p_xRegs+xRegPerP_cache)(AX), AX
+	MOVUPS 240(AX), X15
+	MOVUPS 224(AX), X14
+	MOVUPS 208(AX), X13
+	MOVUPS 192(AX), X12
+	MOVUPS 176(AX), X11
+	MOVUPS 160(AX), X10
+	MOVUPS 144(AX), X9
+	MOVUPS 128(AX), X8
+	MOVUPS 112(AX), X7
+	MOVUPS 96(AX), X6
+	MOVUPS 80(AX), X5
+	MOVUPS 64(AX), X4
+	MOVUPS 48(AX), X3
+	MOVUPS 32(AX), X2
+	MOVUPS 16(AX), X1
+	MOVUPS 0(AX), X0
+	// Restore GPs
 	MOVQ 104(SP), R15
 	MOVQ 96(SP), R14
 	MOVQ 88(SP), R13
@@ -74,7 +88,7 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
 	MOVQ 16(SP), DX
 	MOVQ 8(SP), CX
 	MOVQ 0(SP), AX
-	ADJSP $-368
+	ADJSP $-112
 	POPFQ
 	POPQ BP
 	RET
diff --git a/src/runtime/preempt_noxreg.go b/src/runtime/preempt_noxreg.go
new file mode 100644
index 00000000000..dfe46559b5b
--- /dev/null
+++ b/src/runtime/preempt_noxreg.go
@@ -0,0 +1,27 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64
+
+// This provides common support for architectures that DO NOT use extended
+// register state in asynchronous preemption.
+
+package runtime
+
+type xRegPerG struct{}
+
+type xRegPerP struct{}
+
+// xRegState is defined only so the build fails if we try to define a real
+// xRegState on a noxreg architecture.
+type xRegState struct{}
+
+func xRegInitAlloc() {}
+
+func xRegSave(gp *g) {}
+
+//go:nosplit
+func xRegRestore(gp *g) {}
+
+func (*xRegPerP) free() {}
diff --git a/src/runtime/preempt_xreg.go b/src/runtime/preempt_xreg.go
new file mode 100644
index 00000000000..9e05455ddbb
--- /dev/null
+++ b/src/runtime/preempt_xreg.go
@@ -0,0 +1,137 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64
+
+// This provides common support for architectures that use extended register
+// state in asynchronous preemption.
+//
+// While asynchronous preemption stores general-purpose (GP) registers on the
+// preempted goroutine's own stack, extended register state can be used to save
+// non-GP state off the stack. In particular, this is meant for large vector
+// register files. Currently, we assume this contains only scalar data, though
+// we could change this constraint by conservatively scanning this memory.
+//
+// For an architecture to support extended register state, it must provide a Go
+// definition of an xRegState type for storing the state, and its asyncPreempt
+// implementation must write this register state to p.xRegs.scratch.
+
+package runtime
+
+import (
+	"internal/runtime/sys"
+	"unsafe"
+)
+
+// xRegState is long-lived extended register state. It is allocated off-heap and
+// manually managed.
+type xRegState struct {
+	_    sys.NotInHeap // Allocated from xRegAlloc
+	regs xRegs
+}
+
+// xRegPerG stores extended register state while a goroutine is asynchronously
+// preempted. This is nil otherwise, so we can reuse a (likely small) pool of
+// xRegState objects.
+type xRegPerG struct {
+	state *xRegState
+}
+
+type xRegPerP struct {
+	// scratch temporary per-P space where [asyncPreempt] saves the register
+	// state before entering Go. It's quickly copied to per-G state.
+	scratch xRegs
+
+	// cache is a 1-element allocation cache of extended register state used by
+	// asynchronous preemption. On entry to preemption, this is used as a simple
+	// allocation cache. On exit from preemption, the G's xRegState is always
+	// stored here where it can be restored, and later either freed or reused
+	// for another preemption. On exit, this serves the dual purpose of
+	// delay-freeing the allocated xRegState until after we've definitely
+	// restored it.
+	cache *xRegState
+}
+
+// xRegAlloc allocates xRegState objects.
+var xRegAlloc struct {
+	lock  mutex
+	alloc fixalloc
+}
+
+func xRegInitAlloc() {
+	lockInit(&xRegAlloc.lock, lockRankXRegAlloc)
+	xRegAlloc.alloc.init(unsafe.Sizeof(xRegState{}), nil, nil, &memstats.other_sys)
+}
+
+// xRegSave saves the extended register state on this P to gp.
+//
+// This must run on the system stack because it assumes the P won't change.
+//
+//go:systemstack
+func xRegSave(gp *g) {
+	if gp.xRegs.state != nil {
+		// Double preempt?
+		throw("gp.xRegState.p != nil on async preempt")
+	}
+
+	// Get the place to save the register state.
+	var dest *xRegState
+	pp := gp.m.p.ptr()
+	if pp.xRegs.cache != nil {
+		// Use the cached allocation.
+		dest = pp.xRegs.cache
+		pp.xRegs.cache = nil
+	} else {
+		// Allocate a new save block.
+		lock(&xRegAlloc.lock)
+		dest = (*xRegState)(xRegAlloc.alloc.alloc())
+		unlock(&xRegAlloc.lock)
+	}
+
+	// Copy state saved in the scratchpad to dest.
+	//
+	// If we ever need to save less state (e.g., avoid saving vector registers
+	// that aren't in use), we could have multiple allocation pools for
+	// different size states and copy only the registers we need.
+	dest.regs = pp.xRegs.scratch
+
+	// Save on the G.
+	gp.xRegs.state = dest
+}
+
+// xRegRestore prepares the extended register state on gp to be restored.
+//
+// It moves the state to gp.m.p.xRegs.cache where [asyncPreempt] expects to find
+// it. This means nothing else may use the cache between this call and the
+// return to asyncPreempt. This is not quite symmetric with [xRegSave], which
+// uses gp.m.p.xRegs.scratch. By using cache instead, we save a block copy.
+//
+// This is called with asyncPreempt on the stack and thus must not grow the
+// stack.
+//
+//go:nosplit
+func xRegRestore(gp *g) {
+	if gp.xRegs.state == nil {
+		throw("gp.xRegState.p == nil on return from async preempt")
+	}
+	// If the P has a block cached on it, free that so we can replace it.
+	pp := gp.m.p.ptr()
+	if pp.xRegs.cache != nil {
+		// Don't grow the G stack.
+		systemstack(func() {
+			pp.xRegs.free()
+		})
+	}
+	pp.xRegs.cache = gp.xRegs.state
+	gp.xRegs.state = nil
+}
+
+func (xRegs *xRegPerP) free() {
+	if xRegs.cache != nil {
+		lock(&xRegAlloc.lock)
+		xRegAlloc.alloc.free(unsafe.Pointer(xRegs.cache))
+		xRegs.cache = nil
+		unlock(&xRegAlloc.lock)
+	}
+}
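The scratch/cache dance above is the core of the design, so here is a self-contained sketch of the same protocol in ordinary Go, with sync.Mutex and a slice-based free list standing in for the runtime's mutex and fixalloc. It deliberately ignores the real constraints (nosplit, no write barriers, system-stack only); it only shows how the one-element cache acts as an allocation cache on the way into a preemption and as a delayed-free slot on the way out.

package main

import (
	"fmt"
	"sync"
)

type regs [256]byte

type state struct{ regs regs }

var alloc struct {
	sync.Mutex
	free []*state
}

func allocState() *state {
	alloc.Lock()
	defer alloc.Unlock()
	if n := len(alloc.free); n > 0 {
		s := alloc.free[n-1]
		alloc.free = alloc.free[:n-1]
		return s
	}
	return new(state)
}

func freeState(s *state) {
	alloc.Lock()
	alloc.free = append(alloc.free, s)
	alloc.Unlock()
}

type perP struct {
	scratch regs   // the assembly prologue spills registers here first
	cache   *state // 1-element allocation and restore cache
}

type perG struct{ state *state }

// save mirrors xRegSave: reuse the P's cached block if there is one, otherwise
// allocate, then copy the scratch area into it and hang it off the G.
func save(pp *perP, gp *perG) {
	dest := pp.cache
	pp.cache = nil
	if dest == nil {
		dest = allocState()
	}
	dest.regs = pp.scratch
	gp.state = dest
}

// restore mirrors xRegRestore: free whatever block is already cached on the P,
// then park the G's block there so the epilogue can restore from it. Freeing
// the G's own block is delayed until a later preemption reuses or replaces the
// cache, after the registers have been reloaded.
func restore(pp *perP, gp *perG) {
	if pp.cache != nil {
		freeState(pp.cache)
	}
	pp.cache = gp.state
	gp.state = nil
}

func main() {
	pp, gp := &perP{}, &perG{}
	pp.scratch[0] = 42
	save(pp, gp)    // entry to a preemption
	restore(pp, gp) // exit from the preemption
	fmt.Println("restored from cache:", pp.cache.regs[0])
}

Parking the block in the P's cache on restore, rather than freeing it immediately, is what lets the assembly epilogue read the registers back after xRegRestore has already returned; the block is only recycled by a later save or by (*xRegPerP).free.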
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index ec66384a75f..25d39d9ba38 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -5838,6 +5838,7 @@ func (pp *p) destroy() {
 	pp.gcAssistTime = 0
 	gcCleanups.queued += pp.cleanupsQueued
 	pp.cleanupsQueued = 0
+	pp.xRegs.free()
 	pp.status = _Pdead
 }
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 29e9b8a7b99..b5d2dcefade 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -492,6 +492,10 @@ type g struct {
 	coroarg *coro // argument during coroutine transfers
 	bubble  *synctestBubble
 
+	// xRegs stores the extended register state if this G has been
+	// asynchronously preempted.
+	xRegs xRegPerG
+
 	// Per-G tracer state.
 	trace gTraceState
 
@@ -760,6 +764,11 @@ type p struct {
 	// gcStopTime is the nanotime timestamp that this P last entered _Pgcstop.
 	gcStopTime int64
 
+	// xRegs is the per-P extended register state used by asynchronous
+	// preemption. This is an empty struct on platforms that don't use extended
+	// register state.
+	xRegs xRegPerP
+
 	// Padding is no longer needed. False sharing is now not a worry because p is large enough
 	// that its size class is an integer multiple of the cache line size (for any of our architectures).
 }
diff --git a/src/runtime/sizeof_test.go b/src/runtime/sizeof_test.go
index a5dc8aed344..de859866a5a 100644
--- a/src/runtime/sizeof_test.go
+++ b/src/runtime/sizeof_test.go
@@ -15,13 +15,18 @@ import (
 
 func TestSizeof(t *testing.T) {
 	const _64bit = unsafe.Sizeof(uintptr(0)) == 8
+	const xreg = unsafe.Sizeof(runtime.XRegPerG{}) // Varies per architecture
 	var tests = []struct {
 		val    any     // type as a value
 		_32bit uintptr // size on 32bit platforms
 		_64bit uintptr // size on 64bit platforms
 	}{
-		{runtime.G{}, 280, 440},   // g, but exported for testing
-		{runtime.Sudog{}, 56, 88}, // sudog, but exported for testing
+		{runtime.G{}, 280 + xreg, 440 + xreg}, // g, but exported for testing
+		{runtime.Sudog{}, 56, 88},             // sudog, but exported for testing
+	}
+
+	if xreg > runtime.PtrSize {
+		t.Errorf("unsafe.Sizeof(xRegPerG) = %d, want <= %d", xreg, runtime.PtrSize)
 	}
 
 	for _, tt := range tests {
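Nothing in this patch changes user-visible APIs, but the new path can be exercised from ordinary Go code. The program below is only a rough exercise, not a test from this patch, and it cannot force the compiler to keep values in X registers at the exact preemption point; it runs call-free floating-point loops, which can only be interrupted asynchronously, while another goroutine forces preemption via runtime.GC, and compares the results with a serial run.

package main

import (
	"fmt"
	"runtime"
	"sync"
	"time"
)

func kernel(seed float64, iters int) float64 {
	// Tight, call-free loop: the only preemption points inside it are
	// asynchronous ones, which is the path this patch changes.
	a, b, c, d := seed, seed+0.1, seed+0.2, seed+0.3
	for i := 0; i < iters; i++ {
		a = 3.7 * a * (1 - a)
		b = 3.6 * b * (1 - b)
		c = 3.5 * c * (1 - c)
		d = 3.4 * d * (1 - d)
	}
	return a + b + c + d
}

func main() {
	const iters = 1 << 23
	want := kernel(0.3, iters) // reference result, computed serially

	stop := make(chan struct{})
	go func() {
		// Force frequent preemption of the running goroutines.
		for {
			select {
			case <-stop:
				return
			default:
				runtime.GC()
				time.Sleep(time.Millisecond)
			}
		}
	}()

	var wg sync.WaitGroup
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			if got := kernel(0.3, iters); got != want {
				fmt.Println("mismatch:", got, "!=", want)
			}
		}()
	}
	wg.Wait()
	close(stop)
	fmt.Println("done")
}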