runtime: print a stack trace at "morestack on g0"

Error like "morestack on g0" is one of the errors that is very
hard to debug, because often it doesn't print a useful stack trace.
The runtime doesn't directly print a stack trace because it is
a bad stack state to call print. Sometimes the SIGABRT may trigger
a traceback, but sometimes not especially in a cgo binary. Even if
it triggers a traceback it often does not include the stack trace
of the bad stack.

This CL makes it explicitly print a stack trace and throw. The
idea is to have some space as an "emergency" crash stack. When the
stack is in a really bad state, we switch to the crash stack and
do a traceback.

Currently only implemented on AMD64 and ARM64.

TODO: also handle errors like "morestack on gsignal" and bad
systemstack. Also handle other architectures.

Change-Id: Ibfc397202f2bb0737c5cbe99f2763de83301c1c1
Reviewed-on: https://go-review.googlesource.com/c/go/+/419435
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
This commit is contained in:
Cherry Mui 2023-08-28 14:57:29 -04:00
parent 29b80397a8
commit 0262ea1ff9
6 changed files with 147 additions and 27 deletions

View file

@ -12,3 +12,11 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0
// See map.go comment on the need for this routine.
TEXT ·mapinitnoop<ABIInternal>(SB),NOSPLIT,$0-0
RET
#ifndef GOARCH_amd64
#ifndef GOARCH_arm64
// stub to appease shared build mode.
TEXT ·switchToCrashStack0<ABIInternal>(SB),NOSPLIT,$0-0
UNDEF
#endif
#endif

View file

@ -537,6 +537,30 @@ bad:
CALL AX
INT $3
// func switchToCrashStack0(fn func())
TEXT runtime·switchToCrashStack0<ABIInternal>(SB), NOSPLIT, $0-8
MOVQ g_m(R14), BX // curm
// set g to gcrash
LEAQ runtime·gcrash(SB), R14 // g = &gcrash
MOVQ BX, g_m(R14) // g.m = curm
MOVQ R14, m_g0(BX) // curm.g0 = g
get_tls(CX)
MOVQ R14, g(CX)
// switch to crashstack
MOVQ (g_stack+stack_hi)(R14), BX
SUBQ $(4*8), BX
MOVQ BX, SP
// call target function
MOVQ AX, DX
MOVQ 0(AX), AX
CALL AX
// should never return
CALL runtime·abort(SB)
UNDEF
/*
* support for morestack
@ -551,17 +575,26 @@ bad:
TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
// Cannot grow scheduler stack (m->g0).
get_tls(CX)
MOVQ g(CX), BX
MOVQ g_m(BX), BX
MOVQ m_g0(BX), SI
CMPQ g(CX), SI
MOVQ g(CX), DI // DI = g
MOVQ g_m(DI), BX // BX = m
// Set g->sched to context in f.
MOVQ 0(SP), AX // f's PC
MOVQ AX, (g_sched+gobuf_pc)(DI)
LEAQ 8(SP), AX // f's SP
MOVQ AX, (g_sched+gobuf_sp)(DI)
MOVQ BP, (g_sched+gobuf_bp)(DI)
MOVQ DX, (g_sched+gobuf_ctxt)(DI)
MOVQ m_g0(BX), SI // SI = m.g0
CMPQ DI, SI
JNE 3(PC)
CALL runtime·badmorestackg0(SB)
CALL runtime·abort(SB)
// Cannot grow signal stack (m->gsignal).
MOVQ m_gsignal(BX), SI
CMPQ g(CX), SI
CMPQ DI, SI
JNE 3(PC)
CALL runtime·badmorestackgsignal(SB)
CALL runtime·abort(SB)
@ -573,17 +606,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
MOVQ AX, (m_morebuf+gobuf_pc)(BX)
LEAQ 16(SP), AX // f's caller's SP
MOVQ AX, (m_morebuf+gobuf_sp)(BX)
get_tls(CX)
MOVQ g(CX), SI
MOVQ SI, (m_morebuf+gobuf_g)(BX)
// Set g->sched to context in f.
MOVQ 0(SP), AX // f's PC
MOVQ AX, (g_sched+gobuf_pc)(SI)
LEAQ 8(SP), AX // f's SP
MOVQ AX, (g_sched+gobuf_sp)(SI)
MOVQ BP, (g_sched+gobuf_bp)(SI)
MOVQ DX, (g_sched+gobuf_ctxt)(SI)
MOVQ DI, (m_morebuf+gobuf_g)(BX)
// Call newstack on m->g0's stack.
MOVQ m_g0(BX), BX

View file

@ -262,6 +262,30 @@ noswitch:
SUB $8, RSP, R29 // restore FP
B (R3)
// func switchToCrashStack0(fn func())
TEXT runtime·switchToCrashStack0<ABIInternal>(SB), NOSPLIT, $0-8
MOVD R0, R26 // context register
MOVD g_m(g), R1 // curm
// set g to gcrash
MOVD $runtime·gcrash(SB), g // g = &gcrash
BL runtime·save_g(SB) // clobbers R0
MOVD R1, g_m(g) // g.m = curm
MOVD g, m_g0(R1) // curm.g0 = g
// switch to crashstack
MOVD (g_stack+stack_hi)(g), R1
SUB $(4*8), R1
MOVD R1, RSP
// call target function
MOVD 0(R26), R0
CALL (R0)
// should never return
CALL runtime·abort(SB)
UNDEF
/*
* support for morestack
*/
@ -278,6 +302,16 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
// Cannot grow scheduler stack (m->g0).
MOVD g_m(g), R8
MOVD m_g0(R8), R4
// Called from f.
// Set g->sched to context in f
MOVD RSP, R0
MOVD R0, (g_sched+gobuf_sp)(g)
MOVD R29, (g_sched+gobuf_bp)(g)
MOVD LR, (g_sched+gobuf_pc)(g)
MOVD R3, (g_sched+gobuf_lr)(g)
MOVD R26, (g_sched+gobuf_ctxt)(g)
CMP g, R4
BNE 3(PC)
BL runtime·badmorestackg0(SB)
@ -290,15 +324,6 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
BL runtime·badmorestackgsignal(SB)
B runtime·abort(SB)
// Called from f.
// Set g->sched to context in f
MOVD RSP, R0
MOVD R0, (g_sched+gobuf_sp)(g)
MOVD R29, (g_sched+gobuf_bp)(g)
MOVD LR, (g_sched+gobuf_pc)(g)
MOVD R3, (g_sched+gobuf_lr)(g)
MOVD R26, (g_sched+gobuf_ctxt)(g)
// Called from f.
// Set m->morebuf to f's callers.
MOVD R3, (m_morebuf+gobuf_pc)(R8) // f's caller's PC

View file

@ -804,6 +804,14 @@ func TestG0StackOverflow(t *testing.T) {
if n := strings.Count(string(out), "morestack on g0\n"); n != 1 {
t.Fatalf("%s\n(exit status %v)", out, err)
}
if runtime.GOARCH == "amd64" || runtime.GOARCH == "arm64" {
// check for a stack trace
want := "runtime.stackOverflow"
if n := strings.Count(string(out), want); n < 5 {
t.Errorf("output does not contain %q at least 5 times:\n%s", want, out)
}
return // it's not a signal-style traceback
}
// Check that it's a signal-style traceback.
if runtime.GOOS != "windows" {
if want := "PC="; !strings.Contains(string(out), want) {

View file

@ -694,7 +694,7 @@ func G0StackOverflow() {
// The stack bounds for g0 stack is not always precise.
// Use an artificially small stack, to trigger a stack overflow
// without actually run out of the system stack (which may seg fault).
g0.stack.lo = sp - 4096
g0.stack.lo = sp - 4096 - stackSystem
g0.stackguard0 = g0.stack.lo + stackGuard
g0.stackguard1 = g0.stackguard0

View file

@ -516,7 +516,20 @@ func badreflectcall() {
//go:nosplit
//go:nowritebarrierrec
func badmorestackg0() {
writeErrStr("fatal: morestack on g0\n")
if !crashStackImplemented {
writeErrStr("fatal: morestack on g0\n")
return
}
g := getg()
switchToCrashStack(func() {
print("runtime: morestack on g0, stack [", hex(g.stack.lo), " ", hex(g.stack.hi), "], sp=", hex(g.sched.sp), ", called from\n")
g.m.traceback = 2 // include pc and sp in stack trace
traceback1(g.sched.pc, g.sched.sp, g.sched.lr, g, 0)
print("\n")
throw("morestack on g0")
})
}
//go:nosplit
@ -530,6 +543,49 @@ func badctxt() {
throw("ctxt != 0")
}
// crashstack is a space that can be used as the stack when it is
// crashing on bad stack conditions, e.g. morestack on g0.
// gcrash is the corresponding (fake) g.
var crashstack [16384]byte
var gcrash = g{
stack: stack{uintptr(unsafe.Pointer(&crashstack)), uintptr(unsafe.Pointer(&crashstack)) + unsafe.Sizeof(crashstack)},
stackguard0: uintptr(unsafe.Pointer(&crashstack)) + 1000,
stackguard1: uintptr(unsafe.Pointer(&crashstack)) + 1000,
}
var crashingG atomic.Pointer[g]
// Switch to crashstack and call fn, with special handling of
// concurrent and recursive cases.
//
// Nosplit as it is called in a bad stack condition (we know
// morestack would fail).
//
//go:nosplit
//go:nowritebarrierrec
func switchToCrashStack(fn func()) {
me := getg()
if crashingG.CompareAndSwapNoWB(nil, me) {
switchToCrashStack0(fn) // should never return
abort()
}
if crashingG.Load() == me {
// recursive crashing. too bad.
writeErrStr("fatal: recursive switchToCrashStack\n")
abort()
}
// Another g is crashing. Give it some time, hopefully it will finish traceback.
usleep_no_g(100)
writeErrStr("fatal: concurrent switchToCrashStack\n")
abort()
}
const crashStackImplemented = GOARCH == "amd64" || GOARCH == "arm64"
//go:noescape
func switchToCrashStack0(func()) // in assembly
func lockedOSThread() bool {
gp := getg()
return gp.lockedm != 0 && gp.m.lockedg != 0