mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
syscall: syscall.AllThreadsSyscall signal handling fixes
The runtime support for syscall.AllThreadsSyscall() functions had some corner case deadlock issues when signal handling was in use. This was observed in at least 3 build test failures on ppc64 and amd64 architecture CGO_ENABLED=0 builds over the last few months. The fixes involve more controlled handling of signals while the AllThreads mechanism is being executed. Further details are discussed in bug #44193. The all-threads syscall support is new in go1.16, so earlier releases are not affected by this bug. Fixes #44193 Change-Id: I01ba8508a6e1bb2d872751f50da86dd07911a41d Reviewed-on: https://go-review.googlesource.com/c/go/+/305149 Reviewed-by: Michael Pratt <mpratt@google.com> Trust: Michael Pratt <mpratt@google.com> Trust: Ian Lance Taylor <iant@golang.org> Run-TryBot: Michael Pratt <mpratt@google.com> TryBot-Result: Go Bot <gobot@golang.org>
This commit is contained in:
parent
54af9fd9e6
commit
7e97e4e8cc
4 changed files with 96 additions and 7 deletions
|
|
@ -16,6 +16,7 @@ import (
|
|||
"os"
|
||||
"os/exec"
|
||||
"runtime"
|
||||
"runtime/trace"
|
||||
"strconv"
|
||||
"sync"
|
||||
"syscall"
|
||||
|
|
@ -854,3 +855,44 @@ func TestNotifyContextStringer(t *testing.T) {
|
|||
t.Errorf("c.String() = %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// #44193 test signal handling while stopping and starting the world.
|
||||
func TestSignalTrace(t *testing.T) {
|
||||
done := make(chan struct{})
|
||||
quit := make(chan struct{})
|
||||
c := make(chan os.Signal, 1)
|
||||
Notify(c, syscall.SIGHUP)
|
||||
|
||||
// Source and sink for signals busy loop unsynchronized with
|
||||
// trace starts and stops. We are ultimately validating that
|
||||
// signals and runtime.(stop|start)TheWorldGC are compatible.
|
||||
go func() {
|
||||
defer close(done)
|
||||
defer Stop(c)
|
||||
pid := syscall.Getpid()
|
||||
for {
|
||||
select {
|
||||
case <-quit:
|
||||
return
|
||||
default:
|
||||
syscall.Kill(pid, syscall.SIGHUP)
|
||||
}
|
||||
waitSig(t, c, syscall.SIGHUP)
|
||||
}
|
||||
}()
|
||||
|
||||
for i := 0; i < 100; i++ {
|
||||
buf := new(bytes.Buffer)
|
||||
if err := trace.Start(buf); err != nil {
|
||||
t.Fatalf("[%d] failed to start tracing: %v", i, err)
|
||||
}
|
||||
time.After(1 * time.Microsecond)
|
||||
trace.Stop()
|
||||
size := buf.Len()
|
||||
if size == 0 {
|
||||
t.Fatalf("[%d] trace is empty", i)
|
||||
}
|
||||
}
|
||||
close(quit)
|
||||
<-done
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1402,6 +1402,9 @@ func mPark() {
|
|||
g := getg()
|
||||
for {
|
||||
notesleep(&g.m.park)
|
||||
// Note, because of signal handling by this parked m,
|
||||
// a preemptive mDoFixup() may actually occur via
|
||||
// mDoFixupAndOSYield(). (See golang.org/issue/44193)
|
||||
noteclear(&g.m.park)
|
||||
if !mDoFixup() {
|
||||
return
|
||||
|
|
@ -1635,6 +1638,22 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
|
|||
for atomic.Load(&sched.sysmonStarting) != 0 {
|
||||
osyield()
|
||||
}
|
||||
|
||||
// We don't want this thread to handle signals for the
|
||||
// duration of this critical section. The underlying issue
|
||||
// being that this locked coordinating m is the one monitoring
|
||||
// for fn() execution by all the other m's of the runtime,
|
||||
// while no regular go code execution is permitted (the world
|
||||
// is stopped). If this present m were to get distracted to
|
||||
// run signal handling code, and find itself waiting for a
|
||||
// second thread to execute go code before being able to
|
||||
// return from that signal handling, a deadlock will result.
|
||||
// (See golang.org/issue/44193.)
|
||||
lockOSThread()
|
||||
var sigmask sigset
|
||||
sigsave(&sigmask)
|
||||
sigblock(false)
|
||||
|
||||
stopTheWorldGC("doAllThreadsSyscall")
|
||||
if atomic.Load(&newmHandoff.haveTemplateThread) != 0 {
|
||||
// Ensure that there are no in-flight thread
|
||||
|
|
@ -1686,6 +1705,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
|
|||
// the possibility of racing with mp.
|
||||
lock(&mp.mFixup.lock)
|
||||
mp.mFixup.fn = fn
|
||||
atomic.Store(&mp.mFixup.used, 1)
|
||||
if mp.doesPark {
|
||||
// For non-service threads this will
|
||||
// cause the wakeup to be short lived
|
||||
|
|
@ -1702,9 +1722,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
|
|||
if mp.procid == tid {
|
||||
continue
|
||||
}
|
||||
lock(&mp.mFixup.lock)
|
||||
done = done && (mp.mFixup.fn == nil)
|
||||
unlock(&mp.mFixup.lock)
|
||||
done = atomic.Load(&mp.mFixup.used) == 0
|
||||
}
|
||||
if done {
|
||||
break
|
||||
|
|
@ -1731,6 +1749,8 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
|
|||
unlock(&mFixupRace.lock)
|
||||
}
|
||||
startTheWorldGC()
|
||||
msigrestore(sigmask)
|
||||
unlockOSThread()
|
||||
}
|
||||
|
||||
// runSafePointFn runs the safe point function, if any, for this P.
|
||||
|
|
@ -2225,9 +2245,21 @@ var mFixupRace struct {
|
|||
// mDoFixup runs any outstanding fixup function for the running m.
|
||||
// Returns true if a fixup was outstanding and actually executed.
|
||||
//
|
||||
// Note: to avoid deadlocks, and the need for the fixup function
|
||||
// itself to be async safe, signals are blocked for the working m
|
||||
// while it holds the mFixup lock. (See golang.org/issue/44193)
|
||||
//
|
||||
//go:nosplit
|
||||
func mDoFixup() bool {
|
||||
_g_ := getg()
|
||||
if used := atomic.Load(&_g_.m.mFixup.used); used == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
// slow path - if fixup fn is used, block signals and lock.
|
||||
var sigmask sigset
|
||||
sigsave(&sigmask)
|
||||
sigblock(false)
|
||||
lock(&_g_.m.mFixup.lock)
|
||||
fn := _g_.m.mFixup.fn
|
||||
if fn != nil {
|
||||
|
|
@ -2244,7 +2276,6 @@ func mDoFixup() bool {
|
|||
// is more obviously safe.
|
||||
throw("GC must be disabled to protect validity of fn value")
|
||||
}
|
||||
*(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
|
||||
if _g_.racectx != 0 || !raceenabled {
|
||||
fn(false)
|
||||
} else {
|
||||
|
|
@ -2259,11 +2290,24 @@ func mDoFixup() bool {
|
|||
_g_.racectx = 0
|
||||
unlock(&mFixupRace.lock)
|
||||
}
|
||||
*(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
|
||||
atomic.Store(&_g_.m.mFixup.used, 0)
|
||||
}
|
||||
unlock(&_g_.m.mFixup.lock)
|
||||
msigrestore(sigmask)
|
||||
return fn != nil
|
||||
}
|
||||
|
||||
// mDoFixupAndOSYield is called when an m is unable to send a signal
|
||||
// because the allThreadsSyscall mechanism is in progress. That is, an
|
||||
// mPark() has been interrupted with this signal handler so we need to
|
||||
// ensure the fixup is executed from this context.
|
||||
//go:nosplit
|
||||
func mDoFixupAndOSYield() {
|
||||
mDoFixup()
|
||||
osyield()
|
||||
}
|
||||
|
||||
// templateThread is a thread in a known-good state that exists solely
|
||||
// to start new threads in known-good states when the calling thread
|
||||
// may not be in a good state.
|
||||
|
|
|
|||
|
|
@ -554,10 +554,13 @@ type m struct {
|
|||
syscalltick uint32
|
||||
freelink *m // on sched.freem
|
||||
|
||||
// mFixup is used to synchronize OS related m state (credentials etc)
|
||||
// use mutex to access.
|
||||
// mFixup is used to synchronize OS related m state
|
||||
// (credentials etc) use mutex to access. To avoid deadlocks
|
||||
// an atomic.Load() of used being zero in mDoFixupFn()
|
||||
// guarantees fn is nil.
|
||||
mFixup struct {
|
||||
lock mutex
|
||||
used uint32
|
||||
fn func(bool) bool
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -120,7 +120,7 @@ Send:
|
|||
}
|
||||
case sigFixup:
|
||||
// nothing to do - we need to wait for sigIdle.
|
||||
osyield()
|
||||
mDoFixupAndOSYield()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue