mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
syscall: syscall.AllThreadsSyscall signal handling fixes
The runtime support for syscall.AllThreadsSyscall() functions had some corner case deadlock issues when signal handling was in use. This was observed in at least 3 build test failures on ppc64 and amd64 architecture CGO_ENABLED=0 builds over the last few months. The fixes involve more controlled handling of signals while the AllThreads mechanism is being executed. Further details are discussed in bug #44193. The all-threads syscall support is new in go1.16, so earlier releases are not affected by this bug. Fixes #44193 Change-Id: I01ba8508a6e1bb2d872751f50da86dd07911a41d Reviewed-on: https://go-review.googlesource.com/c/go/+/305149 Reviewed-by: Michael Pratt <mpratt@google.com> Trust: Michael Pratt <mpratt@google.com> Trust: Ian Lance Taylor <iant@golang.org> Run-TryBot: Michael Pratt <mpratt@google.com> TryBot-Result: Go Bot <gobot@golang.org>
This commit is contained in:
parent
54af9fd9e6
commit
7e97e4e8cc
4 changed files with 96 additions and 7 deletions
|
|
@ -16,6 +16,7 @@ import (
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"runtime"
|
"runtime"
|
||||||
|
"runtime/trace"
|
||||||
"strconv"
|
"strconv"
|
||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
|
@ -854,3 +855,44 @@ func TestNotifyContextStringer(t *testing.T) {
|
||||||
t.Errorf("c.String() = %q, want %q", got, want)
|
t.Errorf("c.String() = %q, want %q", got, want)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// #44193 test signal handling while stopping and starting the world.
|
||||||
|
func TestSignalTrace(t *testing.T) {
|
||||||
|
done := make(chan struct{})
|
||||||
|
quit := make(chan struct{})
|
||||||
|
c := make(chan os.Signal, 1)
|
||||||
|
Notify(c, syscall.SIGHUP)
|
||||||
|
|
||||||
|
// Source and sink for signals busy loop unsynchronized with
|
||||||
|
// trace starts and stops. We are ultimately validating that
|
||||||
|
// signals and runtime.(stop|start)TheWorldGC are compatible.
|
||||||
|
go func() {
|
||||||
|
defer close(done)
|
||||||
|
defer Stop(c)
|
||||||
|
pid := syscall.Getpid()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-quit:
|
||||||
|
return
|
||||||
|
default:
|
||||||
|
syscall.Kill(pid, syscall.SIGHUP)
|
||||||
|
}
|
||||||
|
waitSig(t, c, syscall.SIGHUP)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
for i := 0; i < 100; i++ {
|
||||||
|
buf := new(bytes.Buffer)
|
||||||
|
if err := trace.Start(buf); err != nil {
|
||||||
|
t.Fatalf("[%d] failed to start tracing: %v", i, err)
|
||||||
|
}
|
||||||
|
time.After(1 * time.Microsecond)
|
||||||
|
trace.Stop()
|
||||||
|
size := buf.Len()
|
||||||
|
if size == 0 {
|
||||||
|
t.Fatalf("[%d] trace is empty", i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close(quit)
|
||||||
|
<-done
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -1402,6 +1402,9 @@ func mPark() {
|
||||||
g := getg()
|
g := getg()
|
||||||
for {
|
for {
|
||||||
notesleep(&g.m.park)
|
notesleep(&g.m.park)
|
||||||
|
// Note, because of signal handling by this parked m,
|
||||||
|
// a preemptive mDoFixup() may actually occur via
|
||||||
|
// mDoFixupAndOSYield(). (See golang.org/issue/44193)
|
||||||
noteclear(&g.m.park)
|
noteclear(&g.m.park)
|
||||||
if !mDoFixup() {
|
if !mDoFixup() {
|
||||||
return
|
return
|
||||||
|
|
@ -1635,6 +1638,22 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
|
||||||
for atomic.Load(&sched.sysmonStarting) != 0 {
|
for atomic.Load(&sched.sysmonStarting) != 0 {
|
||||||
osyield()
|
osyield()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// We don't want this thread to handle signals for the
|
||||||
|
// duration of this critical section. The underlying issue
|
||||||
|
// being that this locked coordinating m is the one monitoring
|
||||||
|
// for fn() execution by all the other m's of the runtime,
|
||||||
|
// while no regular go code execution is permitted (the world
|
||||||
|
// is stopped). If this present m were to get distracted to
|
||||||
|
// run signal handling code, and find itself waiting for a
|
||||||
|
// second thread to execute go code before being able to
|
||||||
|
// return from that signal handling, a deadlock will result.
|
||||||
|
// (See golang.org/issue/44193.)
|
||||||
|
lockOSThread()
|
||||||
|
var sigmask sigset
|
||||||
|
sigsave(&sigmask)
|
||||||
|
sigblock(false)
|
||||||
|
|
||||||
stopTheWorldGC("doAllThreadsSyscall")
|
stopTheWorldGC("doAllThreadsSyscall")
|
||||||
if atomic.Load(&newmHandoff.haveTemplateThread) != 0 {
|
if atomic.Load(&newmHandoff.haveTemplateThread) != 0 {
|
||||||
// Ensure that there are no in-flight thread
|
// Ensure that there are no in-flight thread
|
||||||
|
|
@ -1686,6 +1705,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
|
||||||
// the possibility of racing with mp.
|
// the possibility of racing with mp.
|
||||||
lock(&mp.mFixup.lock)
|
lock(&mp.mFixup.lock)
|
||||||
mp.mFixup.fn = fn
|
mp.mFixup.fn = fn
|
||||||
|
atomic.Store(&mp.mFixup.used, 1)
|
||||||
if mp.doesPark {
|
if mp.doesPark {
|
||||||
// For non-service threads this will
|
// For non-service threads this will
|
||||||
// cause the wakeup to be short lived
|
// cause the wakeup to be short lived
|
||||||
|
|
@ -1702,9 +1722,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
|
||||||
if mp.procid == tid {
|
if mp.procid == tid {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
lock(&mp.mFixup.lock)
|
done = atomic.Load(&mp.mFixup.used) == 0
|
||||||
done = done && (mp.mFixup.fn == nil)
|
|
||||||
unlock(&mp.mFixup.lock)
|
|
||||||
}
|
}
|
||||||
if done {
|
if done {
|
||||||
break
|
break
|
||||||
|
|
@ -1731,6 +1749,8 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
|
||||||
unlock(&mFixupRace.lock)
|
unlock(&mFixupRace.lock)
|
||||||
}
|
}
|
||||||
startTheWorldGC()
|
startTheWorldGC()
|
||||||
|
msigrestore(sigmask)
|
||||||
|
unlockOSThread()
|
||||||
}
|
}
|
||||||
|
|
||||||
// runSafePointFn runs the safe point function, if any, for this P.
|
// runSafePointFn runs the safe point function, if any, for this P.
|
||||||
|
|
@ -2225,9 +2245,21 @@ var mFixupRace struct {
|
||||||
// mDoFixup runs any outstanding fixup function for the running m.
|
// mDoFixup runs any outstanding fixup function for the running m.
|
||||||
// Returns true if a fixup was outstanding and actually executed.
|
// Returns true if a fixup was outstanding and actually executed.
|
||||||
//
|
//
|
||||||
|
// Note: to avoid deadlocks, and the need for the fixup function
|
||||||
|
// itself to be async safe, signals are blocked for the working m
|
||||||
|
// while it holds the mFixup lock. (See golang.org/issue/44193)
|
||||||
|
//
|
||||||
//go:nosplit
|
//go:nosplit
|
||||||
func mDoFixup() bool {
|
func mDoFixup() bool {
|
||||||
_g_ := getg()
|
_g_ := getg()
|
||||||
|
if used := atomic.Load(&_g_.m.mFixup.used); used == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// slow path - if fixup fn is used, block signals and lock.
|
||||||
|
var sigmask sigset
|
||||||
|
sigsave(&sigmask)
|
||||||
|
sigblock(false)
|
||||||
lock(&_g_.m.mFixup.lock)
|
lock(&_g_.m.mFixup.lock)
|
||||||
fn := _g_.m.mFixup.fn
|
fn := _g_.m.mFixup.fn
|
||||||
if fn != nil {
|
if fn != nil {
|
||||||
|
|
@ -2244,7 +2276,6 @@ func mDoFixup() bool {
|
||||||
// is more obviously safe.
|
// is more obviously safe.
|
||||||
throw("GC must be disabled to protect validity of fn value")
|
throw("GC must be disabled to protect validity of fn value")
|
||||||
}
|
}
|
||||||
*(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
|
|
||||||
if _g_.racectx != 0 || !raceenabled {
|
if _g_.racectx != 0 || !raceenabled {
|
||||||
fn(false)
|
fn(false)
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -2259,11 +2290,24 @@ func mDoFixup() bool {
|
||||||
_g_.racectx = 0
|
_g_.racectx = 0
|
||||||
unlock(&mFixupRace.lock)
|
unlock(&mFixupRace.lock)
|
||||||
}
|
}
|
||||||
|
*(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
|
||||||
|
atomic.Store(&_g_.m.mFixup.used, 0)
|
||||||
}
|
}
|
||||||
unlock(&_g_.m.mFixup.lock)
|
unlock(&_g_.m.mFixup.lock)
|
||||||
|
msigrestore(sigmask)
|
||||||
return fn != nil
|
return fn != nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// mDoFixupAndOSYield is called when an m is unable to send a signal
|
||||||
|
// because the allThreadsSyscall mechanism is in progress. That is, an
|
||||||
|
// mPark() has been interrupted with this signal handler so we need to
|
||||||
|
// ensure the fixup is executed from this context.
|
||||||
|
//go:nosplit
|
||||||
|
func mDoFixupAndOSYield() {
|
||||||
|
mDoFixup()
|
||||||
|
osyield()
|
||||||
|
}
|
||||||
|
|
||||||
// templateThread is a thread in a known-good state that exists solely
|
// templateThread is a thread in a known-good state that exists solely
|
||||||
// to start new threads in known-good states when the calling thread
|
// to start new threads in known-good states when the calling thread
|
||||||
// may not be in a good state.
|
// may not be in a good state.
|
||||||
|
|
|
||||||
|
|
@ -554,10 +554,13 @@ type m struct {
|
||||||
syscalltick uint32
|
syscalltick uint32
|
||||||
freelink *m // on sched.freem
|
freelink *m // on sched.freem
|
||||||
|
|
||||||
// mFixup is used to synchronize OS related m state (credentials etc)
|
// mFixup is used to synchronize OS related m state
|
||||||
// use mutex to access.
|
// (credentials etc) use mutex to access. To avoid deadlocks
|
||||||
|
// an atomic.Load() of used being zero in mDoFixupFn()
|
||||||
|
// guarantees fn is nil.
|
||||||
mFixup struct {
|
mFixup struct {
|
||||||
lock mutex
|
lock mutex
|
||||||
|
used uint32
|
||||||
fn func(bool) bool
|
fn func(bool) bool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -120,7 +120,7 @@ Send:
|
||||||
}
|
}
|
||||||
case sigFixup:
|
case sigFixup:
|
||||||
// nothing to do - we need to wait for sigIdle.
|
// nothing to do - we need to wait for sigIdle.
|
||||||
osyield()
|
mDoFixupAndOSYield()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue