syscall: syscall.AllThreadsSyscall signal handling fixes

The runtime support for syscall.AllThreadsSyscall() functions had
some corner case deadlock issues when signal handling was in use.
This was observed in at least 3 build test failures on ppc64 and
amd64 architecture CGO_ENABLED=0 builds over the last few months.

The fixes involve more controlled handling of signals while the
AllThreads mechanism is being executed. Further details are
discussed in bug #44193.

The all-threads syscall support is new in go1.16, so earlier
releases are not affected by this bug.

Fixes #44193

Change-Id: I01ba8508a6e1bb2d872751f50da86dd07911a41d
Reviewed-on: https://go-review.googlesource.com/c/go/+/305149
Reviewed-by: Michael Pratt <mpratt@google.com>
Trust: Michael Pratt <mpratt@google.com>
Trust: Ian Lance Taylor <iant@golang.org>
Run-TryBot: Michael Pratt <mpratt@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
This commit is contained in:
Andrew G. Morgan 2021-03-26 19:27:22 -07:00 committed by Michael Pratt
parent 54af9fd9e6
commit 7e97e4e8cc
4 changed files with 96 additions and 7 deletions

View file

@ -16,6 +16,7 @@ import (
"os"
"os/exec"
"runtime"
"runtime/trace"
"strconv"
"sync"
"syscall"
@ -854,3 +855,44 @@ func TestNotifyContextStringer(t *testing.T) {
t.Errorf("c.String() = %q, want %q", got, want)
}
}
// #44193 test signal handling while stopping and starting the world.
func TestSignalTrace(t *testing.T) {
done := make(chan struct{})
quit := make(chan struct{})
c := make(chan os.Signal, 1)
Notify(c, syscall.SIGHUP)
// Source and sink for signals busy loop unsynchronized with
// trace starts and stops. We are ultimately validating that
// signals and runtime.(stop|start)TheWorldGC are compatible.
go func() {
defer close(done)
defer Stop(c)
pid := syscall.Getpid()
for {
select {
case <-quit:
return
default:
syscall.Kill(pid, syscall.SIGHUP)
}
waitSig(t, c, syscall.SIGHUP)
}
}()
for i := 0; i < 100; i++ {
buf := new(bytes.Buffer)
if err := trace.Start(buf); err != nil {
t.Fatalf("[%d] failed to start tracing: %v", i, err)
}
time.After(1 * time.Microsecond)
trace.Stop()
size := buf.Len()
if size == 0 {
t.Fatalf("[%d] trace is empty", i)
}
}
close(quit)
<-done
}

View file

@ -1402,6 +1402,9 @@ func mPark() {
g := getg()
for {
notesleep(&g.m.park)
// Note, because of signal handling by this parked m,
// a preemptive mDoFixup() may actually occur via
// mDoFixupAndOSYield(). (See golang.org/issue/44193)
noteclear(&g.m.park)
if !mDoFixup() {
return
@ -1635,6 +1638,22 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
for atomic.Load(&sched.sysmonStarting) != 0 {
osyield()
}
// We don't want this thread to handle signals for the
// duration of this critical section. The underlying issue
// being that this locked coordinating m is the one monitoring
// for fn() execution by all the other m's of the runtime,
// while no regular go code execution is permitted (the world
// is stopped). If this present m were to get distracted to
// run signal handling code, and find itself waiting for a
// second thread to execute go code before being able to
// return from that signal handling, a deadlock will result.
// (See golang.org/issue/44193.)
lockOSThread()
var sigmask sigset
sigsave(&sigmask)
sigblock(false)
stopTheWorldGC("doAllThreadsSyscall")
if atomic.Load(&newmHandoff.haveTemplateThread) != 0 {
// Ensure that there are no in-flight thread
@ -1686,6 +1705,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
// the possibility of racing with mp.
lock(&mp.mFixup.lock)
mp.mFixup.fn = fn
atomic.Store(&mp.mFixup.used, 1)
if mp.doesPark {
// For non-service threads this will
// cause the wakeup to be short lived
@ -1702,9 +1722,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
if mp.procid == tid {
continue
}
lock(&mp.mFixup.lock)
done = done && (mp.mFixup.fn == nil)
unlock(&mp.mFixup.lock)
done = atomic.Load(&mp.mFixup.used) == 0
}
if done {
break
@ -1731,6 +1749,8 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
unlock(&mFixupRace.lock)
}
startTheWorldGC()
msigrestore(sigmask)
unlockOSThread()
}
// runSafePointFn runs the safe point function, if any, for this P.
@ -2225,9 +2245,21 @@ var mFixupRace struct {
// mDoFixup runs any outstanding fixup function for the running m.
// Returns true if a fixup was outstanding and actually executed.
//
// Note: to avoid deadlocks, and the need for the fixup function
// itself to be async safe, signals are blocked for the working m
// while it holds the mFixup lock. (See golang.org/issue/44193)
//
//go:nosplit
func mDoFixup() bool {
_g_ := getg()
if used := atomic.Load(&_g_.m.mFixup.used); used == 0 {
return false
}
// slow path - if fixup fn is used, block signals and lock.
var sigmask sigset
sigsave(&sigmask)
sigblock(false)
lock(&_g_.m.mFixup.lock)
fn := _g_.m.mFixup.fn
if fn != nil {
@ -2244,7 +2276,6 @@ func mDoFixup() bool {
// is more obviously safe.
throw("GC must be disabled to protect validity of fn value")
}
*(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
if _g_.racectx != 0 || !raceenabled {
fn(false)
} else {
@ -2259,11 +2290,24 @@ func mDoFixup() bool {
_g_.racectx = 0
unlock(&mFixupRace.lock)
}
*(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
atomic.Store(&_g_.m.mFixup.used, 0)
}
unlock(&_g_.m.mFixup.lock)
msigrestore(sigmask)
return fn != nil
}
// mDoFixupAndOSYield is called when an m is unable to send a signal
// because the allThreadsSyscall mechanism is in progress. That is, an
// mPark() has been interrupted with this signal handler so we need to
// ensure the fixup is executed from this context.
//go:nosplit
func mDoFixupAndOSYield() {
mDoFixup()
osyield()
}
// templateThread is a thread in a known-good state that exists solely
// to start new threads in known-good states when the calling thread
// may not be in a good state.

View file

@ -554,10 +554,13 @@ type m struct {
syscalltick uint32
freelink *m // on sched.freem
// mFixup is used to synchronize OS related m state (credentials etc)
// use mutex to access.
// mFixup is used to synchronize OS related m state
// (credentials etc) use mutex to access. To avoid deadlocks
// an atomic.Load() of used being zero in mDoFixupFn()
// guarantees fn is nil.
mFixup struct {
lock mutex
used uint32
fn func(bool) bool
}

View file

@ -120,7 +120,7 @@ Send:
}
case sigFixup:
// nothing to do - we need to wait for sigIdle.
osyield()
mDoFixupAndOSYield()
}
}