diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go index 2b8ca549ad8..7d02363cc99 100644 --- a/src/runtime/crash_test.go +++ b/src/runtime/crash_test.go @@ -788,6 +788,9 @@ func TestPanicRace(t *testing.T) { // Test for issues #3934 and #20018. // We want to delay exiting until a panic print is complete. func TestPanicRace(t *testing.T) { + if race.Enabled { + t.Skip("racefini exits program before main can delay exiting") + } testenv.MustHaveGoRun(t) exe, err := buildTestProg(t, "testprog") diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index 9aaee8f90c9..3671c65ab73 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -412,6 +412,14 @@ func dumpgs() { forEachG(func(gp *g) { status := readgstatus(gp) // The world is stopped so gp will not be in a scan state. switch status { + case _Grunning: + // A goroutine in _Grunning is only safe to dump while it's in a syscall: + // the world is stopped, so it will just park without mutating its stack. + if gp.syscallsp != 0 { + dumpgoroutine(gp) + return + } + fallthrough default: print("runtime: unexpected G.status ", hex(status), "\n") throw("dumpgs in STW - bad status") diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go index 6ee2c285493..2710bf68652 100644 --- a/src/runtime/metrics.go +++ b/src/runtime/metrics.go @@ -818,9 +818,12 @@ func (a *schedStatsAggregate) compute() { a.gCreated += p.goroutinesCreated switch p.status { case _Prunning: - a.gRunning++ - case _Psyscall: - a.gNonGo++ + if thread, ok := setBlockOnExitSyscall(p); ok { + thread.resume() + a.gNonGo++ + } else { + a.gRunning++ + } case _Pgcstop: // The world is stopping or stopped. // This is fine. The results will be @@ -847,7 +850,7 @@ func (a *schedStatsAggregate) compute() { // Global run queue. a.gRunnable += uint64(sched.runq.size) - // Account for Gs that are in _Gsyscall without a P in _Psyscall. + // Account for Gs that are in _Gsyscall without a P. nGsyscallNoP := sched.nGsyscallNoP.Load() // nGsyscallNoP can go negative during temporary races. diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go index 19e90fbb337..d745d5f5b95 100644 --- a/src/runtime/mprof.go +++ b/src/runtime/mprof.go @@ -1535,7 +1535,18 @@ func doRecordGoroutineProfile(gp1 *g, pcbuf []uintptr) { // everything else, we just don't record the stack in the profile. return } - if readgstatus(gp1) == _Grunning { + // Double-check that we didn't make a grave mistake. If the G is running then in + // general, we cannot safely read its stack. + // + // However, there is one case where it's OK. There's a small window of time in + // exitsyscall where a goroutine could be in _Grunning as it's exiting a syscall. + // This is OK because the goroutine will not exit the syscall until it passes through + // a call to tryRecordGoroutineProfile. (An explicit one on the fast path, an + // implicit one via the scheduler on the slow path.) + // + // This is also why it's safe to check syscallsp here. The syscall path mutates + // syscallsp only after passing through tryRecordGoroutineProfile. + if readgstatus(gp1) == _Grunning && gp1.syscallsp == 0 { print("doRecordGoroutineProfile gp1=", gp1.goid, "\n") throw("cannot read stack of running goroutine") } diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index e34f0b10ea0..febfb69a3a3 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -914,7 +914,7 @@ type cpuStats struct { ScavengeTotalTime int64 IdleTime int64 // Time Ps spent in _Pidle. 
- UserTime int64 // Time Ps spent in _Prunning or _Psyscall that's not any of the above. + UserTime int64 // Time Ps spent in _Prunning that's not any of the above. TotalTime int64 // GOMAXPROCS * (monotonic wall clock time elapsed) } @@ -976,7 +976,7 @@ func (s *cpuStats) accumulate(now int64, gcMarkPhase bool) { // Compute userTime. We compute this indirectly as everything that's not the above. // // Since time spent in _Pgcstop is covered by gcPauseTime, and time spent in _Pidle - // is covered by idleTime, what we're left with is time spent in _Prunning and _Psyscall, + // is covered by idleTime, what we're left with is time spent in _Prunning, // the latter of which is fine because the P will either go idle or get used for something // else via sysmon. Meanwhile if we subtract GC time from whatever's left, we get non-GC // _Prunning time. Note that this still leaves time spent in sweeping and in the scheduler, diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go index 2ae625580df..1688ba910e6 100644 --- a/src/runtime/os_windows.go +++ b/src/runtime/os_windows.go @@ -191,8 +191,8 @@ type mOS struct { // complete. // // TODO(austin): We may not need this if preemption were more - // tightly synchronized on the G/P status and preemption - // blocked transition into _Gsyscall/_Psyscall. + // tightly synchronized on the G status and preemption + // blocked transition into _Gsyscall. preemptExtLock uint32 } diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go index 1044a6d7207..447c7399fcb 100644 --- a/src/runtime/preempt.go +++ b/src/runtime/preempt.go @@ -286,7 +286,7 @@ func resumeG(state suspendGState) { // //go:nosplit func canPreemptM(mp *m) bool { - return mp.locks == 0 && mp.mallocing == 0 && mp.preemptoff == "" && mp.p.ptr().status == _Prunning + return mp.locks == 0 && mp.mallocing == 0 && mp.preemptoff == "" && mp.p.ptr().status == _Prunning && mp.curg != nil && readgstatus(mp.curg)&^_Gscan != _Gsyscall } //go:generate go run mkpreempt.go diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 6c16effc954..081b9a28258 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -1654,29 +1654,21 @@ func stopTheWorldWithSema(reason stwReason) worldStop { sched.stopwait = gomaxprocs sched.gcwaiting.Store(true) preemptall() - // stop current P + + // Stop current P. gp.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic. gp.m.p.ptr().gcStopTime = start sched.stopwait-- - // try to retake all P's in Psyscall status - trace = traceAcquire() + + // Try to retake all P's in syscalls. for _, pp := range allp { - s := pp.status - if s == _Psyscall && atomic.Cas(&pp.status, s, _Pgcstop) { - if trace.ok() { - trace.ProcSteal(pp, false) - } - sched.nGsyscallNoP.Add(1) - pp.syscalltick++ - pp.gcStopTime = nanotime() - sched.stopwait-- + if thread, ok := setBlockOnExitSyscall(pp); ok { + thread.gcstopP() + thread.resume() } } - if trace.ok() { - traceRelease(trace) - } - // stop idle P's + // Stop idle Ps. now := nanotime() for { pp, _ := pidleget(now) @@ -1690,7 +1682,7 @@ func stopTheWorldWithSema(reason stwReason) worldStop { wait := sched.stopwait > 0 unlock(&sched.lock) - // wait for remaining P's to stop voluntarily + // Wait for remaining Ps to stop voluntarily. 
if wait { for { // wait for 100us, then try to re-preempt in case of any races @@ -2156,9 +2148,9 @@ func forEachPInternal(fn func(*p)) { } preemptall() - // Any P entering _Pidle or _Psyscall from now on will observe + // Any P entering _Pidle or a system call from now on will observe // p.runSafePointFn == 1 and will call runSafePointFn when - // changing its status to _Pidle/_Psyscall. + // changing its status to _Pidle. // Run safe point function for all idle Ps. sched.pidle will // not change because we hold sched.lock. @@ -2175,25 +2167,19 @@ func forEachPInternal(fn func(*p)) { // Run fn for the current P. fn(pp) - // Force Ps currently in _Psyscall into _Pidle and hand them + // Force Ps currently in a system call into _Pidle and hand them // off to induce safe point function execution. for _, p2 := range allp { - s := p2.status - // We need to be fine-grained about tracing here, since handoffp // might call into the tracer, and the tracer is non-reentrant. - trace := traceAcquire() - if s == _Psyscall && p2.runSafePointFn == 1 && atomic.Cas(&p2.status, s, _Pidle) { - if trace.ok() { - // It's important that we traceRelease before we call handoffp, which may also traceAcquire. - trace.ProcSteal(p2, false) - traceRelease(trace) - } - sched.nGsyscallNoP.Add(1) - p2.syscalltick++ + if atomic.Load(&p2.runSafePointFn) != 1 { + // Already ran it. + continue + } + if thread, ok := setBlockOnExitSyscall(p2); ok { + thread.takeP() + thread.resume() handoffp(p2) - } else if trace.ok() { - traceRelease(trace) } } @@ -2234,9 +2220,9 @@ func forEachPInternal(fn func(*p)) { // } // // runSafePointFn must be checked on any transition in to _Pidle or -// _Psyscall to avoid a race where forEachP sees that the P is running -// just before the P goes into _Pidle/_Psyscall and neither forEachP -// nor the P run the safe-point function. +// when entering a system call to avoid a race where forEachP sees +// that the P is running just before the P goes into _Pidle/system call +// and neither forEachP nor the P run the safe-point function. func runSafePointFn() { p := getg().m.p.ptr() // Resolve the race between forEachP running the safe-point @@ -2571,7 +2557,7 @@ func oneNewExtraM() { // So that the destructor would invoke dropm while the non-Go thread is exiting. // This is much faster since it avoids expensive signal-related syscalls. // -// This always runs without a P, so //go:nowritebarrierrec is required. +// This may run without a P, so //go:nowritebarrierrec is required. // // This may run with a different stack than was recorded in g0 (there is no // call to callbackUpdateSystemStack prior to dropm), so this must be @@ -4566,7 +4552,6 @@ func save(pc, sp, bp uintptr) { // //go:nosplit func reentersyscall(pc, sp, bp uintptr) { - trace := traceAcquire() gp := getg() // Disable preemption because during this function g is in Gsyscall status, @@ -4580,17 +4565,23 @@ func reentersyscall(pc, sp, bp uintptr) { gp.stackguard0 = stackPreempt gp.throwsplit = true + // Copy the syscalltick over so we can identify if the P got stolen later. + gp.m.syscalltick = gp.m.p.ptr().syscalltick + + pp := gp.m.p.ptr() + if pp.runSafePointFn != 0 { + // runSafePointFn may stack split if run on this stack + systemstack(runSafePointFn) + } + gp.m.oldp.set(pp) + // Leave SP around for GC and traceback. 
save(pc, sp, bp) gp.syscallsp = sp gp.syscallpc = pc gp.syscallbp = bp - casgstatus(gp, _Grunning, _Gsyscall) - if staticLockRanking { - // When doing static lock ranking casgstatus can call - // systemstack which clobbers g.sched. - save(pc, sp, bp) - } + + // Double-check sp and bp. if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp { systemstack(func() { print("entersyscall inconsistent sp ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n") @@ -4603,40 +4594,45 @@ func reentersyscall(pc, sp, bp uintptr) { throw("entersyscall") }) } - + trace := traceAcquire() if trace.ok() { + // Emit a trace event. Notably, actually emitting the event must happen before + // the casgstatus because it mutates the P, but the traceLocker must be held + // across the casgstatus so the transition is atomic with respect to the event. systemstack(func() { trace.GoSysCall() - traceRelease(trace) }) - // systemstack itself clobbers g.sched.{pc,sp} and we might - // need them later when the G is genuinely blocked in a - // syscall + // systemstack clobbered gp.sched, so restore it. save(pc, sp, bp) } - - if sched.sysmonwait.Load() { - systemstack(entersyscall_sysmon) - save(pc, sp, bp) - } - - if gp.m.p.ptr().runSafePointFn != 0 { - // runSafePointFn may stack split if run on this stack - systemstack(runSafePointFn) - save(pc, sp, bp) - } - - gp.m.syscalltick = gp.m.p.ptr().syscalltick - pp := gp.m.p.ptr() - pp.m = 0 - gp.m.oldp.set(pp) - gp.m.p = 0 - atomic.Store(&pp.status, _Psyscall) if sched.gcwaiting.Load() { - systemstack(entersyscall_gcwait) + // Optimization: If there's a pending STW, do the equivalent of + // entersyscallblock here at the last minute and immediately give + // away our P. + systemstack(func() { + entersyscallHandleGCWait(trace) + }) + // systemstack clobbered gp.sched, so restore it. + save(pc, sp, bp) + } + // As soon as we switch to _Gsyscall, we are in danger of losing our P. + // We must not touch it after this point. + casgstatus(gp, _Grunning, _Gsyscall) + if staticLockRanking { + // casgstatus clobbers gp.sched via systemstack under staticLockRanking. Restore it. + save(pc, sp, bp) + } + if trace.ok() { + // N.B. We don't need to go on the systemstack because traceRelease is very + // carefully recursively nosplit. This also means we don't need to worry + // about clobbering gp.sched. + traceRelease(trace) + } + if sched.sysmonwait.Load() { + systemstack(entersyscallWakeSysmon) + // systemstack clobbered gp.sched, so restore it. save(pc, sp, bp) } - gp.m.locks-- } @@ -4663,7 +4659,7 @@ func entersyscall() { reentersyscall(sys.GetCallerPC(), sys.GetCallerSP(), fp) } -func entersyscall_sysmon() { +func entersyscallWakeSysmon() { lock(&sched.lock) if sched.sysmonwait.Load() { sched.sysmonwait.Store(false) @@ -4672,25 +4668,19 @@ func entersyscall_sysmon() { unlock(&sched.lock) } -func entersyscall_gcwait() { +func entersyscallHandleGCWait(trace traceLocker) { gp := getg() - pp := gp.m.oldp.ptr() lock(&sched.lock) - trace := traceAcquire() - if sched.stopwait > 0 && atomic.Cas(&pp.status, _Psyscall, _Pgcstop) { + if sched.stopwait > 0 { + // Set our P to _Pgcstop so the STW can take it. + pp := gp.m.p.ptr() + pp.m = 0 + gp.m.p = 0 + atomic.Store(&pp.status, _Pgcstop) + if trace.ok() { - // This is a steal in the new tracer. While it's very likely - // that we were the ones to put this P into _Psyscall, between - // then and now it's totally possible it had been stolen and - // then put back into _Psyscall for us to acquire here. 
In such - // case ProcStop would be incorrect. - // - // TODO(mknyszek): Consider emitting a ProcStop instead when - // gp.m.syscalltick == pp.syscalltick, since then we know we never - // lost the P. - trace.ProcSteal(pp, true) - traceRelease(trace) + trace.ProcStop(pp) } sched.nGsyscallNoP.Add(1) pp.gcStopTime = nanotime() @@ -4698,8 +4688,6 @@ func entersyscall_gcwait() { if sched.stopwait--; sched.stopwait == 0 { notewakeup(&sched.stopnote) } - } else if trace.ok() { - traceRelease(trace) } unlock(&sched.lock) } @@ -4744,6 +4732,22 @@ func entersyscallblock() { throw("entersyscallblock") }) } + + // Once we switch to _Gsyscall, we can't safely touch + // our P anymore, so we need to hand it off beforehand. + // The tracer also needs to see the syscall before the P + // handoff, so the order here must be (1) trace, + // (2) handoff, (3) _Gsyscall switch. + trace := traceAcquire() + systemstack(func() { + if trace.ok() { + trace.GoSysCall() + } + handoffp(releasep()) + }) + // <-- + // Caution: we're in a small window where we are in _Grunning without a P. + // --> casgstatus(gp, _Grunning, _Gsyscall) if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp { systemstack(func() { @@ -4757,8 +4761,11 @@ func entersyscallblock() { throw("entersyscallblock") }) } - - systemstack(entersyscallblock_handoff) + if trace.ok() { + systemstack(func() { + traceRelease(trace) + }) + } // Resave for traceback during blocked call. save(sys.GetCallerPC(), sys.GetCallerSP(), getcallerfp()) @@ -4766,15 +4773,6 @@ func entersyscallblock() { gp.m.locks-- } -func entersyscallblock_handoff() { - trace := traceAcquire() - if trace.ok() { - trace.GoSysCall() - traceRelease(trace) - } - handoffp(releasep()) -} - // The goroutine g exited its system call. // Arrange for it to run on a cpu again. // This is called only from the go syscall library, not @@ -4802,13 +4800,87 @@ func exitsyscall() { if sys.GetCallerSP() > gp.syscallsp { throw("exitsyscall: syscall frame is no longer valid") } - gp.waitsince = 0 + + if sched.stopwait == freezeStopWait { + // Wedge ourselves if there's an outstanding freezetheworld. + // If we transition to running, we might end up with our traceback + // being taken twice. + systemstack(func() { + lock(&deadlock) + lock(&deadlock) + }) + } + + // Optimistically assume we're going to keep running, and switch to running. + // Before this point, our P wiring is not ours. Once we get past this point, + // we can access our P if we have it, otherwise we lost it. + // + // N.B. Because we're transitioning to _Grunning here, traceAcquire doesn't + // need to be held ahead of time. We're effectively atomic with respect to + // the tracer because we're non-preemptible and in the runtime. It can't stop + // us to read a bad status. + casgstatus(gp, _Gsyscall, _Grunning) + + // Caution: we're in a window where we may be in _Grunning without a P. + // Either we will grab a P or call exitsyscall0, where we'll switch to + // _Grunnable. + + // Grab and clear our old P. oldp := gp.m.oldp.ptr() - gp.m.oldp = 0 - if exitsyscallfast(oldp) { - // When exitsyscallfast returns success, we have a P so can now use - // write barriers + gp.m.oldp.set(nil) + + // Check if we still have a P, and if not, try to acquire an idle P. + pp := gp.m.p.ptr() + if pp != nil { + // Fast path: we still have our P. Just emit a syscall exit event. 
+ if trace := traceAcquire(); trace.ok() { + systemstack(func() { + // The truth is we truly never lost the P, but syscalltick + // is used to indicate whether the P should be treated as + // lost anyway. For example, when syscalltick is trashed by + // dropm. + // + // TODO(mknyszek): Consider a more explicit mechanism for this. + // Then syscalltick doesn't need to be trashed, and can be used + // exclusively by sysmon for deciding when it's time to retake. + if pp.syscalltick == gp.m.syscalltick { + trace.GoSysExit(false) + } else { + // Since we need to pretend we lost the P, but nobody ever + // took it, we need a ProcSteal event to model the loss. + // Then, continue with everything else we'd do if we lost + // the P. + trace.ProcSteal(pp) + trace.ProcStart() + trace.GoSysExit(true) + trace.GoStart() + } + traceRelease(trace) + }) + } + } else { + // Slow path: we lost our P. Try to get another one. + systemstack(func() { + // Try to get some other P. + if pp := exitsyscallTryGetP(oldp); pp != nil { + // Install the P. + acquirepNoTrace(pp) + + // We're going to start running again, so emit all the relevant events. + if trace := traceAcquire(); trace.ok() { + trace.ProcStart() + trace.GoSysExit(true) + trace.GoStart() + traceRelease(trace) + } + } + }) + pp = gp.m.p.ptr() + } + + // If we have a P, clean up and exit. + if pp != nil { if goroutineProfile.active { // Make sure that gp has had its stack written out to the goroutine // profile, exactly as it was when the goroutine profiler first @@ -4817,41 +4889,19 @@ func exitsyscall() { tryRecordGoroutineProfileWB(gp) }) } - trace := traceAcquire() - if trace.ok() { - lostP := oldp != gp.m.p.ptr() || gp.m.syscalltick != gp.m.p.ptr().syscalltick - systemstack(func() { - // Write out syscall exit eagerly. - // - // It's important that we write this *after* we know whether we - // lost our P or not (determined by exitsyscallfast). - trace.GoSysExit(lostP) - if lostP { - // We lost the P at some point, even though we got it back here. - // Trace that we're starting again, because there was a tracev2.GoSysBlock - // call somewhere in exitsyscallfast (indicating that this goroutine - // had blocked) and we're about to start running again. - trace.GoStart() - } - }) - } - // There's a cpu for us, so we can run. - gp.m.p.ptr().syscalltick++ - // We need to cas the status and scan before resuming... - casgstatus(gp, _Gsyscall, _Grunning) - if trace.ok() { - traceRelease(trace) - } + + // Increment the syscalltick for P, since we're exiting a syscall. + pp.syscalltick++ // Garbage collector isn't running (since we are), // so okay to clear syscallsp. gp.syscallsp = 0 gp.m.locks-- if gp.preempt { - // restore the preemption request in case we've cleared it in newstack + // Restore the preemption request in case we cleared it in newstack. gp.stackguard0 = stackPreempt } else { - // otherwise restore the real stackGuard, we've spoiled it in entersyscall/entersyscallblock + // Otherwise restore the real stackGuard, we clobbered it in entersyscall/entersyscallblock. gp.stackguard0 = gp.stack.lo + stackGuard } gp.throwsplit = false @@ -4860,14 +4910,13 @@ func exitsyscall() { // Scheduling of this goroutine is disabled. Gosched() } - return } - + // Slowest path: We couldn't get a P, so call into the scheduler. gp.m.locks-- // Call the scheduler. - mcall(exitsyscall0) + mcall(exitsyscallNoP) // Scheduler returned, so we're allowed to run now. 
// Delete the syscallsp information that we left for @@ -4880,78 +4929,38 @@ func exitsyscall() { gp.throwsplit = false } -//go:nosplit -func exitsyscallfast(oldp *p) bool { - // Freezetheworld sets stopwait but does not retake P's. - if sched.stopwait == freezeStopWait { - return false - } - - // Try to re-acquire the last P. - trace := traceAcquire() - if oldp != nil && oldp.status == _Psyscall && atomic.Cas(&oldp.status, _Psyscall, _Pidle) { - // There's a cpu for us, so we can run. - wirep(oldp) - exitsyscallfast_reacquired(trace) - if trace.ok() { - traceRelease(trace) - } - return true - } - if trace.ok() { - traceRelease(trace) - } - - // Try to get any other idle P. - if sched.pidle != 0 { - var ok bool - systemstack(func() { - ok = exitsyscallfast_pidle() - }) - if ok { - return true - } - } - return false -} - -// exitsyscallfast_reacquired is the exitsyscall path on which this G -// has successfully reacquired the P it was running on before the -// syscall. +// exitsyscallTryGetP is exitsyscall's attempt to get a P, if it is missing one. +// It returns the acquired P on success, or nil if no P could be acquired. // -//go:nosplit -func exitsyscallfast_reacquired(trace traceLocker) { - gp := getg() - if gp.m.syscalltick != gp.m.p.ptr().syscalltick { - if trace.ok() { - // The p was retaken and then enter into syscall again (since gp.m.syscalltick has changed). - // tracev2.GoSysBlock for this syscall was already emitted, - // but here we effectively retake the p from the new syscall running on the same p. - systemstack(func() { - // We're stealing the P. It's treated - // as if it temporarily stopped running. Then, start running. - trace.ProcSteal(gp.m.p.ptr(), true) - trace.ProcStart() - }) +// Must execute on the systemstack because exitsyscall is nosplit. +// +//go:systemstack +func exitsyscallTryGetP(oldp *p) *p { + // Try to steal our old P back. + if oldp != nil { + if thread, ok := setBlockOnExitSyscall(oldp); ok { + thread.takeP() + thread.resume() + sched.nGsyscallNoP.Add(-1) // takeP adds 1. + return oldp } - gp.m.p.ptr().syscalltick++ } -} -func exitsyscallfast_pidle() bool { - lock(&sched.lock) - pp, _ := pidleget(0) - if pp != nil && sched.sysmonwait.Load() { - sched.sysmonwait.Store(false) - notewakeup(&sched.sysmonnote) + // Try to get an idle P. + if sched.pidle != 0 { + lock(&sched.lock) + pp, _ := pidleget(0) + if pp != nil && sched.sysmonwait.Load() { + sched.sysmonwait.Store(false) + notewakeup(&sched.sysmonnote) + } + unlock(&sched.lock) + if pp != nil { + sched.nGsyscallNoP.Add(-1) + return pp + } } - unlock(&sched.lock) - if pp != nil { - sched.nGsyscallNoP.Add(-1) - acquirep(pp) - return true - } - return false + return nil } // exitsyscall slow path on g0. @@ -4960,11 +4969,10 @@ func exitsyscallfast_pidle() bool { // Called via mcall, so gp is the calling g from this M. // //go:nowritebarrierrec -func exitsyscall0(gp *g) { - var trace traceLocker +func exitsyscallNoP(gp *g) { traceExitingSyscall() - trace = traceAcquire() - casgstatus(gp, _Gsyscall, _Grunnable) + trace := traceAcquire() + casgstatus(gp, _Grunning, _Grunnable) traceExitedSyscall() if trace.ok() { // Write out syscall exit eagerly. @@ -6021,6 +6029,21 @@ func procresize(nprocs int32) *p { // //go:yeswritebarrierrec func acquirep(pp *p) { + // Do the work. + acquirepNoTrace(pp) + + // Emit the event. + trace := traceAcquire() + if trace.ok() { + trace.ProcStart() + traceRelease(trace) + } +} + +// Internals of acquirep, just skipping the trace events. 
+// +//go:yeswritebarrierrec +func acquirepNoTrace(pp *p) { // Do the part that isn't allowed to have write barriers. wirep(pp) // Have p; write barriers now allowed. // Perform deferred mcache flush before this P can allocate // from a potentially stale mcache. pp.mcache.prepareForSweep() - - trace := traceAcquire() - if trace.ok() { - trace.ProcStart() - traceRelease(trace) - } } // wirep is the first step of acquirep, which actually associates the @@ -6382,73 +6399,205 @@ func retake(now int64) uint32 { // temporarily drop the allpLock. Hence, we need to re-fetch // allp each time around the loop. for i := 0; i < len(allp); i++ { + // Quickly filter out non-running Ps. Running Ps are either + // in a syscall or are actually executing. Idle Ps don't + // need to be retaken. + // + // This is best-effort, so it's OK that it's racy. Our target + // is to retake Ps that have been running or in a syscall for + // a long time (milliseconds), so the state has plenty of time + // to stabilize. pp := allp[i] - if pp == nil { - // This can happen if procresize has grown + if pp == nil || atomic.Load(&pp.status) != _Prunning { + // pp can be nil if procresize has grown // allp but not yet created new Ps. continue } pd := &pp.sysmontick - s := pp.status sysretake := false - if s == _Prunning || s == _Psyscall { - // Preempt G if it's running on the same schedtick for - // too long. This could be from a single long-running - // goroutine or a sequence of goroutines run via - // runnext, which share a single schedtick time slice. - t := int64(pp.schedtick) - if int64(pd.schedtick) != t { - pd.schedtick = uint32(t) - pd.schedwhen = now - } else if pd.schedwhen+forcePreemptNS <= now { - preemptone(pp) - // In case of syscall, preemptone() doesn't - // work, because there is no M wired to P. - sysretake = true - } + + // Preempt G if it's running on the same schedtick for + // too long. This could be from a single long-running + // goroutine or a sequence of goroutines run via + // runnext, which share a single schedtick time slice. + schedt := int64(pp.schedtick) + if int64(pd.schedtick) != schedt { + pd.schedtick = uint32(schedt) + pd.schedwhen = now + } else if pd.schedwhen+forcePreemptNS <= now { + preemptone(pp) + // If pp is in a syscall, preemptone doesn't work. + // Neither the goroutine nor the thread can respond to + // a preemption request, because they're not in Go code, + // so we need to take the P ourselves. + sysretake = true } - if s == _Psyscall { - // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us). - t := int64(pp.syscalltick) - if !sysretake && int64(pd.syscalltick) != t { - pd.syscalltick = uint32(t) - pd.syscallwhen = now - continue - } - // On the one hand we don't want to retake Ps if there is no other work to do, - // but on the other hand we want to retake them eventually - // because they can prevent the sysmon thread from deep sleep. - if runqempty(pp) && sched.nmspinning.Load()+sched.npidle.Load() > 0 && pd.syscallwhen+10*1000*1000 > now { - continue - } - // Drop allpLock so we can take sched.lock. - unlock(&allpLock) - // Need to decrement number of idle locked M's - // (pretending that one more is running) before the CAS. - // Otherwise the M from which we retake can exit the syscall, - // increment nmidle and report deadlock. 
- incidlelocked(-1) - trace := traceAcquire() - if atomic.Cas(&pp.status, s, _Pidle) { - if trace.ok() { - trace.ProcSteal(pp, false) - traceRelease(trace) - } - sched.nGsyscallNoP.Add(1) - n++ - pp.syscalltick++ - handoffp(pp) - } else if trace.ok() { - traceRelease(trace) - } - incidlelocked(1) - lock(&allpLock) + + // Drop allpLock so we can take sched.lock. + unlock(&allpLock) + + // Need to decrement number of idle locked M's (pretending that + // one more is running) before we take the P and resume. + // Otherwise the M from which we retake can exit the syscall, + // increment nmidle and report deadlock. + // + // Can't call incidlelocked once we setBlockOnExitSyscall, due + // to a lock ordering violation between sched.lock and _Gscan. + incidlelocked(-1) + + // Try to prevent the P from continuing in the syscall, if it's in one at all. + thread, ok := setBlockOnExitSyscall(pp) + if !ok { + // Not in a syscall, or something changed out from under us. + goto done } + + // Retake the P if it's there for more than 1 sysmon tick (at least 20us). + if syst := int64(pp.syscalltick); !sysretake && int64(pd.syscalltick) != syst { + pd.syscalltick = uint32(syst) + pd.syscallwhen = now + thread.resume() + goto done + } + + // On the one hand we don't want to retake Ps if there is no other work to do, + // but on the other hand we want to retake them eventually + // because they can prevent the sysmon thread from deep sleep. + if runqempty(pp) && sched.nmspinning.Load()+sched.npidle.Load() > 0 && pd.syscallwhen+10*1000*1000 > now { + thread.resume() + goto done + } + + // Take the P. Note: because we have the scan bit, the goroutine + // is at worst stuck spinning in exitsyscall. + thread.takeP() + thread.resume() + n++ + + // Handoff the P for some other thread to run it. + handoffp(pp) + + // The P has been handed off to another thread, so risk of a false + // deadlock report while we hold onto it is gone. + done: + incidlelocked(1) + lock(&allpLock) } unlock(&allpLock) return uint32(n) } +// syscallingThread represents a thread in a system call that temporarily +// cannot advance out of the system call. +type syscallingThread struct { + gp *g + mp *m + pp *p + status uint32 +} + +// setBlockOnExitSyscall prevents pp's thread from advancing out of +// exitsyscall. On success, returns the g/m/p state of the thread +// and true. At that point, the caller owns the g/m/p links referenced, +// the goroutine is in _Gsyscall, and prevented from transitioning out +// of it. On failure, it returns false, and none of these guarantees are +// made. +// +// Callers must call resume on the resulting thread state once +// they're done with thread, otherwise it will remain blocked forever. +// +// This function races with state changes on pp, and thus may fail +// if pp is not in a system call, or exits a system call concurrently +// with this function. However, this function is safe to call without +// any additional synchronization. +func setBlockOnExitSyscall(pp *p) (syscallingThread, bool) { + if pp.status != _Prunning { + return syscallingThread{}, false + } + // Be very careful here, these reads are intentionally racy. + // Once we notice the G is in _Gsyscall, acquire its scan bit, + // and validate that it's still connected to the *same* M and P, + // we can actually get to work. Holding the scan bit will prevent + // the G from exiting the syscall. + // + // Our goal here is to interrupt long syscalls. 
If it turns out + // that we're wrong and the G switched to another syscall while + // we were trying to do this, that's completely fine. It's + // probably making more frequent syscalls and the typical + // preemption paths should be effective. + mp := pp.m.ptr() + if mp == nil { + // Nothing to do. + return syscallingThread{}, false + } + gp := mp.curg + if gp == nil { + // Nothing to do. + return syscallingThread{}, false + } + status := readgstatus(gp) &^ _Gscan + + // A goroutine is considered in a syscall, and may have a corresponding + // P, if it's in _Gsyscall *or* _Gdeadextra. In the latter case, it's an + // extra M goroutine. + if status != _Gsyscall && status != _Gdeadextra { + // Not in a syscall, nothing to do. + return syscallingThread{}, false + } + if !castogscanstatus(gp, status, status|_Gscan) { + // Not in _Gsyscall or _Gdeadextra anymore. Nothing to do. + return syscallingThread{}, false + } + if gp.m != mp || gp.m.p.ptr() != pp { + // This is not what we originally observed. Nothing to do. + casfrom_Gscanstatus(gp, status|_Gscan, status) + return syscallingThread{}, false + } + return syscallingThread{gp, mp, pp, status}, true +} + +// gcstopP unwires the P attached to the syscalling thread +// and moves it into the _Pgcstop state. +// +// The caller must be stopping the world. +func (s syscallingThread) gcstopP() { + assertLockHeld(&sched.lock) + + s.releaseP(_Pgcstop) + s.pp.gcStopTime = nanotime() + sched.stopwait-- +} + +// takeP unwires the P attached to the syscalling thread +// and moves it into the _Pidle state. +func (s syscallingThread) takeP() { + s.releaseP(_Pidle) +} + +// releaseP unwires the P from the syscalling thread, moving +// it to the provided state. Callers should prefer to use +// takeP and gcstopP. +func (s syscallingThread) releaseP(state uint32) { + if state != _Pidle && state != _Pgcstop { + throw("attempted to release P into a bad state") + } + trace := traceAcquire() + s.pp.m = 0 + s.mp.p = 0 + atomic.Store(&s.pp.status, state) + if trace.ok() { + trace.ProcSteal(s.pp) + traceRelease(trace) + } + sched.nGsyscallNoP.Add(1) + s.pp.syscalltick++ +} + +// resume allows a syscalling thread to advance beyond exitsyscall. +func (s syscallingThread) resume() { + casfrom_Gscanstatus(s.gp, s.status|_Gscan, s.status) +} + // Tell all goroutines that they have been preempted and they should stop. // This function is purely best-effort. It can fail to inform a goroutine if a // processor just started running it. @@ -6486,6 +6635,10 @@ func preemptone(pp *p) bool { if gp == nil || gp == mp.g0 { return false } + if readgstatus(gp)&^_Gscan == _Gsyscall { + // Don't bother trying to preempt a goroutine in a syscall. + return false + } gp.preempt = true diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 721fae0b28b..b346337d603 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -42,12 +42,16 @@ const ( // _Grunning means this goroutine may execute user code. The // stack is owned by this goroutine. It is not on a run queue. - // It is assigned an M and a P (g.m and g.m.p are valid). + // It is assigned an M (g.m is valid) and it usually has a P + // (g.m.p is valid), but there are small windows of time where + // it might not, namely upon entering and exiting _Gsyscall. _Grunning // 2 // _Gsyscall means this goroutine is executing a system call. // It is not executing user code. The stack is owned by this // goroutine. It is not on a run queue. It is assigned an M. + // It may have a P attached, but it does not own it. 
Code + // executing in this state must not touch g.m.p. _Gsyscall // 3 // _Gwaiting means this goroutine is blocked in the runtime. @@ -131,22 +135,15 @@ const ( // run user code or the scheduler. Only the M that owns this P // is allowed to change the P's status from _Prunning. The M // may transition the P to _Pidle (if it has no more work to - // do), _Psyscall (when entering a syscall), or _Pgcstop (to - // halt for the GC). The M may also hand ownership of the P - // off directly to another M (e.g., to schedule a locked G). + // do), or _Pgcstop (to halt for the GC). The M may also hand + // ownership of the P off directly to another M (for example, + // to schedule a locked G). _Prunning - // _Psyscall means a P is not running user code. It has - // affinity to an M in a syscall but is not owned by it and - // may be stolen by another M. This is similar to _Pidle but - // uses lightweight transitions and maintains M affinity. - // - // Leaving _Psyscall must be done with a CAS, either to steal - // or retake the P. Note that there's an ABA hazard: even if - // an M successfully CASes its original P back to _Prunning - // after a syscall, it must understand the P may have been - // used by another M in the interim. - _Psyscall + // _Psyscall_unused is a now-defunct state for a P. A P is + // identified as "in a system call" by looking at the goroutine's + // state. + _Psyscall_unused // _Pgcstop means a P is halted for STW and owned by the M // that stopped the world. The M that stopped the world @@ -620,18 +617,27 @@ type m struct { morebuf gobuf // gobuf arg to morestack divmod uint32 // div/mod denominator for arm - known to liblink (cmd/internal/obj/arm/obj5.go) - // Fields not known to debuggers. - procid uint64 // for debuggers, but offset not hard-coded - gsignal *g // signal-handling g - goSigStack gsignalStack // Go-allocated signal handling stack - sigmask sigset // storage for saved signal mask - tls [tlsSlots]uintptr // thread-local storage (for x86 extern register) - mstartfn func() - curg *g // current running goroutine - caughtsig guintptr // goroutine running during fatal signal - p puintptr // attached p for executing go code (nil if not executing go code) - nextp puintptr - oldp puintptr // the p that was attached before executing a syscall + // Fields whose offsets are not known to debuggers. + + procid uint64 // for debuggers, but offset not hard-coded + gsignal *g // signal-handling g + goSigStack gsignalStack // Go-allocated signal handling stack + sigmask sigset // storage for saved signal mask + tls [tlsSlots]uintptr // thread-local storage (for x86 extern register) + mstartfn func() + curg *g // current running goroutine + caughtsig guintptr // goroutine running during fatal signal + + // p is the currently attached P for executing Go code, nil if not executing user Go code. + // + // A non-nil p implies exclusive ownership of the P, unless curg is in _Gsyscall. + // In _Gsyscall the scheduler may mutate this instead. The point of synchronization + // is the _Gscan bit on curg's status. The scheduler must arrange to prevent curg + // from transitioning out of _Gsyscall if it intends to mutate p. + p puintptr + + nextp puintptr // The next P to install before executing. Implies exclusive ownership of this P. + oldp puintptr // The P that was attached before executing a syscall. 
id int64 mallocing int32 throwing throwType diff --git a/src/runtime/traceevent.go b/src/runtime/traceevent.go index b0bc4c017da..86512754761 100644 --- a/src/runtime/traceevent.go +++ b/src/runtime/traceevent.go @@ -44,6 +44,13 @@ func (tl traceLocker) eventWriter(goStatus tracev2.GoStatus, procStatus tracev2. if gp := tl.mp.curg; gp != nil && !gp.trace.statusWasTraced(tl.gen) && gp.trace.acquireStatus(tl.gen) { tl.writer().writeGoStatus(gp.goid, int64(tl.mp.procid), goStatus, gp.inMarkAssist, 0 /* no stack */).end() } + return tl.rawEventWriter() +} + +// rawEventWriter creates a new traceEventWriter without emitting any status events. +// +// It is the caller's responsibility to emit any status events, if necessary. +func (tl traceLocker) rawEventWriter() traceEventWriter { return traceEventWriter{tl} } diff --git a/src/runtime/traceruntime.go b/src/runtime/traceruntime.go index 7df6f68b706..d3f82f6fcee 100644 --- a/src/runtime/traceruntime.go +++ b/src/runtime/traceruntime.go @@ -534,10 +534,8 @@ func (tl traceLocker) GoSysExit(lostP bool) { // ProcSteal indicates that our current M stole a P from another M. // -// inSyscall indicates that we're stealing the P from a syscall context. -// // The caller must have ownership of pp. -func (tl traceLocker) ProcSteal(pp *p, inSyscall bool) { +func (tl traceLocker) ProcSteal(pp *p) { // Grab the M ID we stole from. mStolenFrom := pp.trace.mSyscallID pp.trace.mSyscallID = -1 @@ -561,7 +559,7 @@ func (tl traceLocker) ProcSteal(pp *p, inSyscall bool) { // In the latter, we're a goroutine in a syscall. goStatus := tracev2.GoRunning procStatus := tracev2.ProcRunning - if inSyscall { + if tl.mp.curg != nil && tl.mp.curg.syscallsp != 0 { goStatus = tracev2.GoSyscall procStatus = tracev2.ProcSyscallAbandoned } @@ -595,19 +593,27 @@ func (tl traceLocker) GoCreateSyscall(gp *g) { // N.B. We should never trace a status for this goroutine (which we're currently running on), // since we want this to appear like goroutine creation. gp.trace.setStatusTraced(tl.gen) - tl.eventWriter(tracev2.GoBad, tracev2.ProcBad).event(tracev2.EvGoCreateSyscall, traceArg(gp.goid)) + + // We might have a P left over on the thread from the last cgo callback, + // but in a syscall context, it is NOT ours. Act as if we do not have a P, + // and don't record a status. + tl.rawEventWriter().event(tracev2.EvGoCreateSyscall, traceArg(gp.goid)) } // GoDestroySyscall indicates that a goroutine has transitioned from GoSyscall to dead. // -// Must not have a P. -// // This occurs when Go code returns back to C. On pthread platforms it occurs only when // the C thread is destroyed. func (tl traceLocker) GoDestroySyscall() { - // N.B. If we trace a status here, we must never have a P, and we must be on a goroutine - // that is in the syscall state. - tl.eventWriter(tracev2.GoSyscall, tracev2.ProcBad).event(tracev2.EvGoDestroySyscall) + // Write the status for the goroutine if necessary. + if gp := tl.mp.curg; gp != nil && !gp.trace.statusWasTraced(tl.gen) && gp.trace.acquireStatus(tl.gen) { + tl.writer().writeGoStatus(gp.goid, int64(tl.mp.procid), tracev2.GoSyscall, false, 0 /* no stack */).end() + } + + // We might have a P left over on the thread from the last cgo callback, + // but in a syscall context, it is NOT ours. Act as if we do not have a P, + // and don't record a status. + tl.rawEventWriter().event(tracev2.EvGoDestroySyscall) } // To access runtime functions from runtime/trace. 
diff --git a/src/runtime/tracestatus.go b/src/runtime/tracestatus.go index f09bdfd3554..6343126faf0 100644 --- a/src/runtime/tracestatus.go +++ b/src/runtime/tracestatus.go @@ -62,15 +62,14 @@ func (w traceWriter) writeProcStatusForP(pp *p, inSTW bool) traceWriter { } case _Prunning: status = tracev2.ProcRunning - // There's a short window wherein the goroutine may have entered _Gsyscall - // but it still owns the P (it's not in _Psyscall yet). The goroutine entering - // _Gsyscall is the tracer's signal that the P its bound to is also in a syscall, - // so we need to emit a status that matches. See #64318. + // A P is considered to be in a syscall if its attached G is. Since we fully + // own the P, the goroutine isn't going to transition out from under us, so we + // can simply check whether the goroutine is in a syscall. This used to matter + // only in a small problematic window, but it is now the normal case since + // _Psyscall no longer exists. See #64318 for the history of this check. if w.mp.p.ptr() == pp && w.mp.curg != nil && readgstatus(w.mp.curg)&^_Gscan == _Gsyscall { status = tracev2.ProcSyscall } - case _Psyscall: - status = tracev2.ProcSyscall default: throw("attempt to trace invalid or unsupported P status") }
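The patch above relies throughout on a single synchronization idea: a thread that wants to take a P away from a goroutine sitting in a system call first acquires the goroutine's _Gscan bit (setBlockOnExitSyscall), which stalls the casgstatus in exitsyscall, then mutates the P wiring, and only afterwards drops the bit (resume). The sketch below is not part of the patch; it is a minimal, self-contained model of that handshake using ordinary sync/atomic, with made-up names and constants standing in for the runtime's internal status words.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// Toy status values standing in for _Grunning, _Gsyscall, and the _Gscan bit.
const (
	gRunning uint32 = 2
	gSyscall uint32 = 3
	gScan    uint32 = 0x1000
)

// gModel is a toy stand-in for a goroutine's status word and its P wiring.
type gModel struct {
	status atomic.Uint32
	hasP   atomic.Bool
}

// steal models setBlockOnExitSyscall + takeP + resume: grab the scan bit so
// the goroutine cannot finish exitsyscall, take its P, then let it continue.
func steal(g *gModel) bool {
	if !g.status.CompareAndSwap(gSyscall, gSyscall|gScan) {
		return false // not in a syscall (or it already left); nothing to do
	}
	g.hasP.Store(false)      // "takeP": the P now belongs to the stealer
	g.status.Store(gSyscall) // "resume": drop the scan bit
	return true
}

// exitSyscall models the goroutine leaving the syscall: its status CAS spins
// while the scan bit is held, so the stealer's work on the P is ordered
// before the goroutine checks whether it still has one.
func exitSyscall(g *gModel) {
	for !g.status.CompareAndSwap(gSyscall, gRunning) {
	}
	if g.hasP.Load() {
		fmt.Println("fast path: kept the P")
	} else {
		fmt.Println("slow path: lost the P, must find another or park")
	}
}

func main() {
	g := &gModel{}
	g.status.Store(gSyscall)
	g.hasP.Store(true)

	var wg sync.WaitGroup
	wg.Add(2)
	go func() { defer wg.Done(); steal(g) }()
	go func() { defer wg.Done(); exitSyscall(g) }()
	wg.Wait()
}

If the goroutine wins the race and reaches exitsyscall first, the stealer's CAS simply fails and it leaves the P alone, which mirrors how retake, forEachPInternal, and stopTheWorldWithSema treat a failed setBlockOnExitSyscall as "nothing to do".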