runtime: eliminate _Psyscall
This change eliminates the _Psyscall state by synchronizing syscalls on
the G status _Gsyscall instead. This removes an atomic Store and an
atomic CAS from the syscall path, which reduces syscall and cgo
overheads. It also simplifies the syscall paths quite a bit.
The one danger with this change is that we have a new combination of
states that was previously impossible. There are brief windows where
it's possible to observe a goroutine in _Grunning but without a P. This
change is careful to hide this detail from the execution tracer, but it
may have unexpected effects in the rest of the runtime, making this
change somewhat risky.
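
To make the mechanism concrete, below is a minimal, self-contained sketch (not the runtime's code) of the synchronization this change leans on: a goroutine in a syscall advertises that fact in its G status, another thread blocks the syscall exit by acquiring a scan bit on that status, takes the P, and then lets the goroutine resume. The names gRunning, gSyscall, gScan, stealP, and exitSyscall are illustrative stand-ins for _Grunning, _Gsyscall, the _Gscan bit, setBlockOnExitSyscall/takeP, and the runtime's exitsyscall.

// Minimal, self-contained model of the synchronization this change relies on.
// This is NOT the runtime's code: gRunning, gSyscall, gScan, stealP, and
// exitSyscall are illustrative stand-ins for _Grunning, _Gsyscall, the _Gscan
// bit, setBlockOnExitSyscall/takeP, and the runtime's exitsyscall.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

const (
	gRunning uint32 = 2      // stand-in for _Grunning
	gSyscall uint32 = 3      // stand-in for _Gsyscall
	gScan    uint32 = 0x1000 // stand-in for the _Gscan bit
)

type p struct{ id int }

type g struct {
	status atomic.Uint32
	p      atomic.Pointer[p] // the P wired to this goroutine's M, if any
}

// exitSyscall models the syscall-exit path: the goroutine may not leave
// _Gsyscall (or touch its P wiring) while another thread holds the scan bit.
func exitSyscall(gp *g) *p {
	for !gp.status.CompareAndSwap(gSyscall, gRunning) {
		// Spin: someone holds the scan bit and may be taking our P.
	}
	return gp.p.Swap(nil) // safe now: the wiring is ours again (nil if stolen)
}

// stealP models sysmon/STW taking the P from a goroutine blocked in a syscall:
// acquire the scan bit so the goroutine cannot exit the syscall, move the P,
// then release the scan bit to let it resume.
func stealP(gp *g) (*p, bool) {
	if !gp.status.CompareAndSwap(gSyscall, gSyscall|gScan) {
		return nil, false // not in a syscall, or someone else got here first
	}
	pp := gp.p.Swap(nil)      // we own gp's P wiring while the scan bit is held
	gp.status.Store(gSyscall) // resume: exitSyscall may proceed (without a P)
	return pp, pp != nil
}

func main() {
	gp := &g{}
	gp.status.Store(gSyscall)
	gp.p.Store(&p{id: 1})

	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		if pp, ok := stealP(gp); ok {
			fmt.Println("sysmon stole P", pp.id)
		}
	}()
	wg.Wait()

	fmt.Println("goroutine kept its P on syscall exit:", exitSyscall(gp) != nil)
}

This mirrors, roughly, how casgstatus waits while the _Gscan bit is held, which is what lets sysmon and the stop-the-world path safely take the P away from a goroutine sitting in a syscall.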
goos: linux
goarch: amd64
pkg: internal/runtime/cgobench
cpu: AMD EPYC 7B13
│ before.out │ after.out │
│ sec/op │ sec/op vs base │
CgoCall-64 43.69n ± 1% 35.83n ± 1% -17.99% (p=0.002 n=6)
CgoCallParallel-64 5.306n ± 1% 5.338n ± 1% ~ (p=0.132 n=6)
Change-Id: I4551afc1eea0c1b67a0b2dd26b0d49aa47bf1fb8
Reviewed-on: https://go-review.googlesource.com/c/go/+/646198
Auto-Submit: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
parent 5ef19c0d0c
commit 7244e9221f
12 changed files with 513 additions and 317 deletions
@@ -788,6 +788,9 @@ func TestPanicInlined(t *testing.T) {
// Test for issues #3934 and #20018.
// We want to delay exiting until a panic print is complete.
func TestPanicRace(t *testing.T) {
if race.Enabled {
t.Skip("racefini exits program before main can delay exiting")
}
testenv.MustHaveGoRun(t)
exe, err := buildTestProg(t, "testprog")
@@ -412,6 +412,14 @@ func dumpgs() {
forEachG(func(gp *g) {
status := readgstatus(gp) // The world is stopped so gp will not be in a scan state.
switch status {
case _Grunning:
// Dump goroutine if it's _Grunning only during a syscall. This is safe
// because the goroutine will just park without mutating its stack, since
// the world is stopped.
if gp.syscallsp != 0 {
dumpgoroutine(gp)
}
fallthrough
default:
print("runtime: unexpected G.status ", hex(status), "\n")
throw("dumpgs in STW - bad status")
@@ -818,9 +818,12 @@ func (a *schedStatsAggregate) compute() {
a.gCreated += p.goroutinesCreated
switch p.status {
case _Prunning:
a.gRunning++
case _Psyscall:
a.gNonGo++
if thread, ok := setBlockOnExitSyscall(p); ok {
thread.resume()
a.gNonGo++
} else {
a.gRunning++
}
case _Pgcstop:
// The world is stopping or stopped.
// This is fine. The results will be
@@ -847,7 +850,7 @@ func (a *schedStatsAggregate) compute() {
// Global run queue.
a.gRunnable += uint64(sched.runq.size)
// Account for Gs that are in _Gsyscall without a P in _Psyscall.
// Account for Gs that are in _Gsyscall without a P.
nGsyscallNoP := sched.nGsyscallNoP.Load()
// nGsyscallNoP can go negative during temporary races.
@@ -1535,7 +1535,18 @@ func doRecordGoroutineProfile(gp1 *g, pcbuf []uintptr) {
// everything else, we just don't record the stack in the profile.
return
}
if readgstatus(gp1) == _Grunning {
// Double-check that we didn't make a grave mistake. If the G is running then in
// general, we cannot safely read its stack.
//
// However, there is one case where it's OK. There's a small window of time in
// exitsyscall where a goroutine could be in _Grunning as it's exiting a syscall.
// This is OK because the goroutine will not exit the syscall until it passes through
// a call to tryRecordGoroutineProfile. (An explicit one on the fast path, an
// implicit one via the scheduler on the slow path.)
//
// This is also why it's safe to check syscallsp here. The syscall path mutates
// syscallsp only after passing through tryRecordGoroutineProfile.
if readgstatus(gp1) == _Grunning && gp1.syscallsp == 0 {
print("doRecordGoroutineProfile gp1=", gp1.goid, "\n")
throw("cannot read stack of running goroutine")
}
@@ -914,7 +914,7 @@ type cpuStats struct {
ScavengeTotalTime int64
IdleTime int64 // Time Ps spent in _Pidle.
UserTime int64 // Time Ps spent in _Prunning or _Psyscall that's not any of the above.
UserTime int64 // Time Ps spent in _Prunning that's not any of the above.
TotalTime int64 // GOMAXPROCS * (monotonic wall clock time elapsed)
}
@@ -976,7 +976,7 @@ func (s *cpuStats) accumulate(now int64, gcMarkPhase bool) {
// Compute userTime. We compute this indirectly as everything that's not the above.
//
// Since time spent in _Pgcstop is covered by gcPauseTime, and time spent in _Pidle
// is covered by idleTime, what we're left with is time spent in _Prunning and _Psyscall,
// is covered by idleTime, what we're left with is time spent in _Prunning,
// the latter of which is fine because the P will either go idle or get used for something
// else via sysmon. Meanwhile if we subtract GC time from whatever's left, we get non-GC
// _Prunning time. Note that this still leaves time spent in sweeping and in the scheduler,
@@ -191,8 +191,8 @@ type mOS struct {
// complete.
//
// TODO(austin): We may not need this if preemption were more
// tightly synchronized on the G/P status and preemption
// blocked transition into _Gsyscall/_Psyscall.
// tightly synchronized on the G status and preemption
// blocked transition into _Gsyscall.
preemptExtLock uint32
}
@@ -286,7 +286,7 @@ func resumeG(state suspendGState) {
//
//go:nosplit
func canPreemptM(mp *m) bool {
return mp.locks == 0 && mp.mallocing == 0 && mp.preemptoff == "" && mp.p.ptr().status == _Prunning
return mp.locks == 0 && mp.mallocing == 0 && mp.preemptoff == "" && mp.p.ptr().status == _Prunning && mp.curg != nil && readgstatus(mp.curg)&^_Gscan != _Gsyscall
}
//go:generate go run mkpreempt.go
@@ -1654,29 +1654,21 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
sched.stopwait = gomaxprocs
sched.gcwaiting.Store(true)
preemptall()
// stop current P
// Stop current P.
gp.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
gp.m.p.ptr().gcStopTime = start
sched.stopwait--
// try to retake all P's in Psyscall status
trace = traceAcquire()
// Try to retake all P's in syscalls.
for _, pp := range allp {
s := pp.status
if s == _Psyscall && atomic.Cas(&pp.status, s, _Pgcstop) {
if trace.ok() {
trace.ProcSteal(pp, false)
}
sched.nGsyscallNoP.Add(1)
pp.syscalltick++
pp.gcStopTime = nanotime()
sched.stopwait--
if thread, ok := setBlockOnExitSyscall(pp); ok {
thread.gcstopP()
thread.resume()
}
}
if trace.ok() {
traceRelease(trace)
}
// stop idle P's
// Stop idle Ps.
now := nanotime()
for {
pp, _ := pidleget(now)
@@ -1690,7 +1682,7 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
wait := sched.stopwait > 0
unlock(&sched.lock)
// wait for remaining P's to stop voluntarily
// Wait for remaining Ps to stop voluntarily.
if wait {
for {
// wait for 100us, then try to re-preempt in case of any races
@@ -2156,9 +2148,9 @@ func forEachPInternal(fn func(*p)) {
}
preemptall()
// Any P entering _Pidle or _Psyscall from now on will observe
// Any P entering _Pidle or a system call from now on will observe
// p.runSafePointFn == 1 and will call runSafePointFn when
// changing its status to _Pidle/_Psyscall.
// changing its status to _Pidle.
// Run safe point function for all idle Ps. sched.pidle will
// not change because we hold sched.lock.
@@ -2175,25 +2167,19 @@ func forEachPInternal(fn func(*p)) {
// Run fn for the current P.
fn(pp)
// Force Ps currently in _Psyscall into _Pidle and hand them
// Force Ps currently in a system call into _Pidle and hand them
// off to induce safe point function execution.
for _, p2 := range allp {
s := p2.status
// We need to be fine-grained about tracing here, since handoffp
// might call into the tracer, and the tracer is non-reentrant.
trace := traceAcquire()
if s == _Psyscall && p2.runSafePointFn == 1 && atomic.Cas(&p2.status, s, _Pidle) {
if trace.ok() {
// It's important that we traceRelease before we call handoffp, which may also traceAcquire.
trace.ProcSteal(p2, false)
traceRelease(trace)
}
sched.nGsyscallNoP.Add(1)
p2.syscalltick++
if atomic.Load(&p2.runSafePointFn) != 1 {
// Already ran it.
continue
}
if thread, ok := setBlockOnExitSyscall(p2); ok {
thread.takeP()
thread.resume()
handoffp(p2)
} else if trace.ok() {
traceRelease(trace)
}
}
@@ -2234,9 +2220,9 @@ func forEachPInternal(fn func(*p)) {
// }
//
// runSafePointFn must be checked on any transition in to _Pidle or
// _Psyscall to avoid a race where forEachP sees that the P is running
// just before the P goes into _Pidle/_Psyscall and neither forEachP
// nor the P run the safe-point function.
// when entering a system call to avoid a race where forEachP sees
// that the P is running just before the P goes into _Pidle/system call
// and neither forEachP nor the P run the safe-point function.
func runSafePointFn() {
p := getg().m.p.ptr()
// Resolve the race between forEachP running the safe-point
@@ -2571,7 +2557,7 @@ func oneNewExtraM() {
// So that the destructor would invoke dropm while the non-Go thread is exiting.
// This is much faster since it avoids expensive signal-related syscalls.
//
// This always runs without a P, so //go:nowritebarrierrec is required.
// This may run without a P, so //go:nowritebarrierrec is required.
//
// This may run with a different stack than was recorded in g0 (there is no
// call to callbackUpdateSystemStack prior to dropm), so this must be
@@ -4566,7 +4552,6 @@ func save(pc, sp, bp uintptr) {
//
//go:nosplit
func reentersyscall(pc, sp, bp uintptr) {
trace := traceAcquire()
gp := getg()
// Disable preemption because during this function g is in Gsyscall status,
@@ -4580,17 +4565,23 @@ func reentersyscall(pc, sp, bp uintptr) {
gp.stackguard0 = stackPreempt
gp.throwsplit = true
// Copy the syscalltick over so we can identify if the P got stolen later.
gp.m.syscalltick = gp.m.p.ptr().syscalltick
pp := gp.m.p.ptr()
if pp.runSafePointFn != 0 {
// runSafePointFn may stack split if run on this stack
systemstack(runSafePointFn)
}
gp.m.oldp.set(pp)
// Leave SP around for GC and traceback.
save(pc, sp, bp)
gp.syscallsp = sp
gp.syscallpc = pc
gp.syscallbp = bp
casgstatus(gp, _Grunning, _Gsyscall)
if staticLockRanking {
// When doing static lock ranking casgstatus can call
// systemstack which clobbers g.sched.
save(pc, sp, bp)
}
// Double-check sp and bp.
if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp {
systemstack(func() {
print("entersyscall inconsistent sp ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n")
@@ -4603,40 +4594,45 @@ func reentersyscall(pc, sp, bp uintptr) {
throw("entersyscall")
})
}
trace := traceAcquire()
if trace.ok() {
// Emit a trace event. Notably, actually emitting the event must happen before
// the casgstatus because it mutates the P, but the traceLocker must be held
// across the casgstatus so the transition is atomic with respect to the event.
systemstack(func() {
trace.GoSysCall()
traceRelease(trace)
})
// systemstack itself clobbers g.sched.{pc,sp} and we might
// need them later when the G is genuinely blocked in a
// syscall
// systemstack clobbered gp.sched, so restore it.
save(pc, sp, bp)
}
if sched.sysmonwait.Load() {
systemstack(entersyscall_sysmon)
save(pc, sp, bp)
}
if gp.m.p.ptr().runSafePointFn != 0 {
// runSafePointFn may stack split if run on this stack
systemstack(runSafePointFn)
save(pc, sp, bp)
}
gp.m.syscalltick = gp.m.p.ptr().syscalltick
pp := gp.m.p.ptr()
pp.m = 0
gp.m.oldp.set(pp)
gp.m.p = 0
atomic.Store(&pp.status, _Psyscall)
if sched.gcwaiting.Load() {
systemstack(entersyscall_gcwait)
// Optimization: If there's a pending STW, do the equivalent of
// entersyscallblock here at the last minute and immediately give
// away our P.
systemstack(func() {
entersyscallHandleGCWait(trace)
})
// systemstack clobbered gp.sched, so restore it.
save(pc, sp, bp)
}
// As soon as we switch to _Gsyscall, we are in danger of losing our P.
// We must not touch it after this point.
casgstatus(gp, _Grunning, _Gsyscall)
if staticLockRanking {
// casgstatus clobbers gp.sched via systemstack under staticLockRanking. Restore it.
save(pc, sp, bp)
}
if trace.ok() {
// N.B. We don't need to go on the systemstack because traceRelease is very
// carefully recursively nosplit. This also means we don't need to worry
// about clobbering gp.sched.
traceRelease(trace)
}
if sched.sysmonwait.Load() {
systemstack(entersyscallWakeSysmon)
// systemstack clobbered gp.sched, so restore it.
save(pc, sp, bp)
}
gp.m.locks--
}
@@ -4663,7 +4659,7 @@ func entersyscall() {
reentersyscall(sys.GetCallerPC(), sys.GetCallerSP(), fp)
}
func entersyscall_sysmon() {
func entersyscallWakeSysmon() {
lock(&sched.lock)
if sched.sysmonwait.Load() {
sched.sysmonwait.Store(false)
@@ -4672,25 +4668,19 @@ func entersyscall_sysmon() {
unlock(&sched.lock)
}
func entersyscall_gcwait() {
func entersyscallHandleGCWait(trace traceLocker) {
gp := getg()
pp := gp.m.oldp.ptr()
lock(&sched.lock)
trace := traceAcquire()
if sched.stopwait > 0 && atomic.Cas(&pp.status, _Psyscall, _Pgcstop) {
if sched.stopwait > 0 {
// Set our P to _Pgcstop so the STW can take it.
pp := gp.m.p.ptr()
pp.m = 0
gp.m.p = 0
atomic.Store(&pp.status, _Pgcstop)
if trace.ok() {
// This is a steal in the new tracer. While it's very likely
// that we were the ones to put this P into _Psyscall, between
// then and now it's totally possible it had been stolen and
// then put back into _Psyscall for us to acquire here. In such
// case ProcStop would be incorrect.
//
// TODO(mknyszek): Consider emitting a ProcStop instead when
// gp.m.syscalltick == pp.syscalltick, since then we know we never
// lost the P.
trace.ProcSteal(pp, true)
traceRelease(trace)
trace.ProcStop(pp)
}
sched.nGsyscallNoP.Add(1)
pp.gcStopTime = nanotime()
@@ -4698,8 +4688,6 @@ func entersyscall_gcwait() {
if sched.stopwait--; sched.stopwait == 0 {
notewakeup(&sched.stopnote)
}
} else if trace.ok() {
traceRelease(trace)
}
unlock(&sched.lock)
}
@@ -4744,6 +4732,22 @@ func entersyscallblock() {
throw("entersyscallblock")
})
}
// Once we switch to _Gsyscall, we can't safely touch
// our P anymore, so we need to hand it off beforehand.
// The tracer also needs to see the syscall before the P
// handoff, so the order here must be (1) trace,
// (2) handoff, (3) _Gsyscall switch.
trace := traceAcquire()
systemstack(func() {
if trace.ok() {
trace.GoSysCall()
}
handoffp(releasep())
})
// <--
// Caution: we're in a small window where we are in _Grunning without a P.
// -->
casgstatus(gp, _Grunning, _Gsyscall)
if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp {
systemstack(func() {
@@ -4757,8 +4761,11 @@ func entersyscallblock() {
throw("entersyscallblock")
})
}
systemstack(entersyscallblock_handoff)
if trace.ok() {
systemstack(func() {
traceRelease(trace)
})
}
// Resave for traceback during blocked call.
save(sys.GetCallerPC(), sys.GetCallerSP(), getcallerfp())
@@ -4766,15 +4773,6 @@ func entersyscallblock() {
gp.m.locks--
}
func entersyscallblock_handoff() {
trace := traceAcquire()
if trace.ok() {
trace.GoSysCall()
traceRelease(trace)
}
handoffp(releasep())
}
// The goroutine g exited its system call.
// Arrange for it to run on a cpu again.
// This is called only from the go syscall library, not
@@ -4802,13 +4800,87 @@ func exitsyscall() {
if sys.GetCallerSP() > gp.syscallsp {
throw("exitsyscall: syscall frame is no longer valid")
}
gp.waitsince = 0
if sched.stopwait == freezeStopWait {
// Wedge ourselves if there's an outstanding freezetheworld.
// If we transition to running, we might end up with our traceback
// being taken twice.
systemstack(func() {
lock(&deadlock)
lock(&deadlock)
})
}
// Optimistically assume we're going to keep running, and switch to running.
// Before this point, our P wiring is not ours. Once we get past this point,
// we can access our P if we have it, otherwise we lost it.
//
// N.B. Because we're transitioning to _Grunning here, traceAcquire doesn't
// need to be held ahead of time. We're effectively atomic with respect to
// the tracer because we're non-preemptible and in the runtime. It can't stop
// us to read a bad status.
casgstatus(gp, _Gsyscall, _Grunning)
// Caution: we're in a window where we may be in _Grunning without a P.
// Either we will grab a P or call exitsyscall0, where we'll switch to
// _Grunnable.
// Grab and clear our old P.
oldp := gp.m.oldp.ptr()
gp.m.oldp = 0
if exitsyscallfast(oldp) {
// When exitsyscallfast returns success, we have a P so can now use
// write barriers
gp.m.oldp.set(nil)
// Check if we still have a P, and if not, try to acquire an idle P.
pp := gp.m.p.ptr()
if pp != nil {
// Fast path: we still have our P. Just emit a syscall exit event.
if trace := traceAcquire(); trace.ok() {
systemstack(func() {
// The truth is we truly never lost the P, but syscalltick
// is used to indicate whether the P should be treated as
// lost anyway. For example, when syscalltick is trashed by
// dropm.
//
// TODO(mknyszek): Consider a more explicit mechanism for this.
// Then syscalltick doesn't need to be trashed, and can be used
// exclusively by sysmon for deciding when it's time to retake.
if pp.syscalltick == gp.m.syscalltick {
trace.GoSysExit(false)
} else {
// Since we need to pretend we lost the P, but nobody ever
// took it, we need a ProcSteal event to model the loss.
// Then, continue with everything else we'd do if we lost
// the P.
trace.ProcSteal(pp)
trace.ProcStart()
trace.GoSysExit(true)
trace.GoStart()
}
traceRelease(trace)
})
}
} else {
// Slow path: we lost our P. Try to get another one.
systemstack(func() {
// Try to get some other P.
if pp := exitsyscallTryGetP(oldp); pp != nil {
// Install the P.
acquirepNoTrace(pp)
// We're going to start running again, so emit all the relevant events.
if trace := traceAcquire(); trace.ok() {
trace.ProcStart()
trace.GoSysExit(true)
trace.GoStart()
traceRelease(trace)
}
}
})
pp = gp.m.p.ptr()
}
// If we have a P, clean up and exit.
if pp != nil {
if goroutineProfile.active {
// Make sure that gp has had its stack written out to the goroutine
// profile, exactly as it was when the goroutine profiler first
@@ -4817,41 +4889,19 @@ func exitsyscall() {
tryRecordGoroutineProfileWB(gp)
})
}
trace := traceAcquire()
if trace.ok() {
lostP := oldp != gp.m.p.ptr() || gp.m.syscalltick != gp.m.p.ptr().syscalltick
systemstack(func() {
// Write out syscall exit eagerly.
//
// It's important that we write this *after* we know whether we
// lost our P or not (determined by exitsyscallfast).
trace.GoSysExit(lostP)
if lostP {
// We lost the P at some point, even though we got it back here.
// Trace that we're starting again, because there was a tracev2.GoSysBlock
// call somewhere in exitsyscallfast (indicating that this goroutine
// had blocked) and we're about to start running again.
trace.GoStart()
}
})
}
// There's a cpu for us, so we can run.
gp.m.p.ptr().syscalltick++
// We need to cas the status and scan before resuming...
casgstatus(gp, _Gsyscall, _Grunning)
if trace.ok() {
traceRelease(trace)
}
// Increment the syscalltick for P, since we're exiting a syscall.
pp.syscalltick++
// Garbage collector isn't running (since we are),
// so okay to clear syscallsp.
gp.syscallsp = 0
gp.m.locks--
if gp.preempt {
// restore the preemption request in case we've cleared it in newstack
// Restore the preemption request in case we cleared it in newstack.
gp.stackguard0 = stackPreempt
} else {
// otherwise restore the real stackGuard, we've spoiled it in entersyscall/entersyscallblock
// Otherwise restore the real stackGuard, we clobbered it in entersyscall/entersyscallblock.
gp.stackguard0 = gp.stack.lo + stackGuard
}
gp.throwsplit = false
@@ -4860,14 +4910,13 @@ func exitsyscall() {
// Scheduling of this goroutine is disabled.
Gosched()
}
return
}
// Slowest path: We couldn't get a P, so call into the scheduler.
gp.m.locks--
// Call the scheduler.
mcall(exitsyscall0)
mcall(exitsyscallNoP)
// Scheduler returned, so we're allowed to run now.
// Delete the syscallsp information that we left for
@@ -4880,78 +4929,38 @@ func exitsyscall() {
gp.throwsplit = false
}
//go:nosplit
func exitsyscallfast(oldp *p) bool {
// Freezetheworld sets stopwait but does not retake P's.
if sched.stopwait == freezeStopWait {
return false
}
// Try to re-acquire the last P.
trace := traceAcquire()
if oldp != nil && oldp.status == _Psyscall && atomic.Cas(&oldp.status, _Psyscall, _Pidle) {
// There's a cpu for us, so we can run.
wirep(oldp)
exitsyscallfast_reacquired(trace)
if trace.ok() {
traceRelease(trace)
}
return true
}
if trace.ok() {
traceRelease(trace)
}
// Try to get any other idle P.
if sched.pidle != 0 {
var ok bool
systemstack(func() {
ok = exitsyscallfast_pidle()
})
if ok {
return true
}
}
return false
}
// exitsyscallfast_reacquired is the exitsyscall path on which this G
// has successfully reacquired the P it was running on before the
// syscall.
// exitsyscall's attempt to try to get any P, if it's missing one.
// Returns true on success.
//
//go:nosplit
func exitsyscallfast_reacquired(trace traceLocker) {
gp := getg()
if gp.m.syscalltick != gp.m.p.ptr().syscalltick {
if trace.ok() {
// The p was retaken and then enter into syscall again (since gp.m.syscalltick has changed).
// tracev2.GoSysBlock for this syscall was already emitted,
// but here we effectively retake the p from the new syscall running on the same p.
systemstack(func() {
// We're stealing the P. It's treated
// as if it temporarily stopped running. Then, start running.
trace.ProcSteal(gp.m.p.ptr(), true)
trace.ProcStart()
})
// Must execute on the systemstack because exitsyscall is nosplit.
//
//go:systemstack
func exitsyscallTryGetP(oldp *p) *p {
// Try to steal our old P back.
if oldp != nil {
if thread, ok := setBlockOnExitSyscall(oldp); ok {
thread.takeP()
thread.resume()
sched.nGsyscallNoP.Add(-1) // takeP adds 1.
return oldp
}
gp.m.p.ptr().syscalltick++
}
}
func exitsyscallfast_pidle() bool {
lock(&sched.lock)
pp, _ := pidleget(0)
if pp != nil && sched.sysmonwait.Load() {
sched.sysmonwait.Store(false)
notewakeup(&sched.sysmonnote)
// Try to get an idle P.
if sched.pidle != 0 {
lock(&sched.lock)
pp, _ := pidleget(0)
if pp != nil && sched.sysmonwait.Load() {
sched.sysmonwait.Store(false)
notewakeup(&sched.sysmonnote)
}
unlock(&sched.lock)
if pp != nil {
sched.nGsyscallNoP.Add(-1)
return pp
}
}
unlock(&sched.lock)
if pp != nil {
sched.nGsyscallNoP.Add(-1)
acquirep(pp)
return true
}
return false
return nil
}
// exitsyscall slow path on g0.
@@ -4960,11 +4969,10 @@ func exitsyscallfast_pidle() bool {
// Called via mcall, so gp is the calling g from this M.
//
//go:nowritebarrierrec
func exitsyscall0(gp *g) {
var trace traceLocker
func exitsyscallNoP(gp *g) {
traceExitingSyscall()
trace = traceAcquire()
casgstatus(gp, _Gsyscall, _Grunnable)
trace := traceAcquire()
casgstatus(gp, _Grunning, _Grunnable)
traceExitedSyscall()
if trace.ok() {
// Write out syscall exit eagerly.
@@ -6021,6 +6029,21 @@ func procresize(nprocs int32) *p {
//
//go:yeswritebarrierrec
func acquirep(pp *p) {
// Do the work.
acquirepNoTrace(pp)
// Emit the event.
trace := traceAcquire()
if trace.ok() {
trace.ProcStart()
traceRelease(trace)
}
}
// Internals of acquirep, just skipping the trace events.
//
//go:yeswritebarrierrec
func acquirepNoTrace(pp *p) {
// Do the part that isn't allowed to have write barriers.
wirep(pp)
@@ -6029,12 +6052,6 @@ func acquirep(pp *p) {
// Perform deferred mcache flush before this P can allocate
// from a potentially stale mcache.
pp.mcache.prepareForSweep()
trace := traceAcquire()
if trace.ok() {
trace.ProcStart()
traceRelease(trace)
}
}
// wirep is the first step of acquirep, which actually associates the
@@ -6382,73 +6399,205 @@ func retake(now int64) uint32 {
// temporarily drop the allpLock. Hence, we need to re-fetch
// allp each time around the loop.
for i := 0; i < len(allp); i++ {
// Quickly filter out non-running Ps. Running Ps are either
// in a syscall or are actually executing. Idle Ps don't
// need to be retaken.
//
// This is best-effort, so it's OK that it's racy. Our target
// is to retake Ps that have been running or in a syscall for
// a long time (milliseconds), so the state has plenty of time
// to stabilize.
pp := allp[i]
if pp == nil {
// This can happen if procresize has grown
if pp == nil || atomic.Load(&pp.status) != _Prunning {
// pp can be nil if procresize has grown
// allp but not yet created new Ps.
continue
}
pd := &pp.sysmontick
s := pp.status
sysretake := false
if s == _Prunning || s == _Psyscall {
// Preempt G if it's running on the same schedtick for
// too long. This could be from a single long-running
// goroutine or a sequence of goroutines run via
// runnext, which share a single schedtick time slice.
t := int64(pp.schedtick)
if int64(pd.schedtick) != t {
pd.schedtick = uint32(t)
pd.schedwhen = now
} else if pd.schedwhen+forcePreemptNS <= now {
preemptone(pp)
// In case of syscall, preemptone() doesn't
// work, because there is no M wired to P.
sysretake = true
}
// Preempt G if it's running on the same schedtick for
// too long. This could be from a single long-running
// goroutine or a sequence of goroutines run via
// runnext, which share a single schedtick time slice.
schedt := int64(pp.schedtick)
if int64(pd.schedtick) != schedt {
pd.schedtick = uint32(schedt)
pd.schedwhen = now
} else if pd.schedwhen+forcePreemptNS <= now {
preemptone(pp)
// If pp is in a syscall, preemptone doesn't work.
// Neither the goroutine nor the thread can respond to a
// preemption request because they're not in Go code,
// so we need to take the P ourselves.
sysretake = true
}
if s == _Psyscall {
// Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
t := int64(pp.syscalltick)
if !sysretake && int64(pd.syscalltick) != t {
pd.syscalltick = uint32(t)
pd.syscallwhen = now
continue
}
// On the one hand we don't want to retake Ps if there is no other work to do,
// but on the other hand we want to retake them eventually
// because they can prevent the sysmon thread from deep sleep.
if runqempty(pp) && sched.nmspinning.Load()+sched.npidle.Load() > 0 && pd.syscallwhen+10*1000*1000 > now {
continue
}
// Drop allpLock so we can take sched.lock.
unlock(&allpLock)
// Need to decrement number of idle locked M's
// (pretending that one more is running) before the CAS.
// Otherwise the M from which we retake can exit the syscall,
// increment nmidle and report deadlock.
incidlelocked(-1)
trace := traceAcquire()
if atomic.Cas(&pp.status, s, _Pidle) {
if trace.ok() {
trace.ProcSteal(pp, false)
traceRelease(trace)
}
sched.nGsyscallNoP.Add(1)
n++
pp.syscalltick++
handoffp(pp)
} else if trace.ok() {
traceRelease(trace)
}
incidlelocked(1)
lock(&allpLock)
// Drop allpLock so we can take sched.lock.
unlock(&allpLock)
// Need to decrement number of idle locked M's (pretending that
// one more is running) before we take the P and resume.
// Otherwise the M from which we retake can exit the syscall,
// increment nmidle and report deadlock.
//
// Can't call incidlelocked once we setBlockOnExitSyscall, due
// to a lock ordering violation between sched.lock and _Gscan.
incidlelocked(-1)
// Try to prevent the P from continuing in the syscall, if it's in one at all.
thread, ok := setBlockOnExitSyscall(pp)
if !ok {
// Not in a syscall, or something changed out from under us.
goto done
}
// Retake the P if it's there for more than 1 sysmon tick (at least 20us).
if syst := int64(pp.syscalltick); !sysretake && int64(pd.syscalltick) != syst {
pd.syscalltick = uint32(syst)
pd.syscallwhen = now
thread.resume()
goto done
}
// On the one hand we don't want to retake Ps if there is no other work to do,
// but on the other hand we want to retake them eventually
// because they can prevent the sysmon thread from deep sleep.
if runqempty(pp) && sched.nmspinning.Load()+sched.npidle.Load() > 0 && pd.syscallwhen+10*1000*1000 > now {
thread.resume()
goto done
}
// Take the P. Note: because we have the scan bit, the goroutine
// is at worst stuck spinning in exitsyscall.
thread.takeP()
thread.resume()
n++
// Handoff the P for some other thread to run it.
handoffp(pp)
// The P has been handed off to another thread, so risk of a false
// deadlock report while we hold onto it is gone.
done:
incidlelocked(1)
lock(&allpLock)
}
unlock(&allpLock)
return uint32(n)
}
// syscallingThread represents a thread in a system call that temporarily
// cannot advance out of the system call.
type syscallingThread struct {
gp *g
mp *m
pp *p
status uint32
}
// setBlockOnExitSyscall prevents pp's thread from advancing out of
// exitsyscall. On success, returns the g/m/p state of the thread
// and true. At that point, the caller owns the g/m/p links referenced,
// the goroutine is in _Gsyscall, and prevented from transitioning out
// of it. On failure, it returns false, and none of these guarantees are
// made.
//
// Callers must call resume on the resulting thread state once
// they're done with thread, otherwise it will remain blocked forever.
//
// This function races with state changes on pp, and thus may fail
// if pp is not in a system call, or exits a system call concurrently
// with this function. However, this function is safe to call without
// any additional synchronization.
func setBlockOnExitSyscall(pp *p) (syscallingThread, bool) {
if pp.status != _Prunning {
return syscallingThread{}, false
}
// Be very careful here, these reads are intentionally racy.
// Once we notice the G is in _Gsyscall, acquire its scan bit,
// and validate that it's still connected to the *same* M and P,
// we can actually get to work. Holding the scan bit will prevent
// the G from exiting the syscall.
//
// Our goal here is to interrupt long syscalls. If it turns out
// that we're wrong and the G switched to another syscall while
// we were trying to do this, that's completely fine. It's
// probably making more frequent syscalls and the typical
// preemption paths should be effective.
mp := pp.m.ptr()
if mp == nil {
// Nothing to do.
return syscallingThread{}, false
}
gp := mp.curg
if gp == nil {
// Nothing to do.
return syscallingThread{}, false
}
status := readgstatus(gp) &^ _Gscan
// A goroutine is considered in a syscall, and may have a corresponding
// P, if it's in _Gsyscall *or* _Gdeadextra. In the latter case, it's an
// extra M goroutine.
if status != _Gsyscall && status != _Gdeadextra {
// Not in a syscall, nothing to do.
return syscallingThread{}, false
}
if !castogscanstatus(gp, status, status|_Gscan) {
// Not in _Gsyscall or _Gdeadextra anymore. Nothing to do.
return syscallingThread{}, false
}
if gp.m != mp || gp.m.p.ptr() != pp {
// This is not what we originally observed. Nothing to do.
casfrom_Gscanstatus(gp, status|_Gscan, status)
return syscallingThread{}, false
}
return syscallingThread{gp, mp, pp, status}, true
}
// gcstopP unwires the P attached to the syscalling thread
// and moves it into the _Pgcstop state.
//
// The caller must be stopping the world.
func (s syscallingThread) gcstopP() {
assertLockHeld(&sched.lock)
s.releaseP(_Pgcstop)
s.pp.gcStopTime = nanotime()
sched.stopwait--
}
// takeP unwires the P attached to the syscalling thread
// and moves it into the _Pidle state.
func (s syscallingThread) takeP() {
s.releaseP(_Pidle)
}
// releaseP unwires the P from the syscalling thread, moving
// it to the provided state. Callers should prefer to use
// takeP and gcstopP.
func (s syscallingThread) releaseP(state uint32) {
if state != _Pidle && state != _Pgcstop {
throw("attempted to release P into a bad state")
}
trace := traceAcquire()
s.pp.m = 0
s.mp.p = 0
atomic.Store(&s.pp.status, state)
if trace.ok() {
trace.ProcSteal(s.pp)
traceRelease(trace)
}
sched.nGsyscallNoP.Add(1)
s.pp.syscalltick++
}
// resume allows a syscalling thread to advance beyond exitsyscall.
func (s syscallingThread) resume() {
casfrom_Gscanstatus(s.gp, s.status|_Gscan, s.status)
}
// Tell all goroutines that they have been preempted and they should stop.
// This function is purely best-effort. It can fail to inform a goroutine if a
// processor just started running it.
@@ -6486,6 +6635,10 @@ func preemptone(pp *p) bool {
if gp == nil || gp == mp.g0 {
return false
}
if readgstatus(gp)&^_Gscan == _Gsyscall {
// Don't bother trying to preempt a goroutine in a syscall.
return false
}
gp.preempt = true
@@ -42,12 +42,16 @@ const (
// _Grunning means this goroutine may execute user code. The
// stack is owned by this goroutine. It is not on a run queue.
// It is assigned an M and a P (g.m and g.m.p are valid).
// It is assigned an M (g.m is valid) and it usually has a P
// (g.m.p is valid), but there are small windows of time where
// it might not, namely upon entering and exiting _Gsyscall.
_Grunning // 2
// _Gsyscall means this goroutine is executing a system call.
// It is not executing user code. The stack is owned by this
// goroutine. It is not on a run queue. It is assigned an M.
// It may have a P attached, but it does not own it. Code
// executing in this state must not touch g.m.p.
_Gsyscall // 3
// _Gwaiting means this goroutine is blocked in the runtime.
@@ -131,22 +135,15 @@ const (
// run user code or the scheduler. Only the M that owns this P
// is allowed to change the P's status from _Prunning. The M
// may transition the P to _Pidle (if it has no more work to
// do), _Psyscall (when entering a syscall), or _Pgcstop (to
// halt for the GC). The M may also hand ownership of the P
// off directly to another M (e.g., to schedule a locked G).
// do), or _Pgcstop (to halt for the GC). The M may also hand
// ownership of the P off directly to another M (for example,
// to schedule a locked G).
_Prunning
// _Psyscall means a P is not running user code. It has
// affinity to an M in a syscall but is not owned by it and
// may be stolen by another M. This is similar to _Pidle but
// uses lightweight transitions and maintains M affinity.
//
// Leaving _Psyscall must be done with a CAS, either to steal
// or retake the P. Note that there's an ABA hazard: even if
// an M successfully CASes its original P back to _Prunning
// after a syscall, it must understand the P may have been
// used by another M in the interim.
_Psyscall
// _Psyscall_unused is a now-defunct state for a P. A P is
// identified as "in a system call" by looking at the goroutine's
// state.
_Psyscall_unused
// _Pgcstop means a P is halted for STW and owned by the M
// that stopped the world. The M that stopped the world
@@ -620,18 +617,27 @@ type m struct {
morebuf gobuf // gobuf arg to morestack
divmod uint32 // div/mod denominator for arm - known to liblink (cmd/internal/obj/arm/obj5.go)
// Fields not known to debuggers.
procid uint64 // for debuggers, but offset not hard-coded
gsignal *g // signal-handling g
goSigStack gsignalStack // Go-allocated signal handling stack
sigmask sigset // storage for saved signal mask
tls [tlsSlots]uintptr // thread-local storage (for x86 extern register)
mstartfn func()
curg *g // current running goroutine
caughtsig guintptr // goroutine running during fatal signal
p puintptr // attached p for executing go code (nil if not executing go code)
nextp puintptr
oldp puintptr // the p that was attached before executing a syscall
// Fields whose offsets are not known to debuggers.
procid uint64 // for debuggers, but offset not hard-coded
gsignal *g // signal-handling g
goSigStack gsignalStack // Go-allocated signal handling stack
sigmask sigset // storage for saved signal mask
tls [tlsSlots]uintptr // thread-local storage (for x86 extern register)
mstartfn func()
curg *g // current running goroutine
caughtsig guintptr // goroutine running during fatal signal
// p is the currently attached P for executing Go code, nil if not executing user Go code.
//
// A non-nil p implies exclusive ownership of the P, unless curg is in _Gsyscall.
// In _Gsyscall the scheduler may mutate this instead. The point of synchronization
// is the _Gscan bit on curg's status. The scheduler must arrange to prevent curg
// from transitioning out of _Gsyscall if it intends to mutate p.
p puintptr
nextp puintptr // The next P to install before executing. Implies exclusive ownership of this P.
oldp puintptr // The P that was attached before executing a syscall.
id int64
mallocing int32
throwing throwType
@@ -44,6 +44,13 @@ func (tl traceLocker) eventWriter(goStatus tracev2.GoStatus, procStatus tracev2.
if gp := tl.mp.curg; gp != nil && !gp.trace.statusWasTraced(tl.gen) && gp.trace.acquireStatus(tl.gen) {
tl.writer().writeGoStatus(gp.goid, int64(tl.mp.procid), goStatus, gp.inMarkAssist, 0 /* no stack */).end()
}
return tl.rawEventWriter()
}
// rawEventWriter creates a new traceEventWriter without emitting any status events.
//
// It is the caller's responsibility to emit any status events, if necessary.
func (tl traceLocker) rawEventWriter() traceEventWriter {
return traceEventWriter{tl}
}
@@ -534,10 +534,8 @@ func (tl traceLocker) GoSysExit(lostP bool) {
// ProcSteal indicates that our current M stole a P from another M.
//
// inSyscall indicates that we're stealing the P from a syscall context.
//
// The caller must have ownership of pp.
func (tl traceLocker) ProcSteal(pp *p, inSyscall bool) {
func (tl traceLocker) ProcSteal(pp *p) {
// Grab the M ID we stole from.
mStolenFrom := pp.trace.mSyscallID
pp.trace.mSyscallID = -1
@@ -561,7 +559,7 @@ func (tl traceLocker) ProcSteal(pp *p, inSyscall bool) {
// In the latter, we're a goroutine in a syscall.
goStatus := tracev2.GoRunning
procStatus := tracev2.ProcRunning
if inSyscall {
if tl.mp.curg != nil && tl.mp.curg.syscallsp != 0 {
goStatus = tracev2.GoSyscall
procStatus = tracev2.ProcSyscallAbandoned
}
@@ -595,19 +593,27 @@ func (tl traceLocker) GoCreateSyscall(gp *g) {
// N.B. We should never trace a status for this goroutine (which we're currently running on),
// since we want this to appear like goroutine creation.
gp.trace.setStatusTraced(tl.gen)
tl.eventWriter(tracev2.GoBad, tracev2.ProcBad).event(tracev2.EvGoCreateSyscall, traceArg(gp.goid))
// We might have a P left over on the thread from the last cgo callback,
// but in a syscall context, it is NOT ours. Act as if we do not have a P,
// and don't record a status.
tl.rawEventWriter().event(tracev2.EvGoCreateSyscall, traceArg(gp.goid))
}
// GoDestroySyscall indicates that a goroutine has transitioned from GoSyscall to dead.
//
// Must not have a P.
//
// This occurs when Go code returns back to C. On pthread platforms it occurs only when
// the C thread is destroyed.
func (tl traceLocker) GoDestroySyscall() {
// N.B. If we trace a status here, we must never have a P, and we must be on a goroutine
// that is in the syscall state.
tl.eventWriter(tracev2.GoSyscall, tracev2.ProcBad).event(tracev2.EvGoDestroySyscall)
// Write the status for the goroutine if necessary.
if gp := tl.mp.curg; gp != nil && !gp.trace.statusWasTraced(tl.gen) && gp.trace.acquireStatus(tl.gen) {
tl.writer().writeGoStatus(gp.goid, int64(tl.mp.procid), tracev2.GoSyscall, false, 0 /* no stack */).end()
}
// We might have a P left over on the thread from the last cgo callback,
// but in a syscall context, it is NOT ours. Act as if we do not have a P,
// and don't record a status.
tl.rawEventWriter().event(tracev2.EvGoDestroySyscall)
}
// To access runtime functions from runtime/trace.
@@ -62,15 +62,14 @@ func (w traceWriter) writeProcStatusForP(pp *p, inSTW bool) traceWriter {
}
case _Prunning:
status = tracev2.ProcRunning
// There's a short window wherein the goroutine may have entered _Gsyscall
// but it still owns the P (it's not in _Psyscall yet). The goroutine entering
// _Gsyscall is the tracer's signal that the P its bound to is also in a syscall,
// so we need to emit a status that matches. See #64318.
// A P is considered to be in a syscall if its attached G is. Since we fully
// own P, then the goroutine isn't going to transition and we can trivially
// check if the goroutine is in a syscall. This used to be just a small problematic
// window, but this is now the default since _Psyscall no longer exists. See #64318
// for the history on why it was needed while _Psyscall still existed.
if w.mp.p.ptr() == pp && w.mp.curg != nil && readgstatus(w.mp.curg)&^_Gscan == _Gsyscall {
status = tracev2.ProcSyscall
}
case _Psyscall:
status = tracev2.ProcSyscall
default:
throw("attempt to trace invalid or unsupported P status")
}