// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import (
	"internal/abi"
	"internal/chacha8rand"
	"internal/goarch"
	"internal/runtime/atomic"
	"internal/runtime/sys"
	"unsafe"
)

// defined constants
const (
	// G status
	//
	// Beyond indicating the general state of a G, the G status
	// acts like a lock on the goroutine's stack (and hence its
	// ability to execute user code).
	//
	// If you add to this list, add to the list
	// of "okay during garbage collection" status
	// in mgcmark.go too.
	//
	// TODO(austin): The _Gscan bit could be much lighter-weight.
	// For example, we could choose not to run _Gscanrunnable
	// goroutines found in the run queue, rather than CAS-looping
	// until they become _Grunnable. And transitions like
	// _Gscanwaiting -> _Gscanrunnable are actually okay because
	// they don't affect stack ownership.

	// _Gidle means this goroutine was just allocated and has not
	// yet been initialized.
	_Gidle = iota // 0

	// _Grunnable means this goroutine is on a run queue. It is
	// not currently executing user code. The stack is not owned.
	_Grunnable // 1

	// _Grunning means this goroutine may execute user code. The
	// stack is owned by this goroutine. It is not on a run queue.
	// It is assigned an M (g.m is valid) and it usually has a P
	// (g.m.p is valid), but there are small windows of time where
	// it might not, namely upon entering and exiting _Gsyscall.
	_Grunning // 2

	// _Gsyscall means this goroutine is executing a system call.
	// It is not executing user code. The stack is owned by this
	// goroutine. It is not on a run queue. It is assigned an M.
	// It may have a P attached, but it does not own it. Code
	// executing in this state must not touch g.m.p.
	_Gsyscall // 3

	// _Gwaiting means this goroutine is blocked in the runtime.
	// It is not executing user code. It is not on a run queue,
	// but should be recorded somewhere (e.g., a channel wait
	// queue) so it can be ready()d when necessary. The stack is
	// not owned *except* that a channel operation may read or
	// write parts of the stack under the appropriate channel
	// lock. Otherwise, it is not safe to access the stack after a
	// goroutine enters _Gwaiting (e.g., it may get moved).
	_Gwaiting // 4

	// _Gmoribund_unused is currently unused, but hardcoded in gdb
	// scripts.
	_Gmoribund_unused // 5

	// _Gdead means this goroutine is currently unused. It may be
	// just exited, on a free list, or just being initialized. It
	// is not executing user code. It may or may not have a stack
	// allocated. The G and its stack (if any) are owned by the M
	// that is exiting the G or that obtained the G from the free
	// list.
	_Gdead // 6

	// _Genqueue_unused is currently unused.
	_Genqueue_unused // 7

	// _Gcopystack means this goroutine's stack is being moved. It
	// is not executing user code and is not on a run queue. The
	// stack is owned by the goroutine that put it in _Gcopystack.
	_Gcopystack // 8

	// _Gpreempted means this goroutine stopped itself for a
	// suspendG preemption. It is like _Gwaiting, but nothing is
	// yet responsible for ready()ing it. Some suspendG must CAS
	// the status to _Gwaiting to take responsibility for
	// ready()ing this G.
	_Gpreempted // 9

	// _Gleaked represents a leaked goroutine caught by the GC.
	_Gleaked // 10

	// _Gdeadextra is a _Gdead goroutine that's attached to an extra M
	// used for cgo callbacks.
	_Gdeadextra // 11

	// _Gscan combined with one of the above states other than
	// _Grunning indicates that GC is scanning the stack. The
	// goroutine is not executing user code and the stack is owned
	// by the goroutine that set the _Gscan bit.
	//
	// _Gscanrunning is different: it is used to briefly block
	// state transitions while GC signals the G to scan its own
	// stack. This is otherwise like _Grunning.
	//
	// atomicstatus&~Gscan gives the state the goroutine will
	// return to when the scan completes.
	_Gscan          = 0x1000
	_Gscanrunnable  = _Gscan + _Grunnable  // 0x1001
	_Gscanrunning   = _Gscan + _Grunning   // 0x1002
	_Gscansyscall   = _Gscan + _Gsyscall   // 0x1003
	_Gscanwaiting   = _Gscan + _Gwaiting   // 0x1004
	_Gscanpreempted = _Gscan + _Gpreempted // 0x1009
	_Gscanleaked    = _Gscan + _Gleaked    // 0x100a
	_Gscandeadextra = _Gscan + _Gdeadextra // 0x100b
)
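
// Illustrative sketch, not code from this file: because the scan bit simply
// adds 0x1000 to a base state, status checks typically mask it off first
// (readgstatus is the runtime's atomic accessor for gp.atomicstatus):
//
//	status := readgstatus(gp)
//	base := status &^ _Gscan // state the G returns to when the scan completes
//	scanning := status&_Gscan != 0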

const (
	// P status

	// _Pidle means a P is not being used to run user code or the
	// scheduler. Typically, it's on the idle P list and available
	// to the scheduler, but it may just be transitioning between
	// other states.
	//
	// The P is owned by the idle list or by whatever is
	// transitioning its state. Its run queue is empty.
	_Pidle = iota

	// _Prunning means a P is owned by an M and is being used to
	// run user code or the scheduler. Only the M that owns this P
	// is allowed to change the P's status from _Prunning. The M
	// may transition the P to _Pidle (if it has no more work to
	// do), or _Pgcstop (to halt for the GC). The M may also hand
	// ownership of the P off directly to another M (for example,
	// to schedule a locked G).
	_Prunning

	// _Psyscall_unused is a now-defunct state for a P. A P is
	// identified as "in a system call" by looking at the goroutine's
	// state.
	_Psyscall_unused

	// _Pgcstop means a P is halted for STW and owned by the M
	// that stopped the world. The M that stopped the world
	// continues to use its P, even in _Pgcstop. Transitioning
	// from _Prunning to _Pgcstop causes an M to release its P and
	// park.
	//
	// The P retains its run queue and startTheWorld will restart
	// the scheduler on Ps with non-empty run queues.
	_Pgcstop

	// _Pdead means a P is no longer used (GOMAXPROCS shrank). We
	// reuse Ps if GOMAXPROCS increases. A dead P is mostly
	// stripped of its resources, though a few things remain
	// (e.g., trace buffers).
	_Pdead
)
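
// Illustrative sketch, not code from this file: an M binds a P with acquirep
// and gives it up with releasep (both are real runtime helpers); these are
// the usual _Pidle <-> _Prunning transitions described above:
//
//	acquirep(pp)    // _Pidle -> _Prunning; pp is now wired to this M
//	// ... run user code or the scheduler ...
//	pp = releasep() // _Prunning -> _Pidle; returns the P that was released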

// Mutual exclusion locks. In the uncontended case,
// as fast as spin locks (just a few user-level instructions),
// but on the contention path they sleep in the kernel.
// A zeroed Mutex is unlocked (no need to initialize each lock).
// Initialization is helpful for static lock ranking, but not required.
type mutex struct {
	// Empty struct if lock ranking is disabled, otherwise includes the lock rank
	lockRankStruct
	// Futex-based impl treats it as uint32 key,
	// while sema-based impl as M* waitm.
	// Used to be a union, but unions break precise GC.
	key uintptr
}
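
// Usage sketch, not code from this file (the lock name is hypothetical): the
// zero value is ready to use, lockInit only matters for the static
// lock-ranking build, and lock/unlock are the runtime-internal entry points:
//
//	var testlock mutex // zero value is unlocked
//	lockInit(&testlock, lockRankLeafRank)
//	lock(&testlock)
//	// ... critical section ...
//	unlock(&testlock)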

type funcval struct {
	fn uintptr
	// variable-size, fn-specific data here
}

type iface struct {
	tab  *itab
	data unsafe.Pointer
}

type eface struct {
	_type *_type
	data  unsafe.Pointer
}

func efaceOf(ep *any) *eface {
	return (*eface)(unsafe.Pointer(ep))
}
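
// Illustrative sketch, not code from this file (variable names are
// hypothetical): efaceOf lets the runtime look at the two words of an empty
// interface directly:
//
//	var x any = 42
//	e := efaceOf(&x)
//	// e._type describes the dynamic type (int here) and e.data points at the
//	// stored value; a nil interface has e._type == nil.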

// The guintptr, muintptr, and puintptr are all used to bypass write barriers.
// It is particularly important to avoid write barriers when the current P has
// been released, because the GC thinks the world is stopped, and an
// unexpected write barrier would not be synchronized with the GC,
// which can lead to a half-executed write barrier that has marked the object
// but not queued it. If the GC skips the object and completes before the
// queuing can occur, it will incorrectly free the object.
//
// We tried using special assignment functions invoked only when not
// holding a running P, but then some updates to a particular memory
// word went through write barriers and some did not. This breaks the
// write barrier shadow checking mode, and it is also scary: better to have
// a word that is completely ignored by the GC than to have one for which
// only a few updates are ignored.
//
// Gs and Ps are always reachable via true pointers in the
// allgs and allp lists or (during allocation before they reach those lists)
// from stack variables.
//
// Ms are always reachable via true pointers either from allm or
// freem. Unlike Gs and Ps we do free Ms, so it's important that
// nothing ever hold an muintptr across a safe point.

// A guintptr holds a goroutine pointer, but typed as a uintptr
// to bypass write barriers. It is used in the Gobuf goroutine state
// and in scheduling lists that are manipulated without a P.
//
// The Gobuf.g goroutine pointer is almost always updated by assembly code.
// In one of the few places it is updated by Go code - func save - it must be
// treated as a uintptr to avoid a write barrier being emitted at a bad time.
// Instead of figuring out how to emit the write barriers missing in the
// assembly manipulation, we change the type of the field to uintptr,
// so that it does not require write barriers at all.
//
// Goroutine structs are published in the allg list and never freed.
// That will keep the goroutine structs from being collected.
// There is never a time that Gobuf.g's contain the only references
// to a goroutine: the publishing of the goroutine in allg comes first.
// Goroutine pointers are also kept in non-GC-visible places like TLS,
// so I can't see them ever moving. If we did want to start moving data
// in the GC, we'd need to allocate the goroutine structs from an
// alternate arena. Using guintptr doesn't make that problem any worse.
// Note that pollDesc.rg, pollDesc.wg also store g in uintptr form,
// so they would need to be updated too if g's start moving.
type guintptr uintptr

//go:nosplit
func (gp guintptr) ptr() *g { return (*g)(unsafe.Pointer(gp)) }

//go:nosplit
func (gp *guintptr) set(g *g) { *gp = guintptr(unsafe.Pointer(g)) }

//go:nosplit
func (gp *guintptr) cas(old, new guintptr) bool {
	return atomic.Casuintptr((*uintptr)(unsafe.Pointer(gp)), uintptr(old), uintptr(new))
}

//go:nosplit
func (gp *g) guintptr() guintptr {
	return guintptr(unsafe.Pointer(gp))
}

// setGNoWB performs *gp = new without a write barrier.
// For times when it's impractical to use a guintptr.
//
//go:nosplit
//go:nowritebarrier
func setGNoWB(gp **g, new *g) {
	(*guintptr)(unsafe.Pointer(gp)).set(new)
}
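
// Illustrative sketch, not code from this file (the variable is hypothetical):
// scheduling code stores g references as guintptr so the stores carry no write
// barrier, and converts back with ptr() at the point of use:
//
//	var next guintptr
//	next.set(gp) // plain store, no write barrier
//	if gp2 := next.ptr(); gp2 != nil {
//		// hand gp2 to the scheduler ...
//	}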

type puintptr uintptr

//go:nosplit
func (pp puintptr) ptr() *p { return (*p)(unsafe.Pointer(pp)) }

//go:nosplit
func (pp *puintptr) set(p *p) { *pp = puintptr(unsafe.Pointer(p)) }

// muintptr is a *m that is not tracked by the garbage collector.
//
// Because we do free Ms, there are some additional constraints on
// muintptrs:
//
//  1. Never hold an muintptr locally across a safe point.
//
//  2. Any muintptr in the heap must be owned by the M itself so it can
//     ensure it is not in use when the last true *m is released.
type muintptr uintptr

//go:nosplit
func (mp muintptr) ptr() *m { return (*m)(unsafe.Pointer(mp)) }

//go:nosplit
func (mp *muintptr) set(m *m) { *mp = muintptr(unsafe.Pointer(m)) }
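
// Illustrative sketch of constraint 1 above, not code from this file
// (acquirem/releasem are real runtime helpers, the variables are
// hypothetical): keep the window in which a muintptr is held free of safe
// points, for example by disabling preemption and avoiding blocking calls:
//
//	mp := acquirem() // disables preemption on the current M
//	var mu muintptr
//	mu.set(mp)
//	// ... short, non-blocking use of mu.ptr() ...
//	releasem(mp)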

// setMNoWB performs *mp = new without a write barrier.
// For times when it's impractical to use an muintptr.
//
//go:nosplit
//go:nowritebarrier
func setMNoWB(mp **m, new *m) {
	(*muintptr)(unsafe.Pointer(mp)).set(new)
}

type gobuf struct {
	// The offsets of sp, pc, and g are known to (hard-coded in) libmach.
	//
	// ctxt is unusual with respect to GC: it may be a
	// heap-allocated funcval, so GC needs to track it, but it
	// needs to be set and cleared from assembly, where it's
	// difficult to have write barriers. However, ctxt is really a
	// saved, live register, and we only ever exchange it between
	// the real register and the gobuf. Hence, we treat it as a
	// root during stack scanning, which means assembly that saves
	// and restores it doesn't need write barriers. It's still
	// typed as a pointer so that any other writes from Go get
	// write barriers.
	sp   uintptr
	pc   uintptr
	g    guintptr
	ctxt unsafe.Pointer
	lr   uintptr
	bp   uintptr // for framepointer-enabled architectures
}
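
// For orientation only, not a specification of the scheduler: g.sched is a
// gobuf, and the scheduler resumes a goroutine by restoring that saved
// register state via the assembly routine gogo, roughly:
//
//	gogo(&gp.sched) // loads sp, pc, g, and ctxt, then jumps to pc
//
// The matching saves happen in assembly (e.g. morestack) and in func save.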
|
|
|
|
|
|
2025-10-02 11:57:58 +00:00
|
|
|

// maybeTraceablePtr is a special pointer that is conditionally trackable
// by the GC. It consists of an address as a uintptr (vu) and a pointer
// to a data element (vp).
//
// maybeTraceablePtr values can be in one of three states:
// 1. Unset: vu == 0 && vp == nil
// 2. Untracked: vu != 0 && vp == nil
// 3. Tracked: vu != 0 && vp != nil
//
// Do not set fields manually. Use methods instead.
// Extend this type with additional methods if needed.
type maybeTraceablePtr struct {
	vp unsafe.Pointer // For liveness only.
	vu uintptr        // Source of truth.
}

// setUntraceable unsets the pointer but preserves the address.
// This is used to hide the pointer from the GC.
//
//go:nosplit
func (p *maybeTraceablePtr) setUntraceable() {
	p.vp = nil
}

// setTraceable resets the pointer to the stored address.
// This is used to make the pointer visible to the GC.
//
//go:nosplit
func (p *maybeTraceablePtr) setTraceable() {
	p.vp = unsafe.Pointer(p.vu)
}

// set sets the pointer to the data element and updates the address.
//
//go:nosplit
func (p *maybeTraceablePtr) set(v unsafe.Pointer) {
	p.vp = v
	p.vu = uintptr(v)
}

// get retrieves the pointer to the data element.
//
//go:nosplit
func (p *maybeTraceablePtr) get() unsafe.Pointer {
	return unsafe.Pointer(p.vu)
}

// uintptr returns the uintptr address of the pointer.
//
//go:nosplit
func (p *maybeTraceablePtr) uintptr() uintptr {
	return p.vu
}
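
// maybeTraceablePtrStateSketch is an illustrative sketch (not used by the
// runtime): it walks a maybeTraceablePtr through the three states documented
// above using only the methods defined in this file. The pointer argument
// stands in for any data element.
func maybeTraceablePtrStateSketch(v unsafe.Pointer) {
	var p maybeTraceablePtr // 1. Unset: vu == 0 && vp == nil
	p.set(v)                // 3. Tracked: vu != 0 && vp != nil
	p.setUntraceable()      // 2. Untracked: address kept in vu, pointer hidden from the GC
	p.setTraceable()        // 3. Tracked again: vp restored from the saved address
	_ = p.get()             // the data pointer, rebuilt from vu
	_ = p.uintptr()         // the raw address
}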

// maybeTraceableChan extends conditionally trackable pointers (maybeTraceablePtr)
// to track hchan pointers.
//
// Do not set fields manually. Use methods instead.
type maybeTraceableChan struct {
	maybeTraceablePtr
}

//go:nosplit
func (p *maybeTraceableChan) set(c *hchan) {
	p.maybeTraceablePtr.set(unsafe.Pointer(c))
}

//go:nosplit
func (p *maybeTraceableChan) get() *hchan {
	return (*hchan)(p.maybeTraceablePtr.get())
}

// sudog (pseudo-g) represents a g in a wait list, such as for sending/receiving
// on a channel.
//
// sudog is necessary because the g ↔ synchronization object relation
// is many-to-many. A g can be on many wait lists, so there may be
// many sudogs for one g; and many gs may be waiting on the same
// synchronization object, so there may be many sudogs for one object.
//
// sudogs are allocated from a special pool. Use acquireSudog and
// releaseSudog to allocate and free them.
type sudog struct {
	// The following fields are protected by the hchan.lock of the
	// channel this sudog is blocking on. shrinkstack depends on
	// this for sudogs involved in channel ops.

	g *g

	next *sudog
	prev *sudog
	elem maybeTraceablePtr // data element (may point to stack)

	// The following fields are never accessed concurrently.
	// For channels, waitlink is only accessed by g.
	// For semaphores, all fields (including the ones above)
	// are only accessed when holding a semaRoot lock.

	acquiretime int64
	releasetime int64
	ticket      uint32

	// isSelect indicates g is participating in a select, so
	// g.selectDone must be CAS'd to win the wake-up race.
	isSelect bool

	// success indicates whether communication over channel c
	// succeeded. It is true if the goroutine was awoken because a
	// value was delivered over channel c, and false if awoken
	// because c was closed.
	success bool

	// waiters is a count of semaRoot waiting list other than head of list,
	// clamped to a uint16 to fit in unused space.
	// Only meaningful at the head of the list.
	// (If we wanted to be overly clever, we could store a high 16 bits
	// in the second entry in the list.)
	waiters uint16

	parent   *sudog             // semaRoot binary tree
	waitlink *sudog             // g.waiting list or semaRoot
	waittail *sudog             // semaRoot
	c        maybeTraceableChan // channel
}
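
// sudogWaitListSketch is an illustrative sketch (not used by the runtime): it
// shows the many-to-many relation described above by walking the sudogs that
// a single g is currently waiting on, which are linked through waitlink.
func sudogWaitListSketch(gp *g) int {
	n := 0
	for sg := gp.waiting; sg != nil; sg = sg.waitlink {
		// Each sudog names both sides of the relation: the waiting g and
		// the synchronization object (here a channel, possibly unset for
		// semaphores) it is enqueued on.
		_ = sg.g
		_ = sg.c.get()
		n++
	}
	return n
}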

type libcall struct {
	fn   uintptr
	n    uintptr // number of parameters
	args uintptr // parameters
	r1   uintptr // return values
	r2   uintptr
	err  uintptr // error number
}
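
// libcallSketch is an illustrative sketch (not used by the runtime): a libcall
// describes one call into a system library. The caller records the function
// address, the parameter count, and the address of a packed parameter block;
// the OS-specific call helper stores the results into r1, r2, and err.
func libcallSketch(fn uintptr, params *uintptr, n int) libcall {
	return libcall{
		fn:   fn,
		n:    uintptr(n),
		args: uintptr(unsafe.Pointer(params)),
	}
}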

// Stack describes a Go execution stack.
// The bounds of the stack are exactly [lo, hi),
// with no implicit data structures on either side.
type stack struct {
	lo uintptr
	hi uintptr
}
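
// stackContainsSketch is an illustrative sketch (not used by the runtime):
// because the bounds are exactly [lo, hi), the usable size is hi-lo and a
// stack pointer sp lies on this stack iff lo <= sp && sp < hi.
func stackContainsSketch(s stack, sp uintptr) bool {
	return s.lo <= sp && sp < s.hi
}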

// heldLockInfo gives info on a held lock and the rank of that lock
type heldLockInfo struct {
	lockAddr uintptr
	rank     lockRank
}

type g struct {
	// Stack parameters.
	// stack describes the actual stack memory: [stack.lo, stack.hi).
	// stackguard0 is the stack pointer compared in the Go stack growth prologue.
	// It is stack.lo+StackGuard normally, but can be StackPreempt to trigger a preemption.
	// stackguard1 is the stack pointer compared in the //go:systemstack stack growth prologue.
	// It is stack.lo+StackGuard on g0 and gsignal stacks.
	// It is ~0 on other goroutine stacks, to trigger a call to morestackc (and crash).
	stack       stack   // offset known to runtime/cgo
	stackguard0 uintptr // offset known to liblink
	stackguard1 uintptr // offset known to liblink

	_panic    *_panic // innermost panic - offset known to liblink
	_defer    *_defer // innermost defer
	m         *m      // current m; offset known to arm liblink
	sched     gobuf
	syscallsp uintptr // if status==Gsyscall, syscallsp = sched.sp to use during gc
	syscallpc uintptr // if status==Gsyscall, syscallpc = sched.pc to use during gc
	syscallbp uintptr // if status==Gsyscall, syscallbp = sched.bp to use in fpTraceback
	stktopsp uintptr // expected sp at top of stack, to check in traceback

	// param is a generic pointer parameter field used to pass
	// values in particular contexts where other storage for the
	// parameter would be difficult to find. It is currently used
	// in four ways:
	// 1. When a channel operation wakes up a blocked goroutine, it sets param to
	//    point to the sudog of the completed blocking operation.
	// 2. By gcAssistAlloc1 to signal back to its caller that the goroutine completed
	//    the GC cycle. It is unsafe to do so in any other way, because the goroutine's
	//    stack may have moved in the meantime.
	// 3. By debugCallWrap to pass parameters to a new goroutine because allocating a
	//    closure in the runtime is forbidden.
	// 4. When a panic is recovered and control returns to the respective frame,
	//    param may point to a savedOpenDeferState.
	param        unsafe.Pointer
	atomicstatus atomic.Uint32
	stackLock uint32 // sigprof/scang lock; TODO: fold in to atomicstatus
	goid      uint64
	schedlink  guintptr
	waitsince  int64      // approx time when the g became blocked
	waitreason waitReason // if status==Gwaiting

	preempt       bool // preemption signal, duplicates stackguard0 = stackpreempt
	preemptStop   bool // transition to _Gpreempted on preemption; otherwise, just deschedule
	preemptShrink bool // shrink stack at synchronous safe point

	// asyncSafePoint is set if g is stopped at an asynchronous
	// safe point. This means there are frames on the stack
	// without precise pointer information.
	asyncSafePoint bool

	paniconfault bool // panic (instead of crash) on unexpected fault address
	gcscandone   bool // g has scanned stack; protected by _Gscan bit in status
	throwsplit   bool // must not split stack

	// activeStackChans indicates that there are unlocked channels
	// pointing into this goroutine's stack. If true, stack
	// copying needs to acquire channel locks to protect these
	// areas of the stack.
	activeStackChans bool

	// parkingOnChan indicates that the goroutine is about to
	// park on a chansend or chanrecv. Used to signal an unsafe point
	// for stack shrinking.
	parkingOnChan atomic.Bool

	// inMarkAssist indicates whether the goroutine is in mark assist.
	// Used by the execution tracer.
	inMarkAssist bool

	coroexit bool // argument to coroswitch_m

	raceignore      int8  // ignore race detection events
	nocgocallback   bool  // whether to disable callbacks from C
	tracking        bool  // whether we're tracking this G for sched latency statistics
	trackingSeq     uint8 // used to decide whether to track this G
	trackingStamp   int64 // timestamp of when the G last started being tracked
	runnableTime    int64 // the amount of time spent runnable, cleared when running, only used when tracking
	lockedm         muintptr
	fipsIndicator   uint8
	syncSafePoint   bool // set if g is stopped at a synchronous safe point.
	runningCleanups atomic.Bool
	sig        uint32
	writebuf   []byte
	sigcode0   uintptr
	sigcode1   uintptr
	sigpc      uintptr
	parentGoid uint64          // goid of goroutine that created this goroutine
	gopc       uintptr         // pc of go statement that created this goroutine
	ancestors  *[]ancestorInfo // ancestor information of the goroutine(s) that created this goroutine (only used if debug.tracebackancestors)
	startpc    uintptr         // pc of goroutine function
	racectx    uintptr
	waiting    *sudog         // sudog structures this g is waiting on (that have a valid elem ptr); in lock order
	cgoCtxt    []uintptr      // cgo traceback context
	labels     unsafe.Pointer // profiler labels
	timer      *timer         // cached timer for time.Sleep
	sleepWhen  int64          // when to sleep until
	selectDone atomic.Uint32  // are we participating in a select and did someone win the race?

	// goroutineProfiled indicates the status of this goroutine's stack for the
	// current in-progress goroutine profile
	goroutineProfiled goroutineProfileStateHolder

	coroarg *coro // argument during coroutine transfers
	bubble  *synctestBubble

	// xRegs stores the extended register state if this G has been
	// asynchronously preempted.
	xRegs xRegPerG

	// Per-G tracer state.
	trace gTraceState

	// Per-G GC state

	// gcAssistBytes is this G's GC assist credit in terms of
	// bytes allocated. If this is positive, then the G has credit
	// to allocate gcAssistBytes bytes without assisting. If this
	// is negative, then the G must correct this by performing
	// scan work. We track this in bytes to make it fast to update
	// and check for debt in the malloc hot path. The assist ratio
	// determines how this corresponds to scan work debt.
	// See the sketch after this struct for how the balance is read.
	gcAssistBytes int64

	// valgrindStackID is used to track what memory is used for stacks when a program is
	// built with the "valgrind" build tag; otherwise it is unused.
	valgrindStackID uintptr
}
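
// gcAssistDebtSketch is an illustrative sketch (not used by the runtime) of
// how gcAssistBytes is meant to be read, per the comment on that field above:
// an allocation spends credit, and a negative balance means the goroutine is
// in debt and must perform scan work (the real check lives in the malloc hot
// path and calls into the GC assist machinery).
func gcAssistDebtSketch(gp *g, size uintptr) bool {
	gp.gcAssistBytes -= int64(size)
	return gp.gcAssistBytes < 0 // in debt: must assist before allocating more
}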

// gTrackingPeriod is the number of transitions out of _Grunning between
// latency tracking runs.
const gTrackingPeriod = 8
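
// shouldTrackSketch is an illustrative sketch (not used by the runtime; the
// real sampling happens in the scheduler's status transitions): roughly one
// out of every gTrackingPeriod transitions out of _Grunning is sampled for
// scheduling-latency statistics, driven by the g's trackingSeq counter.
func shouldTrackSketch(gp *g) bool {
	gp.trackingSeq++
	return gp.trackingSeq%gTrackingPeriod == 0
}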

const (
	// tlsSlots is the number of pointer-sized slots reserved for TLS on some platforms,
	// like Windows.
	tlsSlots = 6
	tlsSize  = tlsSlots * goarch.PtrSize
)
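
// For example, on a 64-bit platform goarch.PtrSize is 8, so the reserved TLS
// block is tlsSlots*8 = 48 bytes.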

// Values for m.freeWait.
const (
	freeMStack = 0 // M done, free stack and reference.
	freeMRef   = 1 // M done, free reference.
	freeMWait  = 2 // M still in use.
)
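
// freeWaitSketch is an illustrative sketch (not used by the runtime) of how
// the values above are intended to be used: freeWait holds freeMWait while
// the M is still in use, and is set to freeMStack or freeMRef once the M is
// done, telling the code that frees the M whether the g0 stack must be freed
// along with the reference.
func freeWaitSketch(mp *m, freeStack bool) {
	if freeStack {
		mp.freeWait.Store(freeMStack) // M done; free both the stack and the reference.
	} else {
		mp.freeWait.Store(freeMRef) // M done; free only the reference.
	}
}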

type m struct {
	g0      *g     // goroutine with scheduling stack
	morebuf gobuf  // gobuf arg to morestack
	divmod  uint32 // div/mod denominator for arm - known to liblink (cmd/internal/obj/arm/obj5.go)

	// Fields whose offsets are not known to debuggers.

	procid     uint64            // for debuggers, but offset not hard-coded
	gsignal    *g                // signal-handling g
	goSigStack gsignalStack      // Go-allocated signal handling stack
	sigmask    sigset            // storage for saved signal mask
	tls        [tlsSlots]uintptr // thread-local storage (for x86 extern register)
	mstartfn   func()
	curg       *g       // current running goroutine
	caughtsig  guintptr // goroutine running during fatal signal

	// p is the currently attached P for executing Go code, nil if not executing user Go code.
	//
	// A non-nil p implies exclusive ownership of the P, unless curg is in _Gsyscall.
	// In _Gsyscall the scheduler may mutate this instead. The point of synchronization
	// is the _Gscan bit on curg's status. The scheduler must arrange to prevent curg
	// from transitioning out of _Gsyscall if it intends to mutate p.
	p puintptr

	nextp puintptr // The next P to install before executing. Implies exclusive ownership of this P.
	oldp  puintptr // The P that was attached before executing a syscall.

	id          int64
	mallocing   int32
	throwing    throwType
	preemptoff  string // if != "", keep curg running on this m
	locks       int32
	dying       int32
	profilehz   int32
	spinning    bool // m is out of work and is actively looking for work
	blocked     bool // m is blocked on a note
	newSigstack bool // minit on C thread called sigaltstack
	printlock   int8
	incgo       bool // m is executing a cgo call
	isextra     bool // m is an extra m
	isExtraInC  bool // m is an extra m that does not have any Go frames
	isExtraInSig    bool          // m is an extra m in a signal handler
	freeWait        atomic.Uint32 // Whether it is safe to free g0 and delete m (one of freeMRef, freeMStack, freeMWait)
	needextram      bool
	g0StackAccurate bool // whether the g0 stack has accurate bounds
	traceback       uint8
	allpSnapshot    []*p // Snapshot of allp for use after dropping P in findRunnable, nil otherwise.
	ncgocall      uint64        // number of cgo calls in total
	ncgo          int32         // number of cgo calls currently in progress
	cgoCallersUse atomic.Uint32 // if non-zero, cgoCallers in use temporarily
	cgoCallers    *cgoCallers   // cgo traceback if crashing in cgo call
	park          note
	alllink       *m // on allm
	schedlink     muintptr
	idleNode      listNodeManual
lockedg guintptr
createstack [32]uintptr // stack that created this thread, it's used for StackRecord.Stack0, so it must align with it.
lockedExt uint32 // tracking for external LockOSThread
lockedInt uint32 // tracking for internal lockOSThread
mWaitList mWaitList // list of runtime lock waiters
mLockProfile mLockProfile // fields relating to runtime.lock contention
profStack []uintptr // used for memory/block/mutex stack traces

// wait* are used to carry arguments from gopark into park_m, because
// there's no stack to put them on. That is their sole purpose.
waitunlockf func(*g, unsafe.Pointer) bool
waitlock unsafe.Pointer
waitTraceSkip int
waitTraceBlockReason traceBlockReason

syscalltick uint32
freelink *m // on sched.freem
trace mTraceState

// These are here to avoid using the G stack so the stack can move during the call.
libcallpc uintptr // for cpu profiler
libcallsp uintptr
libcallg guintptr
winsyscall winlibcall // stores syscall parameters on windows

vdsoSP uintptr // SP for traceback while in VDSO call (0 if not in call)
vdsoPC uintptr // PC for traceback while in VDSO call

// preemptGen counts the number of completed preemption
// signals. This is used to detect when a preemption is
// requested, but fails.
preemptGen atomic.Uint32

// Whether this is a pending preemption signal on this M.
signalPending atomic.Uint32

// pcvalue lookup cache
pcvalueCache pcvalueCache

dlogPerM

mOS
runtime: static lock ranking for the runtime (enabled by GOEXPERIMENT)
I took some of the infrastructure from Austin's lock logging CR
https://go-review.googlesource.com/c/go/+/192704 (with deadlock
detection from the logs), and developed a setup to give static lock
ranking for runtime locks.
Static lock ranking establishes a documented total ordering among locks,
and then reports an error if the total order is violated. This can
happen if a deadlock happens (by acquiring a sequence of locks in
different orders), or if just one side of a possible deadlock happens.
Lock ordering deadlocks cannot happen as long as the lock ordering is
followed.
Along the way, I found a deadlock involving the new timer code, which Ian fixed
via https://go-review.googlesource.com/c/go/+/207348, as well as two other
potential deadlocks.
See the constants at the top of runtime/lockrank.go to show the static
lock ranking that I ended up with, along with some comments. This is
great documentation of the current intended lock ordering when acquiring
multiple locks in the runtime.
I also added an array lockPartialOrder[] which shows and enforces the
current partial ordering among locks (which is embedded within the total
ordering). This is more specific about the dependencies among locks.
I don't try to check the ranking within a lock class with multiple locks
that can be acquired at the same time (i.e. check the ranking when
multiple hchan locks are acquired).
Currently, I am doing a lockInit() call to set the lock rank of most
locks. Any lock that is not otherwise initialized is assumed to be a
leaf lock (a very high rank lock), so that eliminates the need to do
anything for a bunch of locks (including all architecture-dependent
locks). For two locks, root.lock and notifyList.lock (only in the
runtime/sema.go file), it is not as easy to do lock initialization, so
instead, I am passing the lock rank with the lock calls.
For Windows compilation, I needed to increase the StackGuard size from
896 to 928 because of the new lock-rank checking functions.
Checking of the static lock ranking is enabled by setting
GOEXPERIMENT=staticlockranking before doing a run.
To make sure that the static lock ranking code has no overhead in memory
or CPU when not enabled by GOEXPERIMENT, I changed 'go build/install' so
that it defines a build tag (with the same name) whenever any experiment
has been baked into the toolchain (by checking Expstring()). This allows
me to avoid increasing the size of the 'mutex' type when static lock
ranking is not enabled.
Fixes #38029
Change-Id: I154217ff307c47051f8dae9c2a03b53081acd83a
Reviewed-on: https://go-review.googlesource.com/c/go/+/207619
Reviewed-by: Dan Scales <danscales@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Dan Scales <danscales@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
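A minimal sketch of the rank check this describes, under invented names (lockRank here is not the runtime's type, and lockPartialOrder is shown as a map rather than the real array):

type lockRank int

// lockRankLeaf stands in for "not otherwise initialized": a very high rank
// that may be acquired after anything.
const lockRankLeaf lockRank = 1 << 20

// lockPartialOrder[r] lists the ranks that are allowed to be held immediately
// before acquiring a lock of rank r.
var lockPartialOrder = map[lockRank][]lockRank{}

// checkAcquire reports whether acquiring a lock of rank next is consistent
// with the documented order, given the ranks already held (oldest first).
func checkAcquire(held []lockRank, next lockRank) bool {
	if len(held) == 0 || next == lockRankLeaf {
		return true // leaf locks may follow anything
	}
	prev := held[len(held)-1]
	if prev < next {
		return true // consistent with the total order
	}
	for _, r := range lockPartialOrder[next] {
		if r == prev {
			return true // explicitly permitted by the partial order
		}
	}
	return false // lock ranking violation: acquired out of the documented order
}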
chacha8 chacha8rand.State
cheaprand uint64
// Up to 10 locks held by this m, maintained by the lock ranking code.
locksHeldLen int
locksHeld [10]heldLockInfo
runtime: prefer to restart Ps on the same M after STW
Today, Ps jump around arbitrarily across STW. Instead, try to keep the P
on the previous M it ran on. In the future, we'll likely want to try to
expand this beyond STW to create a more general affinity for specific
Ms.
For this to be useful, the Ps need to have runnable Gs. Today, STW
preemption goes through goschedImpl, which places the G on the global
run queue. If that was the only G then the P won't have runnable
goroutines anymore.
It makes more sense to keep the G with its P across STW anyway, so add a
special case to goschedImpl for that.
On my machine, this CL reduces the error rate in TestTraceSTW from 99.8%
to 1.9%.
As a nearly 2% error rate shows, there are still cases where this best
effort scheduling doesn't work. The most obvious is that while
procresize assigns Ps back to their original M, startTheWorldWithSema
calls wakep to start a spinning M. The spinning M may steal a goroutine
from another P if that P is too slow to start.
For #65694.
Change-Id: I6a6a636c0969c587d039b68bc68ea16c74ff1fc9
Reviewed-on: https://go-review.googlesource.com/c/go/+/714801
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Auto-Submit: Michael Pratt <mpratt@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
// self points to this M until mexit clears it to return nil.
self mWeakPointer
}
const mRedZoneSize = (16 << 3) * asanenabledBit // redZoneSize(2048)

type mPadded struct {
m
runtime: unify lock2, allow deeper sleep
The tri-state mutex implementation (unlocked, locked, sleeping) avoids
sleep/wake syscalls when contention is low or absent, but its
performance degrades when many threads are contending for a mutex to
execute a fast critical section.
A fast critical section means frequent unlock2 calls. Each of those
finds the mutex in the "sleeping" state and so wakes a sleeping thread,
even if many other threads are already awake and in the spin loop of
lock2 attempting to acquire the mutex for themselves. Many spinning
threads means wasting energy and CPU time that could be used by other
processes on the machine. Many threads all spinning on the same cache
line leads to performance collapse.
Merge the futex- and semaphore-based mutex implementations by using a
semaphore abstraction for futex platforms. Then, add a bit to the mutex
state word that communicates whether one of the waiting threads is awake
and spinning. When threads in lock2 see the new "spinning" bit, they can
sleep immediately. In unlock2, the "spinning" bit means we can save a
syscall and not wake a sleeping thread.
This brings up the real possibility of starvation: waiting threads are
able to enter a deeper sleep than before, since one of their peers can
volunteer to be the sole "spinning" thread and thus cause unlock2 to
skip the semawakeup call. Additionally, the waiting threads form a LIFO
stack so any wakeups that do occur will target threads that have gone to
sleep most recently. Counteract those effects by periodically waking the
thread at the bottom of the stack and allowing it to spin.
Exempt sched.lock from most of the new behaviors; it's often used by
several threads in sequence to do thread-specific work, so low-latency
handoff is a priority over improved throughput.
Gate use of this implementation behind GOEXPERIMENT=spinbitmutex, so
it's easy to disable. Enable it by default on supported platforms (the
most efficient implementation requires atomic.Xchg8).
Fixes #68578
goos: linux
goarch: amd64
pkg: runtime
cpu: 13th Gen Intel(R) Core(TM) i7-13700H
│ old │ new │
│ sec/op │ sec/op vs base │
MutexContention 17.82n ± 0% 17.74n ± 0% -0.42% (p=0.000 n=10)
MutexContention-2 22.17n ± 9% 19.85n ± 12% ~ (p=0.089 n=10)
MutexContention-3 26.14n ± 14% 20.81n ± 13% -20.41% (p=0.000 n=10)
MutexContention-4 29.28n ± 8% 21.19n ± 10% -27.62% (p=0.000 n=10)
MutexContention-5 31.79n ± 2% 21.98n ± 10% -30.83% (p=0.000 n=10)
MutexContention-6 34.63n ± 1% 22.58n ± 5% -34.79% (p=0.000 n=10)
MutexContention-7 44.16n ± 2% 23.14n ± 7% -47.59% (p=0.000 n=10)
MutexContention-8 53.81n ± 3% 23.66n ± 6% -56.04% (p=0.000 n=10)
MutexContention-9 65.58n ± 4% 23.91n ± 9% -63.54% (p=0.000 n=10)
MutexContention-10 77.35n ± 3% 26.06n ± 9% -66.31% (p=0.000 n=10)
MutexContention-11 89.62n ± 1% 25.56n ± 9% -71.47% (p=0.000 n=10)
MutexContention-12 102.45n ± 2% 25.57n ± 7% -75.04% (p=0.000 n=10)
MutexContention-13 111.95n ± 1% 24.59n ± 8% -78.04% (p=0.000 n=10)
MutexContention-14 123.95n ± 3% 24.42n ± 6% -80.30% (p=0.000 n=10)
MutexContention-15 120.80n ± 10% 25.54n ± 6% -78.86% (p=0.000 n=10)
MutexContention-16 128.10n ± 25% 26.95n ± 4% -78.96% (p=0.000 n=10)
MutexContention-17 139.80n ± 18% 24.96n ± 5% -82.14% (p=0.000 n=10)
MutexContention-18 141.35n ± 7% 25.05n ± 8% -82.27% (p=0.000 n=10)
MutexContention-19 151.35n ± 18% 25.72n ± 6% -83.00% (p=0.000 n=10)
MutexContention-20 153.30n ± 20% 24.75n ± 6% -83.85% (p=0.000 n=10)
MutexHandoff/Solo-20 13.54n ± 1% 13.61n ± 4% ~ (p=0.206 n=10)
MutexHandoff/FastPingPong-20 141.3n ± 209% 164.8n ± 49% ~ (p=0.436 n=10)
MutexHandoff/SlowPingPong-20 1.572µ ± 16% 1.804µ ± 19% +14.76% (p=0.015 n=10)
geomean 74.34n 30.26n -59.30%
goos: darwin
goarch: arm64
pkg: runtime
cpu: Apple M1
│ old │ new │
│ sec/op │ sec/op vs base │
MutexContention 13.86n ± 3% 12.09n ± 3% -12.73% (p=0.000 n=10)
MutexContention-2 15.88n ± 1% 16.50n ± 2% +3.94% (p=0.001 n=10)
MutexContention-3 18.45n ± 2% 16.88n ± 2% -8.54% (p=0.000 n=10)
MutexContention-4 20.01n ± 2% 18.94n ± 18% ~ (p=0.469 n=10)
MutexContention-5 22.60n ± 1% 17.51n ± 9% -22.50% (p=0.000 n=10)
MutexContention-6 23.93n ± 2% 17.35n ± 2% -27.48% (p=0.000 n=10)
MutexContention-7 24.69n ± 1% 17.15n ± 3% -30.54% (p=0.000 n=10)
MutexContention-8 25.01n ± 1% 17.33n ± 2% -30.69% (p=0.000 n=10)
MutexHandoff/Solo-8 13.96n ± 4% 12.04n ± 4% -13.78% (p=0.000 n=10)
MutexHandoff/FastPingPong-8 68.89n ± 4% 64.62n ± 2% -6.20% (p=0.000 n=10)
MutexHandoff/SlowPingPong-8 9.698µ ± 22% 9.646µ ± 35% ~ (p=0.912 n=10)
geomean 38.20n 32.53n -14.84%
Change-Id: I0058c75eadf282d08eea7fce0d426f0518039f7c
Reviewed-on: https://go-review.googlesource.com/c/go/+/620435
Reviewed-by: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Auto-Submit: Rhys Hiltner <rhys.hiltner@gmail.com>
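A small sketch of the unlock-side decision described above, with invented bit names; the real lock2/unlock2 state word carries more information (waiter list, lock bit layout, etc.):

// mutex state bits in the spirit of the description above (illustrative values).
const (
	mutexLocked   uint32 = 1 << 0 // lock is held
	mutexSpinning uint32 = 1 << 1 // some waiter is awake and spinning
	mutexSleeping uint32 = 1 << 2 // at least one waiter is asleep
)

// unlockNeedsWake captures the new rule: a sleeper is only woken when there
// are sleepers and nobody is already awake spinning on the lock, which is
// what saves the wake syscall under heavy contention.
func unlockNeedsWake(state uint32) bool {
	return state&mutexSleeping != 0 && state&mutexSpinning == 0
}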
// Size the runtime.m structure so it fits in the 2048-byte size class, and
// not in the next-smallest (1792-byte) size class. That leaves the 11 low
// bits of muintptr values available for flags, as required by
// lock_spinbit.go.
_ [(1 - goarch.IsWasm) * (2048 - mallocHeaderSize - mRedZoneSize - unsafe.Sizeof(m{}))]byte
}
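The blank-array padding idiom used by mPadded, shown as a standalone sketch with invented names (assumes a file that imports "unsafe"):

type payload struct{ a, b, c uint64 } // 24 bytes

// padded uses a blank array whose length is computed at compile time to round
// the struct up to 64 bytes; a negative length would be a compile error, so
// the expression doubles as a size assertion, just like the field in mPadded.
type padded struct {
	payload
	_ [64 - unsafe.Sizeof(payload{})]byte
}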
// mWeakPointer is a "weak" pointer to an M. A weak pointer for each M is
// available as m.self. Users may copy mWeakPointer arbitrarily, and get will
// return the M if it is still live, or nil after mexit.
//
// The zero value is treated as a nil pointer.
//
// Note that get may race with M exit. A successful get will keep the m object
// alive, but the M itself may be exited and thus not actually usable.
type mWeakPointer struct {
m *atomic.Pointer[m]
}
func newMWeakPointer(mp *m) mWeakPointer {
	w := mWeakPointer{m: new(atomic.Pointer[m])}
	w.m.Store(mp)
	return w
}

func (w mWeakPointer) get() *m {
	if w.m == nil {
		return nil
	}
	return w.m.Load()
}

// clear sets the weak pointer to nil. It cannot be used on zero value
// mWeakPointers.
func (w mWeakPointer) clear() {
	w.m.Store(nil)
}
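A hypothetical usage sketch (not code from the runtime) showing the intended access pattern for the mWeakPointer declared above:

func exampleUse(mp *m) {
	w := newMWeakPointer(mp) // normally done once and stored as mp.self
	if target := w.get(); target != nil {
		// target stays reachable, but the M may already have exited, so
		// callers must still be prepared for an unusable M.
		_ = target
	}
	w.clear() // at mexit, this breaks every outstanding copy at once
}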
type p struct {
id int32
status uint32 // one of pidle/prunning/...
link puintptr
schedtick uint32 // incremented on every scheduler call
syscalltick uint32 // incremented on every system call
sysmontick sysmontick // last tick observed by sysmon
m muintptr // back-link to associated m (nil if idle)
mcache *mcache
pcache pageCache
raceprocctx uintptr
// oldm is the previous m this p ran on.
//
// We are not associated with this m, so we have no control over its
// lifecycle. This value is an m.self object which points to the m
// until the m exits.
//
// Note that this m may be idle, running, or exiting. It should only be
// used with mgetSpecific, which will take ownership of the m only if
// it is idle.
oldm mWeakPointer
deferpool []*_defer // pool of available defer structs (see panic.go)
deferpoolbuf [32]*_defer
// Cache of goroutine ids, amortizes accesses to runtime·sched.goidgen.
goidcache uint64
goidcacheend uint64
runtime: yield time slice to most recently readied G
Currently, when the runtime ready()s a G, it adds it to the end of the
current P's run queue and continues running. If there are many other
things in the run queue, this can result in a significant delay before
the ready()d G actually runs and can hurt fairness when other Gs in
the run queue are CPU hogs. For example, if there are three Gs sharing
a P, one of which is a CPU hog that never voluntarily gives up the P
and the other two of which are doing small amounts of work and
communicating back and forth on an unbuffered channel, the two
communicating Gs will get very little CPU time.
Change this so that when G1 ready()s G2 and then blocks, the scheduler
immediately hands off the remainder of G1's time slice to G2. In the
above example, the two communicating Gs will now act as a unit and
together get half of the CPU time, while the CPU hog gets the other
half of the CPU time.
This fixes the problem demonstrated by the ping-pong benchmark added
in the previous commit:
benchmark old ns/op new ns/op delta
BenchmarkPingPongHog 684287 825 -99.88%
On the x/benchmarks suite, this change improves the performance of
garbage by ~6% (for GOMAXPROCS=1 and 4), and json by 28% and 36% for
GOMAXPROCS=1 and 4. It has negligible effect on heap size.
This has no effect on the go1 benchmark suite since those benchmarks
are mostly single-threaded.
Change-Id: I858a08eaa78f702ea98a5fac99d28a4ac91d339f
Reviewed-on: https://go-review.googlesource.com/9289
Reviewed-by: Rick Hudson <rlh@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
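A toy sketch of the hand-off policy this describes, with invented types (simpleP, gSketch); the real implementation is the runnext handling in the run queue code in proc.go:

type gSketch struct{ id int }

type simpleP struct {
	runnext *gSketch
	runq    []*gSketch
}

// readyG implements the policy described above: the goroutine readied by the
// running G takes the runnext slot (inheriting the remaining time slice), and
// whatever previously occupied runnext is pushed onto the ordinary run queue.
func (p *simpleP) readyG(gp *gSketch) {
	if old := p.runnext; old != nil {
		p.runq = append(p.runq, old)
	}
	p.runnext = gp
}

// pick returns the next goroutine to run, preferring runnext over the queue.
func (p *simpleP) pick() *gSketch {
	if gp := p.runnext; gp != nil {
		p.runnext = nil
		return gp
	}
	if len(p.runq) == 0 {
		return nil
	}
	gp := p.runq[0]
	p.runq = p.runq[1:]
	return gp
}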
// Queue of runnable goroutines. Accessed without lock.
runqhead uint32
runqtail uint32
runq [256]guintptr
// runnext, if non-nil, is a runnable G that was ready'd by
// the current G and should be run next instead of what's in
// runq if there's time remaining in the running G's time
// slice. It will inherit the time left in the current time
// slice. If a set of goroutines is locked in a
// communicate-and-wait pattern, this schedules that set as a
// unit and eliminates the (potentially large) scheduling
// latency that otherwise arises from adding the ready'd
// goroutines to the end of the run queue.
//
// Note that while other P's may atomically CAS this to zero,
// only the owner P can CAS it to a valid G.
runnext guintptr
// Available G's (status == Gdead)
gFree gList

sudogcache []*sudog
sudogbuf [128]*sudog

// Cache of mspan objects from the heap.
mspancache struct {
// We need an explicit length here because this field is used
// in allocation codepaths where write barriers are not allowed,
// and eliminating the write barrier/keeping it eliminated from
// slice updates is tricky, more so than just managing the length
// ourselves.
len int
buf [128]*mspan
}

// Cache of a single pinner object to reduce allocations from repeated
// pinner creation.
pinnerCache *pinner

trace pTraceState

palloc persistentAlloc // per-P to avoid mutex

// Per-P GC state
gcAssistTime int64 // Nanoseconds in assistAlloc
gcFractionalMarkTime atomic.Int64 // Nanoseconds in fractional mark worker
runtime: only use CPU time from the current window in the GC CPU limiter
Currently the GC CPU limiter consumes CPU time from a few pools, but
because the events that flush to those pools may overlap, rather than be
strictly contained within, the update window for the GC CPU limiter, the
limiter's accounting is ultimately sloppy.
This sloppiness complicates accounting for idle time more completely,
and makes reasoning about the transient behavior of the GC CPU limiter
much more difficult.
To remedy this, this CL adds a field to the P struct that tracks the
start time of any in-flight event the limiter might care about, along
with information about the nature of that event. This timestamp is
managed atomically so that the GC CPU limiter can come in and perform a
read of the partial CPU time consumed by a given event. The limiter also
updates the timestamp so that only what's left over is flushed by the
event itself when it completes.
The end result of this change is that, since the GC CPU limiter is aware
of all past completed events, and all in-flight events, it can much more
accurately collect the CPU time of events since the last update. There's
still the possibility for skew, but any leftover time will be captured
in the following update, and the magnitude of this leftover time is
effectively bounded by the update period of the GC CPU limiter, which is
much easier to consider.
One caveat of managing this timestamp-type combo atomically is that they
need to be packed in 64 bits. So, this CL gives up the top 3 bits of the
timestamp and places the type information there. What this means is we
effectively have only a 61-bit resolution timestamp. This is fine when
the top 3 bits are the same between calls to nanotime, but becomes a
problem on boundaries when those 3 bits change. These cases may cause
hiccups in the GC CPU limiter by not accounting for some source of CPU
time correctly, but with 61 bits of resolution this should be extremely
rare. The rate of update is on the order of milliseconds, so at worst
the runtime will be off of any given measurement by only a few
CPU-milliseconds (and this is directly bounded by the rate of update).
We're probably more inaccurate from the fact that we don't measure real
CPU time but only approximate it.
For #52890.
Change-Id: I347f30ac9e2ba6061806c21dfe0193ef2ab3bbe9
Reviewed-on: https://go-review.googlesource.com/c/go/+/410120
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
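A sketch of the timestamp/type packing described above, with invented names and without the runtime's exact encoding details:

// packedEvent packs a 3-bit event type into the top bits of a 61-bit
// nanotime-style timestamp so both can be read and swapped atomically as one
// 64-bit word.
type packedEvent uint64

const (
	eventTypeBits = 3
	eventTimeBits = 64 - eventTypeBits
	eventTimeMask = uint64(1)<<eventTimeBits - 1
)

func packEvent(typ uint8, start int64) packedEvent {
	return packedEvent(uint64(typ)<<eventTimeBits | uint64(start)&eventTimeMask)
}

func (e packedEvent) typ() uint8   { return uint8(e >> eventTimeBits) }
func (e packedEvent) start() int64 { return int64(uint64(e) & eventTimeMask) }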
// limiterEvent tracks events for the GC CPU limiter.
limiterEvent limiterEvent
runtime: manage gcBgMarkWorkers with a global pool
Background mark workers perform per-P marking work. Currently each
worker is assigned a P at creation time. The worker "attaches" to the P
via p.gcBgMarkWorker, making itself (usually) available to
findRunnableGCWorker for scheduling GC work.
While running gcMarkDone, the worker "detaches" from the P (by clearing
p.gcBgMarkWorker), since it may park for other reasons and should not be
scheduled by findRunnableGCWorker.
Unfortunately, this design is complex and difficult to reason about. We
simplify things by changing the design to eliminate the hard P
attachment. Rather than workers always performing work from the same P,
workers perform work for whichever P they find themselves on. On park,
the workers are placed in a pool of free workers, which each P's
findRunnableGCWorker can use to run a worker for its P.
Now if a worker parks in gcMarkDone, a P may simply use another worker
from the pool to complete its own work.
The P's GC worker mode is used to communicate the mode to run to the
selected worker. It is also used to emit the appropriate worker
EvGoStart tracepoint. This is a slight change, as this G may be
preempted (e.g., in gcMarkDone). When it is rescheduled, the trace
viewer will show it as a normal goroutine again. It is currently a bit
difficult to connect to the original worker tracepoint, as the viewer
does not display the goid for the original worker (though the data is in
the trace file).
Change-Id: Id7bd3a364dc18a4d2b1c99c4dc4810fae1293c1b
Reviewed-on: https://go-review.googlesource.com/c/go/+/262348
Run-TryBot: Michael Pratt <mpratt@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Trust: Michael Pratt <mpratt@google.com>
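A toy model of the shared worker pool, using invented names; a buffered channel stands in for the runtime's actual free-worker structure:

type markWorker struct{ id int }

// workerPool holds parked workers; Ps try to grab one and simply skip mark
// work for this round when none is free.
type workerPool struct {
	free chan *markWorker
}

func newWorkerPool(n int) *workerPool {
	p := &workerPool{free: make(chan *markWorker, n)}
	for i := 0; i < n; i++ {
		p.free <- &markWorker{id: i}
	}
	return p
}

// tryGet is what a P would call from its scheduling loop; it never blocks.
func (p *workerPool) tryGet() *markWorker {
	select {
	case w := <-p.free:
		return w
	default:
		return nil // no parked worker available right now
	}
}

// put returns a worker to the pool when it parks.
func (p *workerPool) put(w *markWorker) { p.free <- w }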
// gcMarkWorkerMode is the mode for the next mark worker to run in.
// That is, this is used to communicate with the worker goroutine
// selected for immediate execution by
// gcController.findRunnableGCWorker. When scheduling other goroutines,
// this field must be set to gcMarkWorkerNotWorker.
gcMarkWorkerMode gcMarkWorkerMode

// gcMarkWorkerStartTime is the nanotime() at which the most recent
// mark worker started.
gcMarkWorkerStartTime int64
// nextGCMarkWorker is the next mark worker to run. This may be set
// during start-the-world to assign a worker to this P. The P runs this
// worker on the next call to gcController.findRunnableGCWorker. If the
// P runs something else or stops, it must release this worker via
// gcController.releaseNextGCMarkWorker.
//
// See comment in gcBgMarkWorker about the lifetime of
// gcBgMarkWorkerNode.
//
// Only accessed by this P or during STW.
nextGCMarkWorker *gcBgMarkWorkerNode
runtime: replace per-M workbuf cache with per-P gcWork cache
Currently, each M has a cache of the most recently used *workbuf. This
is used primarily by the write barrier so it doesn't have to access
the global workbuf lists on every write barrier. It's also used by
stack scanning because it's convenient.
This cache is important for write barrier performance, but this
particular approach has several downsides. It's faster than no cache,
but far from optimal (as the benchmarks below show). It's complex:
access to the cache is sprinkled through most of the workbuf list
operations and it requires special care to transform into and back out
of the gcWork cache that's actually used for scanning and marking. It
requires atomic exchanges to take ownership of the cached workbuf and
to return it to the M's cache even though it's almost always used by
only the current M. Since it's per-M, flushing these caches is O(# of
Ms), which may be high. And it has some significant subtleties: for
example, in general the cache shouldn't be used after the
harvestwbufs() in mark termination because it could hide work from
mark termination, but stack scanning can happen after this and *will*
use the cache (but it turns out this is okay because it will always be
followed by a getfull(), which drains the cache).
This change replaces this cache with a per-P gcWork object. This
gcWork cache can be used directly by scanning and marking (as long as
preemption is disabled, which is a general requirement of gcWork).
Since it's per-P, it doesn't require synchronization, which simplifies
things and means the only atomic operations in the write barrier are
occasionally fetching new work buffers and setting a mark bit if the
object isn't already marked. This cache can be flushed in O(# of Ps),
which is generally small. It follows a simple flushing rule: the cache
can be used during any phase, but during mark termination it must be
flushed before allowing preemption. This also makes the dispose during
mutator assist no longer necessary, which eliminates the vast majority
of gcWork dispose calls and reduces contention on the global workbuf
lists. And it's a lot faster on some benchmarks:
benchmark old ns/op new ns/op delta
BenchmarkBinaryTree17 11963668673 11206112763 -6.33%
BenchmarkFannkuch11 2643217136 2649182499 +0.23%
BenchmarkFmtFprintfEmpty 70.4 70.2 -0.28%
BenchmarkFmtFprintfString 364 307 -15.66%
BenchmarkFmtFprintfInt 317 282 -11.04%
BenchmarkFmtFprintfIntInt 512 483 -5.66%
BenchmarkFmtFprintfPrefixedInt 404 380 -5.94%
BenchmarkFmtFprintfFloat 521 479 -8.06%
BenchmarkFmtManyArgs 2164 1894 -12.48%
BenchmarkGobDecode 30366146 22429593 -26.14%
BenchmarkGobEncode 29867472 26663152 -10.73%
BenchmarkGzip 391236616 396779490 +1.42%
BenchmarkGunzip 96639491 96297024 -0.35%
BenchmarkHTTPClientServer 100110 70763 -29.31%
BenchmarkJSONEncode 51866051 52511382 +1.24%
BenchmarkJSONDecode 103813138 86094963 -17.07%
BenchmarkMandelbrot200 4121834 4120886 -0.02%
BenchmarkGoParse 16472789 5879949 -64.31%
BenchmarkRegexpMatchEasy0_32 140 140 +0.00%
BenchmarkRegexpMatchEasy0_1K 394 394 +0.00%
BenchmarkRegexpMatchEasy1_32 120 120 +0.00%
BenchmarkRegexpMatchEasy1_1K 621 614 -1.13%
BenchmarkRegexpMatchMedium_32 209 202 -3.35%
BenchmarkRegexpMatchMedium_1K 54889 55175 +0.52%
BenchmarkRegexpMatchHard_32 2682 2675 -0.26%
BenchmarkRegexpMatchHard_1K 79383 79524 +0.18%
BenchmarkRevcomp 584116718 584595320 +0.08%
BenchmarkTemplate 125400565 109620196 -12.58%
BenchmarkTimeParse 386 387 +0.26%
BenchmarkTimeFormat 580 447 -22.93%
(Best out of 10 runs. The delta of averages is similar.)
This also puts us in a good position to flush these caches when
nearing the end of concurrent marking, which will let us increase the
size of the work buffers while still controlling mark termination
pause time.
Change-Id: I2dd94c8517a19297a98ec280203cccaa58792522
Reviewed-on: https://go-review.googlesource.com/9178
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
// gcw is this P's GC work buffer cache. The work buffer is
// filled by write barriers, drained by mutator assists, and
// disposed on certain GC state transitions.
gcw gcWork
runtime: buffered write barrier implementation
This implements runtime support for buffered write barriers on amd64.
The buffered write barrier has a fast path that simply enqueues
pointers in a per-P buffer. Unlike the current write barrier, this
fast path is *not* a normal Go call and does not require the compiler
to spill general-purpose registers or put arguments on the stack. When
the buffer fills up, the write barrier takes the slow path, which
spills all general purpose registers and flushes the buffer. We don't
allow safe-points or stack splits while this frame is active, so it
doesn't matter that we have no type information for the spilled
registers in this frame.
One minor complication is cgocheck=2 mode, which uses the write
barrier to detect Go pointers being written to non-Go memory. We
obviously can't buffer this, so instead we set the buffer to its
minimum size, forcing the write barrier into the slow path on every
call. For this specific case, we pass additional information as
arguments to the flush function. This also requires enabling the cgo
write barrier slightly later during runtime initialization, after Ps
(and the per-P write barrier buffers) have been initialized.
The code in this CL is not yet active. The next CL will modify the
compiler to generate calls to the new write barrier.
This reduces the average cost of the write barrier by roughly a factor
of 4, which will pay for the cost of having it enabled more of the
time after we make the GC pacer less aggressive. (Benchmarks will be
in the next CL.)
Updates #14951.
Updates #22460.
Change-Id: I396b5b0e2c5e5c4acfd761a3235fd15abadc6cb1
Reviewed-on: https://go-review.googlesource.com/73711
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
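A toy version of the per-P buffer fast path described above, with invented names (ptrBuf is not wbBuf and omits the real flush conditions and register handling):

// ptrBuf batches pointers so the common case is one bounds check and one
// store; only a full buffer takes the expensive flush path.
type ptrBuf struct {
	buf  []uintptr
	next int
}

func newPtrBuf(n int) *ptrBuf { return &ptrBuf{buf: make([]uintptr, n)} }

// record is the fast path; flush stands in for the runtime's
// "spill registers and drain the buffer" slow path.
func (b *ptrBuf) record(p uintptr, flush func([]uintptr)) {
	b.buf[b.next] = p
	b.next++
	if b.next == len(b.buf) {
		flush(b.buf[:b.next])
		b.next = 0
	}
}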
// wbBuf is this P's GC write barrier buffer.
//
// TODO: Consider caching this in the running G.
wbBuf wbBuf
runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point

// statsSeq is a counter indicating whether this P is currently
// writing any stats. Its value is even when not, odd when it is.
statsSeq atomic.Uint32
// Timer heap.
// Timer heap.
timers timers

// Cleanups.
cleanups *cleanupBlock
cleanupsQueued uint64 // monotonic count of cleanups queued by this P
// maxStackScanDelta accumulates the amount of stack space held by
// live goroutines (i.e. those eligible for stack scanning).
// Flushed to gcController.maxStackScan once maxStackScanSlack
// or -maxStackScanSlack is reached.
maxStackScanDelta int64

// gc-time statistics about current goroutines
// Note that this differs from maxStackScan in that this
// accumulates the actual stack observed to be used at GC time (hi - sp),
// not an instantaneous measure of the total stack size that might need
// to be scanned (hi - lo).
scannedStackSize uint64 // stack size of goroutines scanned by this P
scannedStacks uint64 // number of goroutines scanned by this P
// preempt is set to indicate that this P should enter the
// scheduler ASAP (regardless of what G is running on it).
preempt bool

// gcStopTime is the nanotime timestamp that this P last entered _Pgcstop.
gcStopTime int64

// goroutinesCreated is the total count of goroutines created by this P.
goroutinesCreated uint64
runtime: save scalar registers off stack in amd64 async preemption
Asynchronous preemption must save all registers that could be in use
by Go code. Currently, it saves all of these to the goroutine stack.
As a result, the stack frame requirements of asynchronous preemption
can be rather high. On amd64, this requires 368 bytes of stack space,
most of which is the XMM registers. Several RISC architectures are
around 0.5 KiB.
As we add support for SIMD instructions, this is going to become a
problem. The AVX-512 register state is 2.5 KiB. This well exceeds the
nosplit limit, and even if it didn't, could constrain when we can
asynchronously preempt goroutines on small stacks.
This CL fixes this by moving pure scalar state stored in non-GP
registers off the stack and into an allocated "extended register
state" object. To reduce space overhead, we only allocate these
objects as needed. While in the theoretical limit, every G could need
this register state, in practice very few do at a time.
However, we can't allocate when we're in the middle of saving the
register state during an asynchronous preemption, so we reserve
scratch space on every P to temporarily store the register state,
which can then be copied out to an allocated state object later by Go
code.
This commit only implements this for amd64, since that's where we're
about to add much more vector state, but it lays the groundwork for
doing this on any architecture that could benefit.
This is a cherry-pick of CL 680898 plus bug fix CL 684836 from the
dev.simd branch.
Change-Id: I123a95e21c11d5c10942d70e27f84d2d99bbf735
Reviewed-on: https://go-review.googlesource.com/c/go/+/669195
Auto-Submit: Austin Clements <austin@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
// xRegs is the per-P extended register state used by asynchronous
// preemption. This is an empty struct on platforms that don't use extended
// register state.
xRegs xRegPerP
// Padding is no longer needed. False sharing is now not a worry because p is large enough
// that its size class is an integer multiple of the cache line size (for any of our architectures).
}
type schedt struct {
goidgen atomic.Uint64
lastpoll atomic.Int64 // time of last network poll, 0 if currently polling
pollUntil atomic.Int64 // time to which current poll is sleeping
pollingNet atomic.Int32 // 1 if some P doing non-blocking network poll

lock mutex
runtime: make it possible to exit Go-created threads
Currently, threads created by the runtime exist until the whole
program exits. For #14592 and #20395, we want to be able to exit and
clean up threads created by the runtime. This commit implements that
mechanism.
The main difficulty is how to clean up the g0 stack. In cgo mode and
on Solaris and Windows where the OS manages thread stacks, we simply
arrange to return from mstart and let the system clean up the thread.
If the runtime allocated the g0 stack, then we use a new exitThread
syscall wrapper that arranges to clear a flag in the M once the stack
can safely be reaped and call the thread termination syscall.
exitThread is based on the existing exit1 wrapper, which was always
meant to terminate the calling thread. However, exit1 has never been
used since it was introduced 9 years ago, so it was broken on several
platforms. exitThread also has the additional complication of having
to flag that the stack is unused, which requires some tricks on
platforms that use the stack for syscalls.
This still leaves the problem of how to reap the unused g0 stacks. For
this, we move the M from allm to a new freem list as part of the M
exiting. Later, allocm scans the freem list, finds Ms that are marked
as done with their stack, removes these from the list and frees their
g0 stacks. This also allows these Ms to be garbage collected.
This CL does not yet use any of this functionality. Follow-up CLs
will. Likewise, there are no new tests in this CL because we'll need
follow-up functionality to test it.
Change-Id: Ic851ee74227b6d39c6fc1219fc71b45d3004bc63
Reviewed-on: https://go-review.googlesource.com/46037
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
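A sketch of the freem scan described above, with invented names (deadM, stackDone); the real code uses atomic flags and lives in allocm and mexit:

// deadM is a stand-in for an exited M on the freem list; in the real runtime
// the "stack is no longer in use" flag is an atomic field, not a plain bool.
type deadM struct {
	stackDone bool // set by the exiting thread once its g0 stack is unused
	next      *deadM
}

// reapFreeM mimics the scan described above: Ms whose stacks are safe to reap
// have their stacks freed and are dropped; the rest stay on the list.
func reapFreeM(head *deadM, freeStack func(*deadM)) *deadM {
	var keep *deadM
	for m := head; m != nil; {
		next := m.next
		if m.stackDone {
			freeStack(m) // stack can be reclaimed; drop m from the list
		} else {
			m.next = keep
			keep = m
		}
		m = next
	}
	return keep
}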
// When increasing nmidle, nmidlelocked, nmsys, or nmfreed, be
// sure to call checkdead().
midle listHeadManual // idle m's waiting for work
nmidle int32 // number of idle m's waiting for work
nmidlelocked int32 // number of locked m's waiting for work
mnext int64 // number of m's that have been created and next M ID
maxmcount int32 // maximum number of m's allowed (or die)
nmsys int32 // number of system m's not counted for deadlock
nmfreed int64 // cumulative number of freed m's
|
2014-11-11 17:05:19 -05:00
|
|
|
|
2025-07-23 17:35:54 +00:00
|
|
|
ngsys atomic.Int32 // number of system goroutines
|
|
|
|
|
nGsyscallNoP atomic.Int32 // number of goroutines in syscalls without a P
|
2016-01-06 21:16:01 -05:00
|
|
|
|
2022-03-01 15:06:37 -05:00
|
|
|
pidle puintptr // idle p's
|
|
|
|
|
npidle atomic.Int32
|
2022-08-12 16:37:42 +07:00
|
|
|
nmspinning atomic.Int32 // See "Worker thread parking/unparking" comment in proc.go.
|
2022-03-01 15:06:37 -05:00
|
|
|
needspinning atomic.Uint32 // See "Delicate dance" comment in proc.go. Boolean. Must hold sched.lock to set to 1.
|
2014-11-11 17:05:19 -05:00
|
|
|
|
|
|
|
|
// Global runnable queue.
|
2025-04-07 17:08:19 +03:00
|
|
|
runq gQueue
|
2014-11-11 17:05:19 -05:00
|
|
|
|
2018-09-11 11:28:24 -04:00
|
|
|
// disable controls selective disabling of the scheduler.
|
|
|
|
|
//
|
|
|
|
|
// Use schedEnableUser to control this.
|
|
|
|
|
//
|
|
|
|
|
// disable is protected by sched.lock.
|
|
|
|
|
disable struct {
|
|
|
|
|
// user disables scheduling of user goroutines.
|
|
|
|
|
user bool
|
|
|
|
|
runnable gQueue // pending runnable Gs
|
|
|
|
|
}
|
|
|
|
|
|
2014-11-11 17:05:19 -05:00
|
|
|
// Global cache of dead G's.
|
2018-08-10 10:19:03 -04:00
|
|
|
gFree struct {
|
|
|
|
|
lock mutex
|
|
|
|
|
stack gList // Gs with stacks
|
|
|
|
|
noStack gList // Gs without stacks
|
|
|
|
|
}
|
2014-11-11 17:05:19 -05:00
|
|
|
|
2015-02-03 00:33:02 +03:00
|
|
|
// Central cache of sudog structs.
|
|
|
|
|
sudoglock mutex
|
|
|
|
|
sudogcache *sudog
|
|
|
|
|
|
2021-06-08 18:45:18 -04:00
|
|
|
// Central pool of available defer structs.
|
2015-02-05 13:35:41 +00:00
|
|
|
deferlock mutex
|
2021-06-08 18:45:18 -04:00
|
|
|
deferpool *_defer
|
2015-02-05 13:35:41 +00:00
|
|
|
|
2017-06-16 15:54:21 -04:00
|
|
|
// freem is the list of m's waiting to be freed when their
|
|
|
|
|
// m.exited is set. Linked through m.freelink.
|
|
|
|
|
freem *m
|
|
|
|
|
|
2022-07-25 15:31:03 -04:00
|
|
|
gcwaiting atomic.Bool // gc is waiting to run
|
2014-11-11 17:05:19 -05:00
|
|
|
stopwait int32
|
|
|
|
|
stopnote note
|
2022-07-25 15:39:07 -04:00
|
|
|
sysmonwait atomic.Bool
|
2014-11-11 17:05:19 -05:00
|
|
|
sysmonnote note
|
|
|
|
|
|
2023-11-18 13:51:35 +08:00
|
|
|
// safePointFn should be called on each P at the next GC
|
2015-03-27 16:49:12 -04:00
|
|
|
// safepoint if p.runSafePointFn is set.
|
runtime: use separate count and note for forEachP
Currently, forEachP reuses the stopwait and stopnote fields from
stopTheWorld to track how many Ps have not responded to the safe-point
request and to sleep until all Ps have responded.
It was assumed this was safe because both stopTheWorld and forEachP
must occur under worldsema and hence stopwait and stopnote cannot
be used for both purposes simultaneously and callers could always
determine the appropriate use based on sched.gcwaiting (which is only
set by stopTheWorld). However, this is not the case, since it's
possible for there to be a window between when an M observes that
gcwaiting is set and when it checks stopwait during which stopwait
could have changed meanings. When this happens, the M decrements
stopwait and may wakeup stopnote, but does not otherwise participate
in the forEachP protocol. As a result, stopwait is decremented too
many times, so it may reach zero before all Ps have run the safe-point
function, causing forEachP to wake up early. It will then either
observe that some P has not run the safe-point function and panic with
"P did not run fn", or the remaining P (or Ps) will run the safe-point
function before it wakes up and it will observe that stopwait is
negative and panic with "not stopped".
Fix this problem by giving forEachP its own safePointWait and
safePointNote fields.
One known sequence of events that can cause this race is as
follows. It involves three actors:
G1 is running on M1 on P1. P1 has an empty run queue.
G2/M2 is in a blocked syscall and has lost its P. (The details of this
don't matter, it just needs to be in a position where it needs to grab
an idle P.)
GC just started on G3/M3/P3. (These aren't very involved, they just
have to be separate from the other G's, M's, and P's.)
1. GC calls stopTheWorld(), which sets sched.gcwaiting to 1.
Now G1/M1 begins to enter a syscall:
2. G1/M1 invokes reentersyscall, which sets the P1's status to
_Psyscall.
3. G1/M1's reentersyscall observes gcwaiting != 0 and calls
entersyscall_gcwait.
4. G1/M1's entersyscall_gcwait blocks acquiring sched.lock.
Back on GC:
5. stopTheWorld cas's P1's status to _Pgcstop, does other stuff, and
returns.
6. GC does stuff and then calls startTheWorld().
7. startTheWorld() calls procresize(), which sets P1's status to
_Pidle and puts P1 on the idle list.
Now G2/M2 returns from its syscall and takes over P1:
8. G2/M2 returns from its blocked syscall and gets P1 from the idle
list.
9. G2/M2 acquires P1, which sets P1's status to _Prunning.
10. G2/M2 starts a new syscall and invokes reentersyscall, which sets
P1's status to _Psyscall.
Back on G1/M1:
11. G1/M1 finally acquires sched.lock in entersyscall_gcwait.
At this point, G1/M1 still thinks it's running on P1. P1's status is
_Psyscall, which is consistent with what G1/M1 is doing, but it's
_Psyscall because *G2/M2* put it in to _Psyscall, not G1/M1. This is
basically an ABA race on P1's status.
Because forEachP currently shares stopwait with stopTheWorld, G1/M1's
entersyscall_gcwait observes the non-zero stopwait set by forEachP,
but mistakes it for a stopTheWorld. It cas's P1's status from
_Psyscall (set by G2/M2) to _Pgcstop and proceeds to decrement
stopwait one more time than forEachP was expecting.
Fixes #10618. (See the issue for details on why the above race is safe
when forEachP is not involved.)
Prior to this commit, the command
stress ./runtime.test -test.run TestFutexsleep\|TestGoroutineProfile
would reliably fail after a few hundred runs. With this commit, it
ran for over 2 million runs and never crashed.
Change-Id: I9a91ea20035b34b6e5f07ef135b144115f281f30
Reviewed-on: https://go-review.googlesource.com/10157
Reviewed-by: Russ Cox <rsc@golang.org>
2015-05-15 16:31:17 -04:00
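The count-and-note pattern that the new safePointWait/safePointNote fields implement can be sketched with portable primitives (sync.Cond standing in for the runtime's note type; illustrative only, assumes import "sync"):

type safePointBarrier struct {
	mu   sync.Mutex
	cond sync.Cond
	wait int // Ps that still need to run the safe-point function
}

func newSafePointBarrier(nprocs int) *safePointBarrier {
	b := &safePointBarrier{wait: nprocs}
	b.cond.L = &b.mu
	return b
}

// done is called by each P after it runs the safe-point function; the last
// caller wakes the goroutine driving forEachP.
func (b *safePointBarrier) done() {
	b.mu.Lock()
	b.wait--
	if b.wait == 0 {
		b.cond.Signal()
	}
	b.mu.Unlock()
}

// waitAll is called by forEachP once the request has been posted to every P.
func (b *safePointBarrier) waitAll() {
	b.mu.Lock()
	for b.wait > 0 {
		b.cond.Wait()
	}
	b.mu.Unlock()
}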
|
|
|
safePointFn func(*p)
|
|
|
|
|
safePointWait int32
|
|
|
|
|
safePointNote note
|
2015-03-27 16:49:12 -04:00
|
|
|
|
2014-11-11 17:05:19 -05:00
|
|
|
profilehz int32 // cpu profiling rate
|
2015-04-01 13:47:35 -04:00
|
|
|
|
|
|
|
|
procresizetime int64 // nanotime() of last change to gomaxprocs
|
|
|
|
|
totaltime int64 // ∫gomaxprocs dt up to procresizetime
|
2020-05-19 16:33:17 +00:00
|
|
|
|
runtime: use cgroup CPU limit to set GOMAXPROCS
This CL adds two related features enabled by default via compatibility
GODEBUGs containermaxprocs and updatemaxprocs.
On Linux, containermaxprocs makes the Go runtime consider cgroup CPU
bandwidth limits (quota/period) when setting GOMAXPROCS. If the cgroup
limit is lower than the number of logical CPUs available, then the
cgroup limit takes precedence.
On all OSes, updatemaxprocs makes the Go runtime periodically
recalculate the default GOMAXPROCS value and update GOMAXPROCS if it has
changed. If GOMAXPROCS is set manually, this update does not occur. This
is intended primarily to detect changes to cgroup limits, but it applies
on all OSes because the CPU affinity mask can change as well.
The runtime only considers the limit in the leaf cgroup (the one that
actually contains the process), caching the CPU limit file
descriptor(s), which are periodically reread for updates. This is a
small departure from the original proposed design. It will not consider
limits of parent cgroups (which may be lower than the leaf), and it will
not detect cgroup migration after process start.
We can consider changing this in the future, but the simpler approach is
less invasive; less risk to packages that have some awareness of runtime
internals. e.g., if the runtime periodically opens new files during
execution, file descriptor leak detection is difficult to implement in a
stable way.
For #73193.
Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64-longtest
Change-Id: I6a6a636c631c1ae577fb8254960377ba91c5dc98
Reviewed-on: https://go-review.googlesource.com/c/go/+/670497
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2025-05-05 13:44:26 -04:00
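A hedged sketch of the arithmetic involved, assuming a cgroup v2 cpu.max file of the form "<quota> <period>" or "max <period>" (effectiveMaxProcs is a hypothetical helper; the exact rounding and floor the runtime applies are not asserted here; assumes imports "math", "strconv", "strings"):

func effectiveMaxProcs(cpuMax string, numCPU int) int {
	fields := strings.Fields(cpuMax) // e.g. "200000 100000" or "max 100000"
	if len(fields) != 2 || fields[0] == "max" {
		return numCPU // no bandwidth limit configured
	}
	quota, err1 := strconv.ParseFloat(fields[0], 64)
	period, err2 := strconv.ParseFloat(fields[1], 64)
	if err1 != nil || err2 != nil || period <= 0 {
		return numCPU
	}
	limit := int(math.Ceil(quota / period))
	if limit < 1 {
		limit = 1
	}
	if limit < numCPU {
		return limit // the lower cgroup limit takes precedence
	}
	return numCPU
}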
|
|
|
customGOMAXPROCS bool // GOMAXPROCS was manually set from the environment or runtime.GOMAXPROCS
|
|
|
|
|
|
2020-05-19 16:33:17 +00:00
|
|
|
// sysmonlock protects sysmon's actions on the runtime.
|
|
|
|
|
//
|
|
|
|
|
// Acquire and hold this mutex to block sysmon from interacting
|
|
|
|
|
// with the rest of the runtime.
|
|
|
|
|
sysmonlock mutex
|
2020-08-07 16:28:35 +00:00
|
|
|
|
|
|
|
|
// timeToRun is a distribution of scheduling latencies, defined
|
|
|
|
|
// as the sum of time a G spends in the _Grunnable state before
|
|
|
|
|
// it transitions to _Grunning.
|
|
|
|
|
timeToRun timeHistogram
|
2022-05-06 20:11:28 +00:00
|
|
|
|
|
|
|
|
// idleTime is the total CPU time Ps have "spent" idle.
|
|
|
|
|
//
|
|
|
|
|
// Reset on each GC cycle.
|
|
|
|
|
idleTime atomic.Int64
|
2022-08-31 21:34:23 +00:00
|
|
|
|
|
|
|
|
// totalMutexWaitTime is the sum of time goroutines have spent in _Gwaiting
|
|
|
|
|
// with a waitreason of the form waitReasonSync{RW,}Mutex{R,}Lock.
|
|
|
|
|
totalMutexWaitTime atomic.Int64
|
runtime/metrics: add STW stopping and total time metrics
This CL adds four new time histogram metrics:
/sched/pauses/stopping/gc:seconds
/sched/pauses/stopping/other:seconds
/sched/pauses/total/gc:seconds
/sched/pauses/total/other:seconds
The "stopping" metrics measure the time taken to start a stop-the-world
pause. i.e., how long it takes stopTheWorldWithSema to stop all Ps.
This can be used to detect STW struggling to preempt Ps.
The "total" metrics measure the total duration of a stop-the-world
pause, from starting to stop-the-world until the world is started again.
This includes the time spent in the "start" phase.
The "gc" metrics are used for GC-related STW pauses. The "other" metrics
are used for all other STW pauses.
All of these metrics start timing in stopTheWorldWithSema only after
successfully acquiring sched.lock, thus excluding lock contention on
sched.lock. The reasoning behind this is that while waiting on
sched.lock the world is not stopped at all (all other Ps can run), so
the impact of this contention is primarily limited to the goroutine
attempting to stop-the-world. Additionally, we already have some
visibility into sched.lock contention via contention profiles (#57071).
/sched/pauses/total/gc:seconds is conceptually equivalent to
/gc/pauses:seconds, so the latter is marked as deprecated and returns
the same histogram as the former.
In the implementation, there are a few minor differences:
* For both mark and sweep termination stops, /gc/pauses:seconds started
timing prior to calling startTheWorldWithSema, thus including lock
contention.
These details are minor enough that I do not believe the slight change
in reporting will matter. For mark termination stops, moving timing stop
into startTheWorldWithSema does have the side effect of requiring moving
other GC metric calculations outside of the STW, as they depend on the
same end time.
Fixes #63340
Change-Id: Iacd0bab11bedab85d3dcfb982361413a7d9c0d05
Reviewed-on: https://go-review.googlesource.com/c/go/+/534161
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Auto-Submit: Michael Pratt <mpratt@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2023-10-10 15:28:32 -04:00
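The distinction between the two measurements can be sketched as follows (measureSTW is a hypothetical helper; stop, work, and start stand in for the phases of stopTheWorldWithSema and startTheWorldWithSema; assumes import "time"):

// stopping covers only the time needed to preempt all Ps; total runs until
// the world has been started again, so it is always at least as large.
func measureSTW(stop, work, start func()) (stopping, total time.Duration) {
	t0 := time.Now()
	stop() // all Ps stopped here
	stopping = time.Since(t0)
	work() // whatever the STW was for, e.g. mark termination
	start()
	total = time.Since(t0)
	return stopping, total
}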
|
|
|
|
|
|
|
|
// stwStoppingTimeGC/Other are distributions of stop-the-world stopping
|
|
|
|
|
// latencies, defined as the time taken by stopTheWorldWithSema to get
|
|
|
|
|
// all Ps to stop. stwStoppingTimeGC covers all GC-related STWs,
|
|
|
|
|
// stwStoppingTimeOther covers the others.
|
|
|
|
|
stwStoppingTimeGC timeHistogram
|
|
|
|
|
stwStoppingTimeOther timeHistogram
|
|
|
|
|
|
|
|
|
|
// stwTotalTimeGC/Other are distributions of stop-the-world total
|
|
|
|
|
// latencies, defined as the total time from stopTheWorldWithSema to
|
|
|
|
|
// startTheWorldWithSema. This is a superset of
|
|
|
|
|
// stwStoppingTimeGC/Other. stwTotalTimeGC covers all GC-related STWs,
|
|
|
|
|
// stwTotalTimeOther covers the others.
|
|
|
|
|
stwTotalTimeGC timeHistogram
|
|
|
|
|
stwTotalTimeOther timeHistogram
|
2023-11-21 16:03:54 +00:00
|
|
|
|
|
|
|
|
// totalRuntimeLockWaitTime (plus the value of lockWaitTime on each M in
|
|
|
|
|
// allm) is the sum of time goroutines have spent in _Grunnable and with an
|
|
|
|
|
// M, but waiting for locks within the runtime. This field stores the value
|
|
|
|
|
// for Ms that have exited.
|
|
|
|
|
totalRuntimeLockWaitTime atomic.Int64
|
2025-07-23 18:41:56 +00:00
|
|
|
|
|
|
|
|
// goroutinesCreated (plus the value of goroutinesCreated on each P in allp)
|
|
|
|
|
// is the sum of all goroutines created by the program.
|
|
|
|
|
goroutinesCreated atomic.Uint64
|
2014-11-11 17:05:19 -05:00
|
|
|
}
|
|
|
|
|
|
2017-08-25 13:18:30 -07:00
|
|
|
// Values for the flags field of a sigTabT.
|
2014-11-11 17:05:19 -05:00
|
|
|
const (
|
runtime: don't always unblock all signals
Ian proposed an improved way of handling signals masks in Go, motivated
by a problem where the Android java runtime expects certain signals to
be blocked for all JVM threads. Discussion here
https://groups.google.com/forum/#!topic/golang-dev/_TSCkQHJt6g
Ian's text is used in the following:
A Go program always needs to have the synchronous signals enabled.
These are the signals for which _SigPanic is set in sigtable, namely
SIGSEGV, SIGBUS, SIGFPE.
A Go program that uses the os/signal package, and calls signal.Notify,
needs to have at least one thread which is not blocking that signal,
but it doesn't matter much which one.
Unix programs do not change signal mask across execve. They inherit
signal masks across fork. The shell uses this fact to some extent;
for example, the job control signals (SIGTTIN, SIGTTOU, SIGTSTP) are
blocked for commands run due to backquote quoting or $().
Our current position on signal masks was not thought out. We wandered
into step by step, e.g., http://golang.org/cl/7323067 .
This CL does the following:
Introduce a new platform hook, msigsave, that saves the signal mask of
the current thread to m.sigsave.
Call msigsave from needm and newm.
In minit, set up the signal mask from m.sigsave and unblock the
essential synchronous signals, and SIGILL, SIGTRAP, SIGPROF, SIGSTKFLT
(for systems that have it).
In unminit, restore the signal mask from m.sigsave.
The first time that os/signal.Notify is called, start a new thread whose
only purpose is to update its signal mask to make sure signals for
signal.Notify are unblocked on at least one thread.
The effect on Go programs will be that if they are invoked with some
non-synchronous signals blocked, those signals will normally be
ignored. Previously, those signals would mostly be ignored. A change
in behaviour will occur for programs started with any of these signals
blocked, if they receive the signal: SIGHUP, SIGINT, SIGQUIT, SIGABRT,
SIGTERM. Previously those signals would always cause a crash (unless
using the os/signal package); with this change, they will be ignored
if the program is started with the signal blocked (and does not use
the os/signal package).
./all.bash completes successfully on linux/amd64.
OpenBSD is missing the implementation.
Change-Id: I188098ba7eb85eae4c14861269cc466f2aa40e8c
Reviewed-on: https://go-review.googlesource.com/10173
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2015-05-18 11:00:24 +02:00
|
|
|
_SigNotify = 1 << iota // let signal.Notify have signal, even if from kernel
|
|
|
|
|
_SigKill // if signal.Notify doesn't take it, exit quietly
|
|
|
|
|
_SigThrow // if signal.Notify doesn't take it, exit loudly
|
|
|
|
|
_SigPanic // if the signal is from the kernel, panic
|
|
|
|
|
_SigDefault // if the signal isn't explicitly requested, don't monitor it
|
|
|
|
|
_SigGoExit // cause all runtime procs to exit (only used on Plan 9).
|
runtime, syscall: reimplement AllThreadsSyscall using only signals.
In issue 50113, we see that a thread blocked in a system call can result
in a hang of AllThreadsSyscall. To resolve this, we must send a signal
to these threads to knock them out of the system call long enough to run
the per-thread syscall.
Stepping back, if we need to send signals anyway, it should be possible
to implement this entire mechanism on top of signals. This CL does so,
vastly simplifying the mechanism, both as a direct result of
newly-unnecessary code as well as some ancillary simplifications to make
things simpler to follow.
Major changes:
* The rest of the mechanism is moved to os_linux.go, with fields in mOS
instead of m itself.
* 'Fixup' fields and functions are renamed to 'perThreadSyscall' so they
are more precise about their purpose.
* Rather than getting passed a closure, doAllThreadsSyscall takes the
syscall number and arguments. This avoids a lot of hairy behavior:
* The closure may potentially only be live in fields in the M,
hidden from the GC. Not necessary with no closure.
* The need to loan out the race context. A direct RawSyscall6 call
does not require any race context.
* The closure previously conditionally panicked in strange
locations, like a signal handler. Now we simply throw.
* All manual fixup synchronization with mPark, sysmon, templateThread,
sigqueue, etc is gone. The core approach is much simpler:
doAllThreadsSyscall sends a signal to every thread in allm, which
executes the system call from the signal handler. We use (SIGRTMIN +
1), aka SIGSETXID, the same signal used by glibc for this purpose. As
such, we are careful to only handle this signal on non-cgo binaries.
Synchronization with thread creation is a key part of this CL. The
comment near the top of doAllThreadsSyscall describes the required
synchronization semantics and how they are achieved.
Note that current use of allocmLock protects the state mutations of allm
that are also protected by sched.lock. allocmLock is used instead of
sched.lock simply to avoid holding sched.lock for so long.
Fixes #50113
Change-Id: Ic7ea856dc66cf711731540a54996e08fc986ce84
Reviewed-on: https://go-review.googlesource.com/c/go/+/383434
Reviewed-by: Austin Clements <austin@google.com>
Trust: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-02-04 17:15:28 -05:00
|
|
|
_SigSetStack // Don't explicitly install handler, but add SA_ONSTACK to existing libc handler
|
2017-11-22 19:12:12 -08:00
|
|
|
_SigUnblock // always unblock; see blockableSig
|
2017-08-19 16:59:19 +02:00
|
|
|
_SigIgn // _SIG_DFL action is to ignore the signal
|
2014-11-11 17:05:19 -05:00
|
|
|
)
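A small illustration of how these flag bits are meant to combine when deciding what to do with a delivered signal (hedged; describeDisposition is a hypothetical helper, not the runtime's signal-handler logic):

func describeDisposition(flags int32) string {
	switch {
	case flags&_SigPanic != 0:
		return "kernel-delivered signal becomes a Go panic"
	case flags&_SigKill != 0:
		return "exit quietly unless signal.Notify took it"
	case flags&_SigThrow != 0:
		return "exit loudly unless signal.Notify took it"
	case flags&_SigIgn != 0:
		return "default action is to ignore the signal"
	}
	return "default handling"
}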
|
|
|
|
|
|
|
|
|
|
// Layout of in-memory per-function information prepared by linker
|
2015-07-10 17:17:11 -06:00
|
|
|
// See https://golang.org/s/go12symtab.
|
2016-12-14 13:24:21 -05:00
|
|
|
// Keep in sync with linker (../cmd/link/internal/ld/pcln.go:/pclntab)
|
2014-11-11 17:05:19 -05:00
|
|
|
// and with package debug/gosym and with symtab.go in package runtime.
|
|
|
|
|
type _func struct {
|
runtime: implement traceback iterator
Currently, all stack walking logic is in one venerable, large, and
very, very complicated function: runtime.gentraceback. This function
has three distinct operating modes: printing, populating a PC buffer,
or invoking a callback. And it has three different modes of unwinding:
physical Go frames, inlined Go frames, and cgo frames. It also has
several flags. All of this logic is very interwoven.
This CL reimplements the monolithic gentraceback function as an
"unwinder" type with an iterator API. It moves all of the logic for
stack walking into this new type, and gentraceback is now a
much-simplified wrapper around the new unwinder type that still
implements printing, populating a PC buffer, and invoking a callback.
Follow-up CLs will replace uses of gentraceback with direct uses of
unwinder.
Exposing traceback functionality as an iterator API will enable a lot
of follow-up work such as simplifying the open-coded defer
implementation (which should in turn help with #26813 and #37233),
printing the bottom of deep stacks (#7181), and eliminating the small
limit on CPU stacks in profiles (#56029).
Fixes #54466.
Change-Id: I36e046dc423c9429c4f286d47162af61aff49a0d
Reviewed-on: https://go-review.googlesource.com/c/go/+/458218
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
2022-07-14 12:22:24 -04:00
|
|
|
sys.NotInHeap // Only in static data
|
|
|
|
|
|
2025-11-02 20:04:57 -08:00
|
|
|
entryOff uint32 // start pc, as offset from moduledata.text
|
2022-09-06 17:57:07 -04:00
|
|
|
nameOff int32 // function name, as index into moduledata.funcnametab.
|
2014-11-11 17:05:19 -05:00
|
|
|
|
2018-09-11 15:14:28 -07:00
|
|
|
args int32 // in/out args size
|
cmd/compile, cmd/link, runtime: make defers low-cost through inline code and extra funcdata
Generate inline code at defer time to save the args of defer calls to unique
(autotmp) stack slots, and generate inline code at exit time to check which defer
calls were made and make the associated function/method/interface calls. We
remember that a particular defer statement was reached by storing in the deferBits
variable (always stored on the stack). At exit time, we check the bits of the
deferBits variable to determine which defer function calls to make (in reverse
order). These low-cost defers are only used for functions where no defers
appear in loops. In addition, we don't do these low-cost defers if there are too
many defer statements or too many exits in a function (to limit code increase).
When a function uses open-coded defers, we produce extra
FUNCDATA_OpenCodedDeferInfo information that specifies the number of defers, and
for each defer, the stack slots where the closure and associated args have been
stored. The funcdata also includes the location of the deferBits variable.
Therefore, for panics, we can use this funcdata to determine exactly which defers
are active, and call the appropriate functions/methods/closures with the correct
arguments for each active defer.
In order to unwind the stack correctly after a recover(), we need to add an extra
code segment to functions with open-coded defers that simply calls deferreturn()
and returns. This segment is not reachable by the normal function, but is returned
to by the runtime during recovery. We set the liveness information of this
deferreturn() to be the same as the liveness at the first function call during the
last defer exit code (so all return values and all stack slots needed by the defer
calls will be live).
I needed to increase the stackguard constant from 880 to 896, because of a small
amount of new code in deferreturn().
The -N flag disables open-coded defers. '-d defer' prints out the kind of defer
being used at each defer statement (heap-allocated, stack-allocated, or
open-coded).
Cost of defer statement [ go test -run NONE -bench BenchmarkDefer$ runtime ]
With normal (stack-allocated) defers only: 35.4 ns/op
With open-coded defers: 5.6 ns/op
Cost of function call alone (remove defer keyword): 4.4 ns/op
Text size increase (including funcdata) for go binary without/with open-coded defers: 0.09%
The average size increase (including funcdata) for only the functions that use
open-coded defers is 1.1%.
The cost of a panic followed by a recover got noticeably slower, since panic
processing now requires a scan of the stack for open-coded defer frames. This scan
is required, even if no frames are using open-coded defers:
Cost of panic and recover [ go test -run NONE -bench BenchmarkPanicRecover runtime ]
Without open-coded defers: 62.0 ns/op
With open-coded defers: 255 ns/op
A CGO Go-to-C-to-Go benchmark got noticeably faster because of open-coded defers:
CGO Go-to-C-to-Go benchmark [cd misc/cgo/test; go test -run NONE -bench BenchmarkCGoCallback ]
Without open-coded defers: 443 ns/op
With open-coded defers: 347 ns/op
Updates #14939 (defer performance)
Updates #34481 (design doc)
Change-Id: I63b1a60d1ebf28126f55ee9fd7ecffe9cb23d1ff
Reviewed-on: https://go-review.googlesource.com/c/go/+/202340
Reviewed-by: Austin Clements <austin@google.com>
2019-06-24 12:59:22 -07:00
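A hedged sketch of the exit-path check described above, written as ordinary Go rather than the SSA the compiler actually emits (deferBits and deferredCalls are stand-ins for the autotmp stack slots):

var (
	deferBits     uint8     // bit i set means defer statement i was reached
	deferredCalls [8]func() // closures saved to fixed stack slots at defer time
)

// runOpenCodedDefers makes the recorded calls in reverse order, clearing each
// bit before the call so a panic inside a deferred call sees consistent state.
func runOpenCodedDefers() {
	for i := len(deferredCalls) - 1; i >= 0; i-- {
		if deferBits&(1<<uint(i)) != 0 {
			deferBits &^= 1 << uint(i)
			deferredCalls[i]()
		}
	}
}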
|
|
|
deferreturn uint32 // offset of start of a deferreturn call instruction from entry, if any.
|
2014-11-11 17:05:19 -05:00
|
|
|
|
2020-08-12 19:26:53 -04:00
|
|
|
pcsp uint32
|
|
|
|
|
pcfile uint32
|
|
|
|
|
pcln uint32
|
|
|
|
|
npcdata uint32
|
2023-04-17 15:43:29 -04:00
|
|
|
cuOffset uint32 // runtime.cutab offset of this function's CU
|
|
|
|
|
startLine int32 // line number of start of function (func keyword/TEXT directive)
|
|
|
|
|
funcID abi.FuncID // set for certain special runtime functions
|
2023-04-17 15:58:47 -04:00
|
|
|
flag abi.FuncFlag
|
cmd/asm, cmd/link, runtime: introduce FuncInfo flag bits
The runtime traceback code has its own definition of which functions
mark the top frame of a stack, separate from the TOPFRAME bits that
exist in the assembly and are passed along in DWARF information.
It's error-prone and redundant to have two different sources of truth.
This CL provides the actual TOPFRAME bits to the runtime, so that
the runtime can use those bits instead of reinventing its own category.
This CL also adds a new bit, SPWRITE, which marks functions that
write directly to SP (anything but adding and subtracting constants).
Such functions must stop a traceback, because the traceback has no
way to rederive the SP on entry. Again, the runtime has its own definition
which is mostly correct, but also missing some functions. During ordinary
goroutine context switches, such functions do not appear on the stack,
so the incompleteness in the runtime usually doesn't matter.
But profiling signals can arrive at any moment, and the runtime may
crash during traceback if it attempts to unwind an SP-writing frame
and gets out-of-sync with the actual stack. The runtime contains code
to try to detect likely candidates but again it is incomplete.
Deriving the SPWRITE bit automatically from the actual assembly code
provides the complete truth, and passing it to the runtime lets the
runtime use it.
This CL is part of a stack adding windows/arm64
support (#36439), intended to land in the Go 1.17 cycle.
This CL is, however, not windows/arm64-specific.
It is cleanup meant to make the port (and future ports) easier.
Change-Id: I227f53b23ac5b3dabfcc5e8ee3f00df4e113cf58
Reviewed-on: https://go-review.googlesource.com/c/go/+/288800
Trust: Russ Cox <rsc@golang.org>
Trust: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com>
2021-01-28 15:21:33 -05:00
|
|
|
_ [1]byte // pad
|
|
|
|
|
nfuncdata uint8 // must be last, must end on a uint32-aligned boundary
|
2022-09-02 10:10:54 -04:00
|
|
|
|
|
|
|
|
// The end of the struct is followed immediately by two variable-length
|
|
|
|
|
// arrays that reference the pcdata and funcdata locations for this
|
|
|
|
|
// function.
|
|
|
|
|
|
|
|
|
|
// pcdata contains the offset into moduledata.pctab for the start of
|
|
|
|
|
// that index's table. e.g.,
|
|
|
|
|
// &moduledata.pctab[_func.pcdata[_PCDATA_UnsafePoint]] is the start of
|
|
|
|
|
// the unsafe point table.
|
|
|
|
|
//
|
|
|
|
|
// An offset of 0 indicates that there is no table.
|
|
|
|
|
//
|
|
|
|
|
// pcdata [npcdata]uint32
|
|
|
|
|
|
|
|
|
|
// funcdata contains the offset past moduledata.gofunc which contains a
|
|
|
|
|
// pointer to that index's funcdata. e.g.,
|
|
|
|
|
// *(moduledata.gofunc + _func.funcdata[_FUNCDATA_ArgsPointerMaps]) is
|
|
|
|
|
// the argument pointer map.
|
|
|
|
|
//
|
|
|
|
|
// An offset of ^uint32(0) indicates that there is no entry.
|
|
|
|
|
//
|
|
|
|
|
// funcdata [nfuncdata]uint32
|
2014-11-11 17:05:19 -05:00
|
|
|
}
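The trailing-array layout documented above amounts to a little pointer arithmetic (hedged sketch; pcdataOffsetSketch is a hypothetical helper rather than the runtime's accessor, and it assumes unsafe is imported):

// The pcdata array of uint32 offsets starts immediately after the fixed-size
// part of _func, which ends on a uint32-aligned boundary.
func pcdataOffsetSketch(f *_func, table uint32) uint32 {
	if table >= f.npcdata {
		return 0 // no table
	}
	base := unsafe.Add(unsafe.Pointer(f), unsafe.Sizeof(_func{}))
	return *(*uint32)(unsafe.Add(base, uintptr(table)*4))
}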
|
|
|
|
|
|
2019-01-05 14:31:23 -08:00
|
|
|
// Pseudo-Func that is returned for PCs that occur in inlined code.
|
|
|
|
|
// A *Func can be either a *_func or a *funcinl, and they are distinguished
|
|
|
|
|
// by the first uintptr.
|
2023-02-05 15:54:33 -05:00
|
|
|
//
|
|
|
|
|
// TODO(austin): Can we merge this with inlinedCall?
|
2019-01-05 14:31:23 -08:00
|
|
|
type funcinl struct {
|
2022-09-07 13:23:19 -04:00
|
|
|
ones uint32 // set to ^0 to distinguish from _func
|
|
|
|
|
entry uintptr // entry of the real (the "outermost") frame
|
|
|
|
|
name string
|
|
|
|
|
file string
|
|
|
|
|
line int32
|
|
|
|
|
startLine int32
|
2019-01-05 14:31:23 -08:00
|
|
|
}
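A sketch of how the two representations are told apart (isInlinedSketch is a hypothetical helper; the runtime's actual check lives in its symbol-table code, and unsafe is assumed to be imported):

// Both layouts begin with a uint32: funcinl.ones is always ^uint32(0),
// a value chosen so it cannot collide with a real _func.entryOff.
func isInlinedSketch(fn unsafe.Pointer) bool {
	return *(*uint32)(fn) == ^uint32(0)
}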
|
|
|
|
|
|
2023-12-12 20:40:33 -08:00
|
|
|
type itab = abi.ITab
|
2014-11-11 17:05:19 -05:00
|
|
|
|
|
|
|
|
// Lock-free stack node.
|
2019-05-17 04:26:16 +00:00
|
|
|
// Also known to export_test.go.
|
2014-11-11 17:05:19 -05:00
|
|
|
type lfnode struct {
|
2014-11-15 08:00:38 -05:00
|
|
|
next uint64
|
2014-11-11 17:05:19 -05:00
|
|
|
pushcnt uintptr
|
|
|
|
|
}
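A hedged sketch of the push side of the lock-free stack these nodes form. The real lfstack packs the node pointer and pushcnt into a single uint64 to defeat ABA on pop; this sketch keeps an ordinary pointer, shows only push, and uses sync/atomic's generic atomic.Pointer rather than the runtime's internal atomic package.

type lfstackSketch struct {
	head atomic.Pointer[nodeSketch]
}

type nodeSketch struct {
	next atomic.Pointer[nodeSketch]
}

// push links the node ahead of the current head and publishes it with a CAS,
// retrying if another pusher won the race.
func (s *lfstackSketch) push(n *nodeSketch) {
	for {
		old := s.head.Load()
		n.next.Store(old)
		if s.head.CompareAndSwap(old, n) {
			return
		}
	}
}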
|
|
|
|
|
|
|
|
|
|
type forcegcstate struct {
|
|
|
|
|
lock mutex
|
|
|
|
|
g *g
|
2022-08-25 03:03:35 +08:00
|
|
|
idle atomic.Bool
|
2014-11-11 17:05:19 -05:00
|
|
|
}
|
|
|
|
|
|
2017-09-13 15:53:47 -07:00
|
|
|
// A _defer holds an entry on the list of deferred calls.
|
2022-02-01 19:32:08 -08:00
|
|
|
// If you add a field here, add code to clear it in deferProcStack.
|
2021-06-07 18:18:00 -04:00
|
|
|
// This struct must match the code in cmd/compile/internal/ssagen/ssa.go:deferstruct
|
|
|
|
|
// and cmd/compile/internal/ssagen/ssa.go:(*state).call.
|
2019-06-08 17:20:57 +00:00
|
|
|
// Some defers will be allocated on the stack and some on the heap.
|
|
|
|
|
// All defers are logically part of the stack, so write barriers to
|
|
|
|
|
// initialize them are not required. All defers must be manually scanned,
|
|
|
|
|
// and for heap defers, marked.
|
2014-11-11 17:05:19 -05:00
|
|
|
type _defer struct {
|
2023-06-30 16:18:44 -04:00
|
|
|
heap bool
|
|
|
|
|
rangefunc bool // true for rangefunc list
|
|
|
|
|
sp uintptr // sp at time of defer
|
2025-11-05 18:05:42 -08:00
|
|
|
pc uintptr // pc at time of defer
|
2023-06-30 16:18:44 -04:00
|
|
|
fn func() // can be nil for open-coded defers
|
|
|
|
|
link *_defer // next defer on G; can point to either heap or stack!
|
|
|
|
|
|
|
|
|
|
// If rangefunc is true, *head is the head of the atomic linked list
|
|
|
|
|
// during a range-over-func execution.
|
|
|
|
|
head *atomic.Pointer[_defer]
|
2014-11-11 17:05:19 -05:00
|
|
|
}
|
|
|
|
|
|
2018-03-08 17:48:22 -05:00
|
|
|
// A _panic holds information about an active panic.
|
|
|
|
|
//
|
2020-08-21 20:20:12 -07:00
|
|
|
// A _panic value must only ever live on the stack.
|
2018-03-08 17:48:22 -05:00
|
|
|
//
|
2025-07-01 15:00:13 -07:00
|
|
|
// The gopanicFP and link fields are stack pointers, but don't need special
|
2018-03-08 17:48:22 -05:00
|
|
|
// handling during stack growth: because they are pointer-typed and
|
|
|
|
|
// _panic values only live on the stack, regular stack pointer
|
|
|
|
|
// adjustment takes care of them.
|
2014-11-11 17:05:19 -05:00
|
|
|
type _panic struct {
|
2025-07-01 15:00:13 -07:00
|
|
|
arg any // argument to panic
|
|
|
|
|
link *_panic // link to earlier panic
|
2023-07-27 16:20:36 -07:00
|
|
|
|
|
|
|
|
// startPC and startSP track where _panic.start was called.
|
|
|
|
|
startPC uintptr
|
|
|
|
|
startSP unsafe.Pointer
|
|
|
|
|
|
|
|
|
|
// The current stack frame that we're running deferred calls for.
|
2023-08-04 14:10:59 -07:00
|
|
|
sp unsafe.Pointer
|
|
|
|
|
lr uintptr
|
|
|
|
|
fp unsafe.Pointer
|
2023-07-27 16:20:36 -07:00
|
|
|
|
|
|
|
|
// retpc stores the PC where the panic should jump back to, if the
|
|
|
|
|
// function last returned by _panic.next() recovers the panic.
|
|
|
|
|
retpc uintptr
|
|
|
|
|
|
|
|
|
|
// Extra state for handling open-coded defers.
|
2023-08-04 14:10:59 -07:00
|
|
|
deferBitsPtr *uint8
|
|
|
|
|
slotsPtr unsafe.Pointer
|
2023-07-27 16:20:36 -07:00
|
|
|
|
|
|
|
|
recovered bool // whether this panic has been recovered
|
2025-05-05 12:33:46 -04:00
|
|
|
repanicked bool // whether this panic repanicked
|
2023-07-27 16:20:36 -07:00
|
|
|
goexit bool
|
|
|
|
|
deferreturn bool
|
2025-07-01 14:45:45 -07:00
|
|
|
|
|
|
|
|
gopanicFP unsafe.Pointer // frame pointer of the gopanic frame
|
2014-11-11 17:05:19 -05:00
|
|
|
}
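A trivial illustration of the link field described above: each nested panic on a goroutine points at the previous one, so the active panics can be walked outward from the most recent (sketch only; similar walks appear in the runtime's panic-printing code).

func countActivePanics(p *_panic) int {
	n := 0
	for ; p != nil; p = p.link {
		n++
	}
	return n
}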
|
|
|
|
|
|
2023-08-03 20:29:47 -07:00
|
|
|
// savedOpenDeferState tracks the extra state from _panic that's
|
|
|
|
|
// necessary for deferreturn to pick up where gopanic left off,
|
|
|
|
|
// without needing to unwind the stack.
|
|
|
|
|
type savedOpenDeferState struct {
|
|
|
|
|
retpc uintptr
|
|
|
|
|
deferBitsOffset uintptr
|
|
|
|
|
slotsOffset uintptr
|
|
|
|
|
}
|
|
|
|
|
|
2018-04-03 21:35:46 -04:00
|
|
|
// ancestorInfo records details of where a goroutine was started.
|
|
|
|
|
type ancestorInfo struct {
|
|
|
|
|
pcs []uintptr // pcs from the stack of this goroutine
|
2022-07-19 13:49:33 -04:00
|
|
|
goid uint64 // goroutine id of this goroutine; original goroutine possibly dead
|
2018-04-03 21:35:46 -04:00
|
|
|
gopc uintptr // pc of go statement that created this goroutine
|
|
|
|
|
}
|
|
|
|
|
|
2018-03-06 21:28:24 -08:00
|
|
|
// A waitReason explains why a goroutine has been stopped.
|
|
|
|
|
// See gopark. Do not re-use waitReasons, add new ones.
|
|
|
|
|
type waitReason uint8
|
|
|
|
|
|
|
|
|
|
const (
|
|
|
|
|
waitReasonZero waitReason = iota // ""
|
|
|
|
|
waitReasonGCAssistMarking // "GC assist marking"
|
|
|
|
|
waitReasonIOWait // "IO wait"
|
|
|
|
|
waitReasonDumpingHeap // "dumping heap"
|
|
|
|
|
waitReasonGarbageCollection // "garbage collection"
|
|
|
|
|
waitReasonGarbageCollectionScan // "garbage collection scan"
|
|
|
|
|
waitReasonPanicWait // "panicwait"
|
|
|
|
|
waitReasonGCAssistWait // "GC assist wait"
|
|
|
|
|
waitReasonGCSweepWait // "GC sweep wait"
|
2018-10-17 23:29:42 +00:00
|
|
|
waitReasonGCScavengeWait // "GC scavenge wait"
|
2018-03-06 21:28:24 -08:00
|
|
|
waitReasonFinalizerWait // "finalizer wait"
|
2020-05-05 01:43:57 +00:00
|
|
|
waitReasonForceGCIdle // "force gc (idle)"
|
2025-05-05 13:44:26 -04:00
|
|
|
waitReasonUpdateGOMAXPROCSIdle // "GOMAXPROCS updater (idle)"
|
2018-03-06 21:28:24 -08:00
|
|
|
waitReasonSemacquire // "semacquire"
|
|
|
|
|
waitReasonSleep // "sleep"
|
2025-10-02 11:57:58 +00:00
|
|
|
waitReasonChanReceiveNilChan // "chan receive (nil chan)"
|
|
|
|
|
waitReasonChanSendNilChan // "chan send (nil chan)"
|
|
|
|
|
waitReasonSelectNoCases // "select (no cases)"
|
|
|
|
|
waitReasonSelect // "select"
|
|
|
|
|
waitReasonChanReceive // "chan receive"
|
|
|
|
|
waitReasonChanSend // "chan send"
|
2018-03-06 21:28:24 -08:00
|
|
|
waitReasonSyncCondWait // "sync.Cond.Wait"
|
2022-08-30 22:18:01 +00:00
|
|
|
waitReasonSyncMutexLock // "sync.Mutex.Lock"
|
|
|
|
|
waitReasonSyncRWMutexRLock // "sync.RWMutex.RLock"
|
|
|
|
|
waitReasonSyncRWMutexLock // "sync.RWMutex.Lock"
|
2024-06-11 11:02:18 -07:00
|
|
|
waitReasonSyncWaitGroupWait // "sync.WaitGroup.Wait"
|
2018-03-06 21:28:24 -08:00
|
|
|
waitReasonTraceReaderBlocked // "trace reader (blocked)"
|
|
|
|
|
waitReasonWaitForGCCycle // "wait for GC cycle"
|
|
|
|
|
waitReasonGCWorkerIdle // "GC worker (idle)"
|
runtime: set G wait reason more consistently
Currently, wait reasons are set somewhat inconsistently. In a follow-up
CL, we're going to want to rely on the wait reason being there for
casgstatus, so the status quo isn't really going to work for that. Plus
this inconsistency means there are a whole bunch of cases where we could
be more specific about the G's status but aren't.
So, this change adds a new function, casGToWaiting which is like
casgstatus but also sets the wait reason. The goal is that by using this
API it'll be harder to forget to set a wait reason (or the lack thereof
will at least be explicit). This change then updates all casgstatus(gp,
..., _Gwaiting) calls to casGToWaiting(gp, ..., waitReasonX) instead.
For a number of these cases, we're missing a wait reason, and it
wouldn't hurt to add a wait reason for them, so this change also adds
those wait reasons.
For #49881.
Change-Id: Ia95e06ecb74ed17bb7bb94f1a362ebfe6bec1518
Reviewed-on: https://go-review.googlesource.com/c/go/+/427617
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-08-31 18:21:48 +00:00
|
|
|
waitReasonGCWorkerActive // "GC worker (active)"
|
2019-09-27 12:27:51 -04:00
|
|
|
waitReasonPreempted // "preempted"
|
2020-04-15 15:38:00 -04:00
|
|
|
waitReasonDebugCall // "debug call"
|
2022-08-31 18:21:48 +00:00
|
|
|
waitReasonGCMarkTermination // "GC mark termination"
|
|
|
|
|
waitReasonStoppingTheWorld // "stopping the world"
|
2023-10-06 15:07:28 +00:00
|
|
|
waitReasonFlushProcCaches // "flushing proc caches"
|
2023-05-08 22:29:52 +00:00
|
|
|
waitReasonTraceGoroutineStatus // "trace goroutine status"
|
|
|
|
|
waitReasonTraceProcStatus // "trace proc status"
|
2023-12-19 09:35:32 -08:00
|
|
|
waitReasonPageTraceFlush // "page trace flush"
|
2023-11-20 11:22:48 +11:00
|
|
|
waitReasonCoroutine // "coroutine"
|
runtime: prevent weak->strong conversions during mark termination
Currently it's possible for weak->strong conversions to create more GC
work during mark termination. When a weak->strong conversion happens
during the mark phase, we need to mark the newly-strong pointer, since
it may now be the only pointer to that object. In other words, the
object could be white.
But queueing new white objects creates GC work, and if this happens
during mark termination, we could end up violating mark termination
invariants. In the parlance of the mark termination algorithm, the
weak->strong conversion is a non-monotonic source of GC work, unlike the
write barriers (which will eventually only see black objects).
This change fixes the problem by forcing weak->strong conversions to
block during mark termination. We can do this efficiently by setting a
global flag before the ragged barrier that is checked at each
weak->strong conversion. If the flag is set, then the conversions block.
The ragged barrier ensures that all Ps have observed the flag and that
any weak->strong conversions which completed before the ragged barrier
have their newly-minted strong pointers visible in GC work queues if
necessary. We later unset the flag and wake all the blocked goroutines
during the mark termination STW.
There are a few subtleties that we need to account for. For one, it's
possible that a goroutine which blocked in a weak->strong conversion
wakes up only to find it's mark termination time again, so we need to
recheck the global flag on wake. We should also stay non-preemptible
while performing the check, so that if the check *does* appear as true,
it cannot switch back to false while we're actively trying to block. If
it switches to false while we try to block, then we'll be stuck in the
queue until the following GC.
All-in-all, this CL is more complicated than I would have liked, but
it's the only idea so far that is clearly correct to me at a high level.
This change adds a test which is somewhat invasive as it manipulates
mark termination, but hopefully that infrastructure will be useful for
debugging, fixing, and regression testing mark termination whenever we
do fix it.
Fixes #69803.
Change-Id: Ie314e6fd357c9e2a07a9be21f217f75f7aba8c4a
Reviewed-on: https://go-review.googlesource.com/c/go/+/623615
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-11-01 21:54:07 +00:00
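The gate described above boils down to an atomic flag checked before every weak->strong conversion. A hedged, simplified sketch (the runtime parks the blocked goroutine on a wait list and relies on the ragged barrier, none of which is shown here; assumes imports "sync/atomic" and "unsafe"):

var strongFromWeakBlocked atomic.Bool // set before the ragged barrier, cleared during the mark termination STW

// weakToStrongSketch delays the conversion while mark termination is in
// progress so that no new GC work is queued late. The real runtime blocks
// rather than spinning, and rechecks the flag on wake.
func weakToStrongSketch(convert func() unsafe.Pointer) unsafe.Pointer {
	for strongFromWeakBlocked.Load() {
		// busy-wait stands in for parking the goroutine
	}
	return convert()
}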
|
|
|
waitReasonGCWeakToStrongWait // "GC weak to strong wait"
|
2024-06-11 11:02:18 -07:00
|
|
|
waitReasonSynctestRun // "synctest.Run"
|
|
|
|
|
waitReasonSynctestWait // "synctest.Wait"
|
runtime: clarify stack traces for bubbled goroutines
Use the synctest bubble ID to identify bubbles in traces,
rather than the goroutine ID of the bubble's root goroutine.
Some waitReasons include a "(synctest)" suffix to distinguish
a durably blocking state from a non-durable one. For example,
"chan send" vs. "chan send (synctest)". Change this suffix
to "(durable)".
Always print a "(durable)" suffix for the state of durably
blocked bubbled goroutines. For example, print "sleep (durable)".
Drop the "[not] durably blocked" text from goroutine states,
since this is now entirely redundant with the waitReason.
Old:
goroutine 8 [chan receive (synctest), synctest bubble 7, durably blocked]:
goroutine 9 [select (no cases), synctest bubble 7, durably blocked]:
New:
goroutine 8 [chan receive (durable), synctest bubble 1]:
goroutine 9 [select (no cases) (durable), synctest bubble 1]:
Change-Id: I89112efb25150a98a2954f54d1910ccec52a5824
Reviewed-on: https://go-review.googlesource.com/c/go/+/679376
Auto-Submit: Damien Neil <dneil@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
2025-06-05 14:21:47 -07:00
|
|
|
waitReasonSynctestChanReceive // "chan receive (durable)"
|
|
|
|
|
waitReasonSynctestChanSend // "chan send (durable)"
|
|
|
|
|
waitReasonSynctestSelect // "select (durable)"
|
|
|
|
|
waitReasonSynctestWaitGroupWait // "sync.WaitGroup.Wait (durable)"
|
2025-02-19 16:33:21 +00:00
|
|
|
waitReasonCleanupWait // "cleanup wait"
|
2018-03-06 21:28:24 -08:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
var waitReasonStrings = [...]string{
|
|
|
|
|
waitReasonZero: "",
|
|
|
|
|
waitReasonGCAssistMarking: "GC assist marking",
|
|
|
|
|
waitReasonIOWait: "IO wait",
|
|
|
|
|
waitReasonChanReceiveNilChan: "chan receive (nil chan)",
|
|
|
|
|
waitReasonChanSendNilChan: "chan send (nil chan)",
|
|
|
|
|
waitReasonDumpingHeap: "dumping heap",
|
|
|
|
|
waitReasonGarbageCollection: "garbage collection",
|
|
|
|
|
waitReasonGarbageCollectionScan: "garbage collection scan",
|
|
|
|
|
waitReasonPanicWait: "panicwait",
|
|
|
|
|
waitReasonSelect: "select",
|
|
|
|
|
waitReasonSelectNoCases: "select (no cases)",
|
|
|
|
|
waitReasonGCAssistWait: "GC assist wait",
|
|
|
|
|
waitReasonGCSweepWait: "GC sweep wait",
|
2018-10-17 23:29:42 +00:00
|
|
|
waitReasonGCScavengeWait: "GC scavenge wait",
|
2018-03-06 21:28:24 -08:00
|
|
|
waitReasonChanReceive: "chan receive",
|
|
|
|
|
waitReasonChanSend: "chan send",
|
|
|
|
|
waitReasonFinalizerWait: "finalizer wait",
|
2020-05-05 01:43:57 +00:00
|
|
|
waitReasonForceGCIdle: "force gc (idle)",
|
2025-05-05 13:44:26 -04:00
|
|
|
waitReasonUpdateGOMAXPROCSIdle: "GOMAXPROCS updater (idle)",
|
2018-03-06 21:28:24 -08:00
|
|
|
waitReasonSemacquire: "semacquire",
|
|
|
|
|
waitReasonSleep: "sleep",
|
|
|
|
|
waitReasonSyncCondWait: "sync.Cond.Wait",
|
2022-08-30 22:18:01 +00:00
|
|
|
waitReasonSyncMutexLock: "sync.Mutex.Lock",
|
|
|
|
|
waitReasonSyncRWMutexRLock: "sync.RWMutex.RLock",
|
|
|
|
|
waitReasonSyncRWMutexLock: "sync.RWMutex.Lock",
|
2024-06-11 11:02:18 -07:00
|
|
|
waitReasonSyncWaitGroupWait: "sync.WaitGroup.Wait",
|
2018-03-06 21:28:24 -08:00
|
|
|
waitReasonTraceReaderBlocked: "trace reader (blocked)",
|
|
|
|
|
waitReasonWaitForGCCycle: "wait for GC cycle",
|
|
|
|
|
waitReasonGCWorkerIdle: "GC worker (idle)",
|
2022-08-31 18:21:48 +00:00
|
|
|
waitReasonGCWorkerActive: "GC worker (active)",
|
2019-09-27 12:27:51 -04:00
|
|
|
waitReasonPreempted: "preempted",
|
2020-04-15 15:38:00 -04:00
|
|
|
waitReasonDebugCall: "debug call",
|
2022-08-31 18:21:48 +00:00
|
|
|
waitReasonGCMarkTermination: "GC mark termination",
|
|
|
|
|
waitReasonStoppingTheWorld: "stopping the world",
|
2023-10-06 15:07:28 +00:00
|
|
|
waitReasonFlushProcCaches: "flushing proc caches",
|
2023-05-08 22:29:52 +00:00
|
|
|
waitReasonTraceGoroutineStatus: "trace goroutine status",
|
|
|
|
|
waitReasonTraceProcStatus: "trace proc status",
|
2023-12-19 09:35:32 -08:00
|
|
|
waitReasonPageTraceFlush: "page trace flush",
|
2023-11-20 11:22:48 +11:00
|
|
|
waitReasonCoroutine: "coroutine",
|
2024-11-01 21:54:07 +00:00
|
|
|
waitReasonGCWeakToStrongWait: "GC weak to strong wait",
|
2024-06-11 11:02:18 -07:00
|
|
|
waitReasonSynctestRun: "synctest.Run",
|
|
|
|
|
waitReasonSynctestWait: "synctest.Wait",
|
runtime: clarify stack traces for bubbled goroutines
Use the synctest bubble ID to identify bubbles in traces,
rather than the goroutine ID of the bubble's root goroutine.
Some waitReasons include a "(synctest)" suffix to distinguish
a durably blocking state from a non-durable one. For example,
"chan send" vs. "chan send (synctest)". Change this suffix
to "(durable)".
Always print a "(durable)" suffix for the state of durably
blocked bubbled goroutines. For example, print "sleep (durable)".
Drop the "[not] durably blocked" text from goroutine states,
since this is now entirely redundant with the waitReason.
Old:
goroutine 8 [chan receive (synctest), synctest bubble 7, durably blocked]:
goroutine 9 [select (no cases), synctest bubble 7, durably blocked]:
New:
goroutine 8 [chan receive (durable), synctest bubble 1]:
goroutine 9 [select (no cases) (durable), synctest bubble 1]:
Change-Id: I89112efb25150a98a2954f54d1910ccec52a5824
Reviewed-on: https://go-review.googlesource.com/c/go/+/679376
Auto-Submit: Damien Neil <dneil@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
2025-06-05 14:21:47 -07:00
|
|
|
waitReasonSynctestChanReceive: "chan receive (durable)",
|
|
|
|
|
waitReasonSynctestChanSend: "chan send (durable)",
|
|
|
|
|
waitReasonSynctestSelect: "select (durable)",
|
|
|
|
|
waitReasonSynctestWaitGroupWait: "sync.WaitGroup.Wait (durable)",
|
2025-02-19 16:33:21 +00:00
|
|
|
waitReasonCleanupWait: "cleanup wait",
|
2018-03-06 21:28:24 -08:00
|
|
|
}
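The weak->strong conversion commit above amounts to a simple gate: a flag set before the ragged barrier, checked at every conversion, and cleared (with a broadcast wakeup) during the mark termination STW, with a recheck on wake. A minimal sketch of that pattern with ordinary sync primitives follows; the names (weakToStrongGate, beginMarkTermination, and so on) are invented for the example and stand in for the runtime's internal flag and goroutine-parking machinery.
package main

import "sync"

// weakToStrongGate is an illustrative stand-in for the global flag that
// blocks weak->strong conversions while mark termination runs.
type weakToStrongGate struct {
	mu       sync.Mutex
	cond     *sync.Cond
	blocking bool // set before the ragged barrier, cleared during the STW
}

func newGate() *weakToStrongGate {
	g := &weakToStrongGate{}
	g.cond = sync.NewCond(&g.mu)
	return g
}

// beginMarkTermination sets the flag; conversions that check it afterwards block.
func (g *weakToStrongGate) beginMarkTermination() {
	g.mu.Lock()
	g.blocking = true
	g.mu.Unlock()
}

// endMarkTermination clears the flag and wakes every blocked converter,
// mirroring the wakeup performed during the mark termination STW.
func (g *weakToStrongGate) endMarkTermination() {
	g.mu.Lock()
	g.blocking = false
	g.cond.Broadcast()
	g.mu.Unlock()
}

// convert waits out mark termination, rechecking the flag on wake in case
// another mark termination has already started, then publishes the pointer.
func (g *weakToStrongGate) convert(publishStrong func()) {
	g.mu.Lock()
	for g.blocking {
		g.cond.Wait()
	}
	publishStrong()
	g.mu.Unlock()
}

func main() {
	g := newGate()
	g.beginMarkTermination()
	done := make(chan struct{})
	go func() { g.convert(func() {}); close(done) }()
	g.endMarkTermination()
	<-done
}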
|
|
|
|
|
|
|
|
|
|
func (w waitReason) String() string {
|
|
|
|
|
if w < 0 || w >= waitReason(len(waitReasonStrings)) {
|
|
|
|
|
return "unknown wait reason"
|
|
|
|
|
}
|
|
|
|
|
return waitReasonStrings[w]
|
|
|
|
|
}
|
|
|
|
|
|
2025-10-02 11:57:58 +00:00
|
|
|
// isMutexWait returns true if the goroutine is blocked because of
|
|
|
|
|
// sync.Mutex.Lock or sync.RWMutex.[R]Lock.
|
|
|
|
|
//
|
|
|
|
|
//go:nosplit
|
2022-08-31 21:34:23 +00:00
|
|
|
func (w waitReason) isMutexWait() bool {
|
|
|
|
|
return w == waitReasonSyncMutexLock ||
|
|
|
|
|
w == waitReasonSyncRWMutexRLock ||
|
|
|
|
|
w == waitReasonSyncRWMutexLock
|
|
|
|
|
}
|
|
|
|
|
|
2025-10-02 11:57:58 +00:00
|
|
|
// isSyncWait returns true if the goroutine is blocked because of
|
|
|
|
|
// sync library primitive operations.
|
|
|
|
|
//
|
|
|
|
|
//go:nosplit
|
|
|
|
|
func (w waitReason) isSyncWait() bool {
|
|
|
|
|
return waitReasonSyncCondWait <= w && w <= waitReasonSyncWaitGroupWait
|
|
|
|
|
}
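isSyncWait works only because the sync-related waitReason constants are declared contiguously, so two comparisons cover the whole group. A toy version of the idiom, with invented constant names, is:
type reason int

const (
	reasonCondWait reason = iota // first of the sync group
	reasonMutexLock
	reasonWaitGroupWait // last of the sync group
	reasonSleep         // not part of the group
)

// inSyncGroup is correct only while the group stays contiguous.
func inSyncGroup(r reason) bool {
	return reasonCondWait <= r && r <= reasonWaitGroupWait
}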
|
|
|
|
|
|
|
|
|
|
// isChanWait is true if the goroutine is blocked because of non-nil
|
|
|
|
|
// channel operations or a select statement with at least one case.
|
|
|
|
|
//
|
|
|
|
|
//go:nosplit
|
|
|
|
|
func (w waitReason) isChanWait() bool {
|
|
|
|
|
return w == waitReasonSelect ||
|
|
|
|
|
w == waitReasonChanReceive ||
|
|
|
|
|
w == waitReasonChanSend
|
|
|
|
|
}
|
|
|
|
|
|
runtime: prevent mutual deadlock between GC stopTheWorld and suspendG
Almost everywhere we stop the world we casGToWaitingForGC to prevent
mutual deadlock with the GC trying to scan our stack. This historically
was only necessary if we weren't stopping the world to change the GC
phase, because what we were worried about was mutual deadlock with mark
workers' use of suspendG. And, they were the only users of suspendG.
In Go 1.22 this changed. The execution tracer began using suspendG, too.
This leads to the possibility of mutual deadlock between the execution
tracer and a goroutine trying to start or end the GC mark phase. The fix
is simple: make the stop-the-world calls for the GC also call
casGToWaitingForGC. This way, suspendG is guaranteed to make progress in
this circumstance, and once it completes, the stop-the-world can
complete as well.
We can take this a step further, though, and move casGToWaitingForGC
into stopTheWorldWithSema, since there's no longer really a place we can
afford to skip this detail.
While we're here, rename casGToWaitingForGC to casGToWaitingForSuspendG,
since the GC is now not the only potential source of mutual deadlock.
Fixes #72740.
Change-Id: I5e3739a463ef3e8173ad33c531e696e46260692f
Reviewed-on: https://go-review.googlesource.com/c/go/+/681501
Reviewed-by: Carlos Amedee <carlos@golang.org>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
2025-06-14 02:45:08 +00:00
|
|
|
func (w waitReason) isWaitingForSuspendG() bool {
|
|
|
|
|
return isWaitingForSuspendG[w]
|
runtime: take a stack trace during tracing only when we own the stack
Currently, the execution tracer may attempt to take a stack trace of a
goroutine whose stack it does not own. For example, if the goroutine is
in _Grunnable or _Gwaiting. This is easily fixed in all cases by simply
moving the emission of GoStop and GoBlock events to before the
casgstatus happens. The goroutine status is what is used to signal stack
ownership, and the GC may shrink a goroutine's stack if it can acquire
the scan bit.
Although this is easily fixed, the interaction here is very subtle,
because stack ownership is only implicit in the goroutine's scan status.
To make this invariant more maintainable and less error-prone in the
future, this change adds a GODEBUG setting that checks, at the point of
taking a stack trace, whether the caller owns the goroutine. This check
is not quite perfect because there's no way for the stack tracing code
to know that the _Gscan bit was acquired by the caller, so for
simplicity it assumes that it was the caller that acquired the scan bit.
In all other cases however, we can check for ownership precisely. At the
very least, this check is sufficient to catch the issue this change is
fixing.
To make sure this debug check doesn't bitrot, it's always enabled during
trace testing. This new mode has actually caught a few other issues
already, so this change fixes them.
One issue that this debug mode caught was that it's not safe to take a
stack trace of a _Gwaiting goroutine that's being unparked.
Another much bigger issue this debug mode caught was the fact that the
execution tracer could try to take a stack trace of a G that was in
_Gwaiting solely to avoid a deadlock in the GC. The execution tracer
already has a partial list of these cases since they're modeled as the
goroutine just executing as normal in the tracer, but this change takes
the list and makes it more formal. In this specific case, we now prevent
the GC from shrinking the stacks of goroutines in this state if tracing
is enabled. The stack traces from these scenarios are too useful to
discard, but there is indeed a race here between the tracer and any
attempt to shrink the stack by the GC.
Change-Id: I019850dabc8cede202fd6dcc0a4b1f16764209fb
Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64-longtest,gotip-linux-amd64-longtest-race
Reviewed-on: https://go-review.googlesource.com/c/go/+/573155
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
2024-03-21 18:49:05 +00:00
|
|
|
}
|
|
|
|
|
|
2025-06-14 02:45:08 +00:00
|
|
|
// isWaitingForSuspendG indicates that a goroutine is only entering _Gwaiting and
|
|
|
|
|
// setting a waitReason because it needs to be able to let the suspendG
|
|
|
|
|
// (used by the GC and the execution tracer) take ownership of its stack.
|
|
|
|
|
// The G is always actually executing on the system stack in these cases.
|
2024-03-21 18:49:05 +00:00
|
|
|
//
|
|
|
|
|
// TODO(mknyszek): Consider replacing this with a new dedicated G status.
|
2025-06-14 02:45:08 +00:00
|
|
|
var isWaitingForSuspendG = [len(waitReasonStrings)]bool{
|
2024-03-21 18:49:05 +00:00
|
|
|
waitReasonStoppingTheWorld: true,
|
|
|
|
|
waitReasonGCMarkTermination: true,
|
|
|
|
|
waitReasonGarbageCollection: true,
|
|
|
|
|
waitReasonGarbageCollectionScan: true,
|
|
|
|
|
waitReasonTraceGoroutineStatus: true,
|
|
|
|
|
waitReasonTraceProcStatus: true,
|
|
|
|
|
waitReasonPageTraceFlush: true,
|
|
|
|
|
waitReasonGCAssistMarking: true,
|
|
|
|
|
waitReasonGCWorkerActive: true,
|
|
|
|
|
waitReasonFlushProcCaches: true,
|
|
|
|
|
}
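isWaitingForSuspendG above is the usual enum-indexed bool table: the array is sized from waitReasonStrings so every reason has a slot, unlisted reasons default to false, and the lookup is a single index. A toy sketch of the pattern with invented names:
type state int

const (
	stateRunning state = iota
	stateScanWait
	stateSleeping
)

var stateNames = [...]string{"running", "scan wait", "sleeping"}

// Sized from the string table so the two stay in sync; unlisted states
// default to false.
var yieldsStackToSuspend = [len(stateNames)]bool{
	stateScanWait: true,
}

func yieldsStack(s state) bool { return yieldsStackToSuspend[s] }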
|
|
|
|
|
|
2024-06-11 11:02:18 -07:00
|
|
|
func (w waitReason) isIdleInSynctest() bool {
|
|
|
|
|
return isIdleInSynctest[w]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// isIdleInSynctest indicates that a goroutine is considered idle by synctest.Wait.
|
|
|
|
|
var isIdleInSynctest = [len(waitReasonStrings)]bool{
|
2025-05-20 15:56:43 -07:00
|
|
|
waitReasonChanReceiveNilChan: true,
|
|
|
|
|
waitReasonChanSendNilChan: true,
|
|
|
|
|
waitReasonSelectNoCases: true,
|
|
|
|
|
waitReasonSleep: true,
|
|
|
|
|
waitReasonSyncCondWait: true,
|
|
|
|
|
waitReasonSynctestWaitGroupWait: true,
|
|
|
|
|
waitReasonCoroutine: true,
|
|
|
|
|
waitReasonSynctestRun: true,
|
|
|
|
|
waitReasonSynctestWait: true,
|
|
|
|
|
waitReasonSynctestChanReceive: true,
|
|
|
|
|
waitReasonSynctestChanSend: true,
|
|
|
|
|
waitReasonSynctestSelect: true,
|
2024-06-11 11:02:18 -07:00
|
|
|
}
|
|
|
|
|
|
2014-11-11 17:05:19 -05:00
|
|
|
var (
|
2025-07-24 21:38:37 +00:00
|
|
|
// Linked-list of all Ms. Written under sched.lock, read atomically.
|
|
|
|
|
allm *m
|
|
|
|
|
|
2025-06-05 14:21:47 -07:00
|
|
|
gomaxprocs int32
|
|
|
|
|
numCPUStartup int32
|
|
|
|
|
forcegc forcegcstate
|
|
|
|
|
sched schedt
|
|
|
|
|
newprocs int32
|
runtime: avoid pp.timers.lock in updateTimerPMask
The comment in updateTimerPMask is wrong. It says:
// Looks like there are no timers, however another P
// may be adding one at this very moment.
// Take the lock to synchronize.
This was my incorrect simplification of the original comment
from CL 264477 when I was renaming all the things it mentioned:
// Looks like there are no timers, however another P may transiently
// decrement numTimers when handling a timerModified timer in
// checkTimers. We must take timersLock to serialize with these changes.
updateTimerPMask is being called by pidleput, so the P in question
is not in use. And other P's cannot add to this P.
As the original comment more precisely noted, the problem was
that other P's might be calling timers.check, which updates ts.len
occasionally while ts is locked, and one of those updates might
"leak" an ephemeral len==0 even when the heap is not going to
be empty when the P is finally unlocked. The lock/unlock in
updateTimerPMask synchronizes to avoid that. But this defeats
most of the purpose of using ts.len in the first place.
Instead of requiring that synchronization, we can arrange that
ts.len only ever shows a "publishable" length, meaning the len(ts.heap)
we leave behind during ts.unlock.
Having done that, updateTimerPMask can be inlined into pidleput.
The big comment on updateTimerPMask explaining how timerpMask
works is better placed as the doc comment for timerpMask itself,
so move it there.
Change-Id: I5442c9bb7f1473b5fd37c43165429d087012e73f
Reviewed-on: https://go-review.googlesource.com/c/go/+/568336
Reviewed-by: Michael Pratt <mpratt@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Russ Cox <rsc@golang.org>
2024-02-29 16:52:58 -05:00
|
|
|
)
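The "publishable length" idea from the updateTimerPMask commit above can be sketched with plain sync and sync/atomic: the length readable without the lock is only ever updated to the value left behind at unlock, so lock-free readers never observe a transient zero. The names below are invented for illustration; the runtime's timers type is more involved.
import (
	"sync"
	"sync/atomic"
)

type timerHeap struct {
	mu   sync.Mutex
	heap []int        // stand-in for the per-P timer heap
	n    atomic.Int64 // "publishable" length, readable without mu
}

func (t *timerHeap) lock() { t.mu.Lock() }

// unlock publishes only the final length; intermediate lengths seen while
// the lock was held are never exposed to lock-free readers.
func (t *timerHeap) unlock() {
	t.n.Store(int64(len(t.heap)))
	t.mu.Unlock()
}

// lenPublished is what a racing reader (e.g. pidleput) would consult.
func (t *timerHeap) lenPublished() int64 { return t.n.Load() }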
|
2015-02-17 14:25:49 +03:00
|
|
|
|
2024-02-29 16:52:58 -05:00
|
|
|
var (
|
2020-10-05 18:12:35 -04:00
|
|
|
// allpLock protects P-less reads and size changes of allp, idlepMask,
|
|
|
|
|
// and timerpMask, and all writes to allp.
|
runtime: don't attempt to steal from idle Ps
Work stealing is a scalability bottleneck in the scheduler. Since each P
has a work queue, work stealing must look at every P to determine if
there is any work. The number of Ps scales linearly with GOMAXPROCS
(i.e., the number of Ps _is_ GOMAXPROCS), thus this work scales linearly
with GOMAXPROCS.
Work stealing is a later attempt by a P to find work before it goes
idle. Since the P has no work of its own, extra costs here tend not to
directly affect application-level benchmarks. Where they show up is
extra CPU usage by the process as a whole. These costs get particularly
expensive for applications that transition between blocked and running
frequently.
Long term, we need a more scalable approach in general, but for now we
can make a simple observation: idle Ps ([1]) cannot possibly have
anything in their runq, so we need not bother checking at all.
We track idle Ps via a new global bitmap, updated in pidleput/pidleget.
This is already a slow path (requires sched.lock), so we don't expect
high contention there.
Using a single bitmap avoids the need to touch every P to read p.status.
Currently, the bitmap approach is not significantly better than reading
p.status. However, in a future CL I'd like to apply a similar
optimization to timers. Once done, findrunnable would not touch most Ps
at all (in mostly idle programs), which will avoid memory latency to
pull those Ps into cache.
When reading this bitmap, we are racing with Ps going in and out of
idle, so there are a few cases to consider:
1. _Prunning -> _Pidle: Running P goes idle after we check the bitmap.
In this case, we will try to steal (and find nothing) so there is no
harm.
2. _Pidle -> _Prunning while spinning: A P that starts running may queue
new work that we miss. This is OK: (a) that P cannot go back to sleep
without completing its work, and (b) more fundamentally, we will recheck
after we drop our P.
3. _Pidle -> _Prunning after spinning: After spinning, we really can
miss work from a newly woken P. (a) above still applies here as well,
but this is also the same delicate dance case described in findrunnable:
if nothing is spinning anymore, the other P will unpark a thread to run
the work it submits.
Benchmark results from WakeupParallel/syscall/pair/race/1ms (see
golang.org/cl/228577):
name old msec new msec delta
Perf-task-clock-8 250 ± 1% 247 ± 4% ~ (p=0.690 n=5+5)
Perf-task-clock-16 258 ± 2% 259 ± 2% ~ (p=0.841 n=5+5)
Perf-task-clock-32 284 ± 2% 270 ± 4% -4.94% (p=0.032 n=5+5)
Perf-task-clock-64 326 ± 3% 303 ± 2% -6.92% (p=0.008 n=5+5)
Perf-task-clock-128 407 ± 2% 363 ± 5% -10.69% (p=0.008 n=5+5)
Perf-task-clock-256 561 ± 1% 481 ± 1% -14.20% (p=0.016 n=4+5)
Perf-task-clock-512 840 ± 5% 683 ± 2% -18.70% (p=0.008 n=5+5)
Perf-task-clock-1024 1.38k ±14% 1.07k ± 2% -21.85% (p=0.008 n=5+5)
[1] "Idle Ps" here refers to _Pidle Ps in the sched.pidle list. In other
contexts, Ps may temporarily transition through _Pidle (e.g., in
handoffp); those Ps may have work.
Updates #28808
Updates #18237
Change-Id: Ieeb958bd72e7d8fb375b0b1f414e8d7378b14e29
Reviewed-on: https://go-review.googlesource.com/c/go/+/259578
Run-TryBot: Michael Pratt <mpratt@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Austin Clements <austin@google.com>
Trust: Michael Pratt <mpratt@google.com>
2020-10-01 15:21:37 -04:00
|
|
|
allpLock mutex
|
2024-02-29 16:52:58 -05:00
|
|
|
|
2020-10-01 15:21:37 -04:00
|
|
|
// len(allp) == gomaxprocs; may change at safe points, otherwise
|
|
|
|
|
// immutable.
|
|
|
|
|
allp []*p
|
2024-02-29 16:52:58 -05:00
|
|
|
|
2020-10-01 15:21:37 -04:00
|
|
|
// Bitmask of Ps in _Pidle list, one bit per P. Reads and writes must
|
|
|
|
|
// be atomic. Length may change at safe points.
|
2020-10-05 18:12:35 -04:00
|
|
|
//
|
|
|
|
|
// Each P must update only its own bit. In order to maintain
|
2025-10-28 10:59:33 +00:00
|
|
|
// consistency, a P going idle must set the idle mask simultaneously with
|
2020-10-05 18:12:35 -04:00
|
|
|
// updates to the idle P list under the sched.lock, otherwise a racing
|
|
|
|
|
// pidleget may clear the mask before pidleput sets the mask,
|
|
|
|
|
// corrupting the bitmap.
|
|
|
|
|
//
|
|
|
|
|
// N.B., procresize takes ownership of all Ps in stopTheWorldWithSema.
|
|
|
|
|
idlepMask pMask
|
2024-02-29 16:52:58 -05:00
|
|
|
|
2020-10-05 18:12:35 -04:00
|
|
|
// Bitmask of Ps that may have a timer, one bit per P. Reads and writes
|
|
|
|
|
// must be atomic. Length may change at safe points.
|
2024-02-29 16:52:58 -05:00
|
|
|
//
|
|
|
|
|
// Ideally, the timer mask would be kept immediately consistent on any timer
|
|
|
|
|
// operations. Unfortunately, updating a shared global data structure in the
|
|
|
|
|
// timer hot path adds too much overhead in applications frequently switching
|
|
|
|
|
// between no timers and some timers.
|
|
|
|
|
//
|
|
|
|
|
// As a compromise, the timer mask is updated only on pidleget / pidleput. A
|
|
|
|
|
// running P (returned by pidleget) may add a timer at any time, so its mask
|
|
|
|
|
// must be set. An idle P (passed to pidleput) cannot add new timers while
|
|
|
|
|
// idle, so if it has no timers at that time, its mask may be cleared.
|
|
|
|
|
//
|
2025-11-17 13:34:51 -05:00
|
|
|
// Thus, we get the following effects on timer-stealing in findRunnable:
|
2024-02-29 16:52:58 -05:00
|
|
|
//
|
2025-11-17 13:34:51 -05:00
|
|
|
// - Idle Ps with no timers when they go idle are never checked in findRunnable
|
2024-02-29 16:52:58 -05:00
|
|
|
// (for work- or timer-stealing; this is the ideal case).
|
|
|
|
|
// - Running Ps must always be checked.
|
|
|
|
|
// - Idle Ps whose timers are stolen must continue to be checked until they run
|
|
|
|
|
// again, even after timer expiration.
|
|
|
|
|
//
|
|
|
|
|
// When the P starts running again, the mask should be set, as a timer may be
|
|
|
|
|
// added at any time.
|
|
|
|
|
//
|
|
|
|
|
// TODO(prattmic): Additional targeted updates may improve the above cases.
|
|
|
|
|
// e.g., updating the mask when stealing a timer.
|
2020-10-05 18:12:35 -04:00
|
|
|
timerpMask pMask
|
2024-02-29 16:52:58 -05:00
|
|
|
)
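idlepMask and timerpMask share one shape: one bit per P, packed into 32-bit words, read and updated atomically. The sketch below mirrors that shape using sync/atomic with CAS loops; it is illustrative only (the runtime's pMask uses internal atomic Or/And helpers), and the method names are invented.
import "sync/atomic"

// pBitMask holds one bit per P, packed into uint32 words.
type pBitMask []atomic.Uint32

func (m pBitMask) read(id int) bool {
	word, bit := id/32, uint32(1)<<(id%32)
	return m[word].Load()&bit != 0
}

// set marks P id, e.g. when it goes idle or may have timers.
func (m pBitMask) set(id int) {
	word, bit := id/32, uint32(1)<<(id%32)
	for {
		old := m[word].Load()
		if m[word].CompareAndSwap(old, old|bit) {
			return
		}
	}
}

// clear unmarks P id, e.g. in pidleget or when an idle P has no timers.
func (m pBitMask) clear(id int) {
	word, bit := id/32, uint32(1)<<(id%32)
	for {
		old := m[word].Load()
		if m[word].CompareAndSwap(old, old&^bit) {
			return
		}
	}
}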
|
2020-10-01 15:21:37 -04:00
|
|
|
|
2024-05-17 15:32:13 -04:00
|
|
|
// goarmsoftfp is used by runtime/cgo assembly.
|
|
|
|
|
//
|
|
|
|
|
//go:linkname goarmsoftfp
|
|
|
|
|
|
2024-02-29 16:52:58 -05:00
|
|
|
var (
|
runtime: manage gcBgMarkWorkers with a global pool
Background mark workers perform per-P marking work. Currently each
worker is assigned a P at creation time. The worker "attaches" to the P
via p.gcBgMarkWorker, making itself (usually) available to
findRunnableGCWorker for scheduling GC work.
While running gcMarkDone, the worker "detaches" from the P (by clearing
p.gcBgMarkWorker), since it may park for other reasons and should not be
scheduled by findRunnableGCWorker.
Unfortunately, this design is complex and difficult to reason about. We
simplify things by changing the design to eliminate the hard P
attachment. Rather than workers always performing work from the same P,
workers perform work for whichever P they find themselves on. On park,
the workers are placed in a pool of free workers, which each P's
findRunnableGCWorker can use to run a worker for its P.
Now if a worker parks in gcMarkDone, a P may simply use another worker
from the pool to complete its own work.
The P's GC worker mode is used to communicate the mode to run to the
selected worker. It is also used to emit the appropriate worker
EvGoStart tracepoint. This is a slight change, as this G may be
preempted (e.g., in gcMarkDone). When it is rescheduled, the trace
viewer will show it as a normal goroutine again. It is currently a bit
difficult to connect to the original worker tracepoint, as the viewer
does not display the goid for the original worker (though the data is in
the trace file).
Change-Id: Id7bd3a364dc18a4d2b1c99c4dc4810fae1293c1b
Reviewed-on: https://go-review.googlesource.com/c/go/+/262348
Run-TryBot: Michael Pratt <mpratt@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Trust: Michael Pratt <mpratt@google.com>
2020-10-13 12:39:13 -04:00
|
|
|
// Pool of GC parked background workers. Entries are type
|
|
|
|
|
// *gcBgMarkWorkerNode.
|
|
|
|
|
gcBgMarkWorkerPool lfstack
|
|
|
|
|
|
|
|
|
|
// Total number of gcBgMarkWorker goroutines. Protected by worldsema.
|
|
|
|
|
gcBgMarkWorkerCount int32
|
|
|
|
|
|
2015-02-17 14:25:49 +03:00
|
|
|
// Information about what cpu features are available.
|
2017-04-27 08:30:27 +02:00
|
|
|
// Packages outside the runtime should not use these
|
|
|
|
|
// as they are not an external api.
|
2019-10-10 16:16:54 +00:00
|
|
|
// Set on startup in asm_{386,amd64}.s
|
2017-04-27 08:30:27 +02:00
|
|
|
processorVersionInfo uint32
|
|
|
|
|
isIntel bool
|
2024-05-22 23:06:30 -04:00
|
|
|
)
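The gcBgMarkWorkers commit above reduces to a free list: a worker that parks pushes itself onto a global pool, and any P wanting mark work pops one off and runs it on its own behalf. A loose sketch using a channel as a stand-in for the lock-free stack; the names here are invented, and the runtime actually uses lfstack with *gcBgMarkWorkerNode entries.
// worker is a stand-in for a parked background mark worker.
type worker struct{ id int }

// workerPool plays the role of gcBgMarkWorkerPool: parked workers place
// themselves here; any P may take one to run mark work for that P.
type workerPool struct{ free chan *worker }

func newWorkerPool(n int) *workerPool {
	p := &workerPool{free: make(chan *worker, n)}
	for i := 0; i < n; i++ {
		p.free <- &worker{id: i}
	}
	return p
}

// get is what a P's scheduler would call to find a worker for its own work.
func (p *workerPool) get() (*worker, bool) {
	select {
	case w := <-p.free:
		return w, true
	default:
		return nil, false // no parked worker available
	}
}

// put is what a worker does when it parks.
func (p *workerPool) put(w *worker) { p.free <- w }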
|
2017-11-03 02:05:28 +00:00
|
|
|
|
2024-05-22 23:06:30 -04:00
|
|
|
// set by cmd/link on arm systems
|
|
|
|
|
// accessed using linkname by internal/runtime/atomic.
|
|
|
|
|
//
|
|
|
|
|
// goarm should be an internal detail,
|
|
|
|
|
// but widely used packages access it using linkname.
|
|
|
|
|
// Notable members of the hall of shame include:
|
|
|
|
|
// - github.com/creativeprojects/go-selfupdate
|
|
|
|
|
//
|
|
|
|
|
// Do not remove or change the type signature.
|
|
|
|
|
// See go.dev/issue/67401.
|
|
|
|
|
//
|
|
|
|
|
//go:linkname goarm
|
|
|
|
|
var (
|
2023-07-29 18:25:42 -07:00
|
|
|
goarm uint8
|
|
|
|
|
goarmsoftfp uint8
|
2015-04-09 15:09:52 -04:00
|
|
|
)
|
2015-03-25 17:50:35 -07:00
|
|
|
|
2015-04-09 15:09:52 -04:00
|
|
|
// Set by the linker so the runtime can determine the buildmode.
|
|
|
|
|
var (
|
|
|
|
|
islibrary bool // -buildmode=c-shared
|
|
|
|
|
isarchive bool // -buildmode=c-archive
|
2014-11-11 17:05:19 -05:00
|
|
|
)
|
2020-08-21 11:09:45 -07:00
|
|
|
|
2022-03-23 13:47:08 +02:00
|
|
|
// Must agree with internal/buildcfg.FramePointerEnabled.
|
runtime: enable framepointer on all arm64
Frame pointers were already enabled on linux, darwin, ios,
but not freebsd, android, openbsd, netbsd.
But the space was reserved on all platforms, leading to
two different arm64 framepointer conditions in different
parts of the code, one of which had no name
(framepointer_enabled || GOARCH == "arm64",
which might have been "framepointer_space_reserved").
So on the disabled systems, the stack layouts were still
set up for frame pointers and the only difference was not
actually maintaining the FP register in the generated code.
Reduce complexity by just enabling the frame pointer
completely on all the arm64 systems.
This commit passes on freebsd, android, netbsd.
I have not been able to try it on openbsd.
This CL is part of a stack adding windows/arm64
support (#36439), intended to land in the Go 1.17 cycle.
This CL is, however, not windows/arm64-specific.
It is cleanup meant to make the port (and future ports) easier.
Change-Id: I83bd23369d24b76db4c6a648fa74f6917819a093
Reviewed-on: https://go-review.googlesource.com/c/go/+/288814
Trust: Russ Cox <rsc@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
2021-01-27 11:34:42 -05:00
|
|
|
const framepointer_enabled = GOARCH == "amd64" || GOARCH == "arm64"
|
2024-04-18 20:54:55 +00:00
|
|
|
|
|
|
|
|
// getcallerfp returns the frame pointer of the caller of the caller
|
|
|
|
|
// of this function.
|
|
|
|
|
//
|
|
|
|
|
//go:nosplit
|
|
|
|
|
//go:noinline
|
|
|
|
|
func getcallerfp() uintptr {
|
|
|
|
|
fp := getfp() // This frame's FP.
|
|
|
|
|
if fp != 0 {
|
|
|
|
|
fp = *(*uintptr)(unsafe.Pointer(fp)) // The caller's FP.
|
|
|
|
|
fp = *(*uintptr)(unsafe.Pointer(fp)) // The caller's caller's FP.
|
|
|
|
|
}
|
|
|
|
|
return fp
|
|
|
|
|
}
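getcallerfp unwinds exactly two frames by loading the saved frame pointer twice. Generalizing the same walk to n frames could look like the sketch below; it assumes the word addressed by a frame pointer holds the caller's frame pointer (as on amd64/arm64 with frame pointers enabled) and that a zero FP ends the chain. This helper is illustrative and not part of the runtime.
//go:nosplit
//go:noinline
func callerFPN(n int) uintptr {
	fp := getfp() // this frame's FP; 0 where frame pointers are unavailable
	for i := 0; i < n && fp != 0; i++ {
		fp = *(*uintptr)(unsafe.Pointer(fp)) // follow the saved FP link
	}
	return fp
}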
|