go/src/runtime/runtime2.go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import (
	"internal/goarch"
	"runtime/internal/atomic"
	"unsafe"
)

// defined constants
const (
// G status
//
// Beyond indicating the general state of a G, the G status
// acts like a lock on the goroutine's stack (and hence its
// ability to execute user code).
//
// If you add to this list, add to the list
// of "okay during garbage collection" status
// in mgcmark.go too.
//
// TODO(austin): The _Gscan bit could be much lighter-weight.
// For example, we could choose not to run _Gscanrunnable
// goroutines found in the run queue, rather than CAS-looping
// until they become _Grunnable. And transitions like
// _Gscanwaiting -> _Gscanrunnable are actually okay because
// they don't affect stack ownership.
// _Gidle means this goroutine was just allocated and has not
// yet been initialized.
_Gidle = iota // 0
// _Grunnable means this goroutine is on a run queue. It is
// not currently executing user code. The stack is not owned.
_Grunnable // 1
// _Grunning means this goroutine may execute user code. The
// stack is owned by this goroutine. It is not on a run queue.
// It is assigned an M and a P (g.m and g.m.p are valid).
_Grunning // 2
// _Gsyscall means this goroutine is executing a system call.
// It is not executing user code. The stack is owned by this
// goroutine. It is not on a run queue. It is assigned an M.
_Gsyscall // 3
// _Gwaiting means this goroutine is blocked in the runtime.
// It is not executing user code. It is not on a run queue,
// but should be recorded somewhere (e.g., a channel wait
// queue) so it can be ready()d when necessary. The stack is
// not owned *except* that a channel operation may read or
// write parts of the stack under the appropriate channel
// lock. Otherwise, it is not safe to access the stack after a
// goroutine enters _Gwaiting (e.g., it may get moved).
_Gwaiting // 4
// _Gmoribund_unused is currently unused, but hardcoded in gdb
// scripts.
_Gmoribund_unused // 5
// _Gdead means this goroutine is currently unused. It may be
// just exited, on a free list, or just being initialized. It
// is not executing user code. It may or may not have a stack
// allocated. The G and its stack (if any) are owned by the M
// that is exiting the G or that obtained the G from the free
// list.
_Gdead // 6
// _Genqueue_unused is currently unused.
_Genqueue_unused // 7
// _Gcopystack means this goroutine's stack is being moved. It
// is not executing user code and is not on a run queue. The
// stack is owned by the goroutine that put it in _Gcopystack.
_Gcopystack // 8
// _Gpreempted means this goroutine stopped itself for a
// suspendG preemption. It is like _Gwaiting, but nothing is
// yet responsible for ready()ing it. Some suspendG must CAS
// the status to _Gwaiting to take responsibility for
// ready()ing this G.
_Gpreempted // 9
// _Gscan combined with one of the above states other than
// _Grunning indicates that GC is scanning the stack. The
// goroutine is not executing user code and the stack is owned
// by the goroutine that set the _Gscan bit.
//
// _Gscanrunning is different: it is used to briefly block
// state transitions while GC signals the G to scan its own
// stack. This is otherwise like _Grunning.
//
// atomicstatus&~Gscan gives the state the goroutine will
// return to when the scan completes.
_Gscan = 0x1000
_Gscanrunnable = _Gscan + _Grunnable // 0x1001
_Gscanrunning = _Gscan + _Grunning // 0x1002
_Gscansyscall = _Gscan + _Gsyscall // 0x1003
_Gscanwaiting = _Gscan + _Gwaiting // 0x1004
_Gscanpreempted = _Gscan + _Gpreempted // 0x1009
)
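
// Illustrative sketch (not part of the original file): how the _Gscan bit
// composes with the base statuses above. Both helpers are hypothetical and
// exist only to make the bit layout concrete; the runtime manipulates these
// values directly in casgstatus and related functions.
func isScanStatus(status uint32) bool {
	// Any _Gscan* value has the 0x1000 bit set.
	return status&_Gscan != 0
}

func scanBaseStatus(status uint32) uint32 {
	// e.g. _Gscanrunnable (0x1001) &^ _Gscan (0x1000) == _Grunnable (0x1).
	return status &^ _Gscan
}
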
const (
// P status
// _Pidle means a P is not being used to run user code or the
// scheduler. Typically, it's on the idle P list and available
// to the scheduler, but it may just be transitioning between
// other states.
//
// The P is owned by the idle list or by whatever is
// transitioning its state. Its run queue is empty.
_Pidle = iota
// _Prunning means a P is owned by an M and is being used to
// run user code or the scheduler. Only the M that owns this P
// is allowed to change the P's status from _Prunning. The M
// may transition the P to _Pidle (if it has no more work to
// do), _Psyscall (when entering a syscall), or _Pgcstop (to
// halt for the GC). The M may also hand ownership of the P
// off directly to another M (e.g., to schedule a locked G).
_Prunning
// _Psyscall means a P is not running user code. It has
// affinity to an M in a syscall but is not owned by it and
// may be stolen by another M. This is similar to _Pidle but
// uses lightweight transitions and maintains M affinity.
//
// Leaving _Psyscall must be done with a CAS, either to steal
// or retake the P. Note that there's an ABA hazard: even if
// an M successfully CASes its original P back to _Prunning
// after a syscall, it must understand the P may have been
// used by another M in the interim.
_Psyscall
// _Pgcstop means a P is halted for STW and owned by the M
// that stopped the world. The M that stopped the world
// continues to use its P, even in _Pgcstop. Transitioning
// from _Prunning to _Pgcstop causes an M to release its P and
// park.
//
// The P retains its run queue and startTheWorld will restart
// the scheduler on Ps with non-empty run queues.
_Pgcstop
// _Pdead means a P is no longer used (GOMAXPROCS shrank). We
// reuse Ps if GOMAXPROCS increases. A dead P is mostly
// stripped of its resources, though a few things remain
// (e.g., trace buffers).
_Pdead
)
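
// Illustrative sketch (not part of the original file): the CAS required to
// leave _Psyscall, as described above. exampleStealSyscallP is hypothetical;
// the real stealing/retaking logic lives in proc.go and does considerably
// more bookkeeping.
func exampleStealSyscallP(pp *p) bool {
	// Exactly one M can win this transition; a loser must assume the P
	// now belongs to someone else (and beware the ABA hazard noted above).
	return atomic.Cas(&pp.status, _Psyscall, _Pidle)
}
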
// Mutual exclusion locks. In the uncontended case,
// as fast as spin locks (just a few user-level instructions),
// but on the contention path they sleep in the kernel.
// A zeroed Mutex is unlocked (no need to initialize each lock).
// Initialization is helpful for static lock ranking, but not required.
type mutex struct {
// Empty struct if lock ranking is disabled, otherwise includes the lock rank
lockRankStruct
// Futex-based impl treats it as uint32 key,
// while sema-based impl as M* waitm.
// Used to be a union, but unions break precise GC.
key uintptr
}
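
// Illustrative sketch (not part of the original file): typical use of a
// runtime mutex with the runtime's own lock/unlock primitives. The guarded
// counter and exampleIncrement are hypothetical.
var (
	exampleLock  mutex
	exampleCount int
)

func exampleIncrement() {
	lock(&exampleLock)
	exampleCount++
	unlock(&exampleLock)
}
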
// sleep and wakeup on one-time events.
// before any calls to notesleep or notewakeup,
// must call noteclear to initialize the Note.
// then, exactly one thread can call notesleep
// and exactly one thread can call notewakeup (once).
// once notewakeup has been called, the notesleep
// will return. future notesleep will return immediately.
// subsequent noteclear must be called only after
// previous notesleep has returned, e.g. it's disallowed
// to call noteclear straight after notewakeup.
//
// notetsleep is like notesleep but wakes up after
// a given number of nanoseconds even if the event
// has not yet happened. if a goroutine uses notetsleep to
// wake up early, it must wait to call noteclear until it
// can be sure that no other goroutine is calling
// notewakeup.
//
// notesleep/notetsleep are generally called on g0,
// notetsleepg is similar to notetsleep but is called on user g.
type note struct {
// Futex-based impl treats it as uint32 key,
// while sema-based impl as M* waitm.
// Used to be a union, but unions break precise GC.
key uintptr
}
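
// Illustrative sketch (not part of the original file): the one-shot protocol
// described above. exampleWaitForDone is hypothetical; notesleep must run on
// g0, and user goroutines would use notetsleepg instead.
func exampleWaitForDone(done *note) {
	noteclear(done) // must precede any notesleep/notewakeup on this note
	// ... publish done to exactly one other thread, which will call
	// notewakeup(done) exactly once ...
	notesleep(done) // returns once notewakeup has been called
}
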
type funcval struct {
fn uintptr
// variable-size, fn-specific data here
}
type iface struct {
tab *itab
data unsafe.Pointer
}
type eface struct {
_type *_type
data unsafe.Pointer
}
func efaceOf(ep *any) *eface {
return (*eface)(unsafe.Pointer(ep))
}
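
// Illustrative sketch (not part of the original file): efaceOf exposes the
// (type, data) pair behind an any value without allocating.
// exampleDynamicType is hypothetical.
func exampleDynamicType(i any) *_type {
	e := efaceOf(&i)
	return e._type // nil if i itself is nil
}
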
// The guintptr, muintptr, and puintptr are all used to bypass write barriers.
// It is particularly important to avoid write barriers when the current P has
// been released, because the GC thinks the world is stopped, and an
// unexpected write barrier would not be synchronized with the GC,
// which can lead to a half-executed write barrier that has marked the object
// but not queued it. If the GC skips the object and completes before the
// queuing can occur, it will incorrectly free the object.
//
// We tried using special assignment functions invoked only when not
// holding a running P, but then some updates to a particular memory
// word went through write barriers and some did not. This breaks the
// write barrier shadow checking mode, and it is also scary: better to have
// a word that is completely ignored by the GC than to have one for which
// only a few updates are ignored.
//
// Gs and Ps are always reachable via true pointers in the
// allgs and allp lists or (during allocation before they reach those lists)
// from stack variables.
//
// Ms are always reachable via true pointers either from allm or
// freem. Unlike Gs and Ps we do free Ms, so it's important that
// nothing ever hold an muintptr across a safe point.
// A guintptr holds a goroutine pointer, but typed as a uintptr
// to bypass write barriers. It is used in the Gobuf goroutine state
// and in scheduling lists that are manipulated without a P.
//
// The Gobuf.g goroutine pointer is almost always updated by assembly code.
// In one of the few places it is updated by Go code - func save - it must be
// treated as a uintptr to avoid a write barrier being emitted at a bad time.
// Instead of figuring out how to emit the write barriers missing in the
// assembly manipulation, we change the type of the field to uintptr,
// so that it does not require write barriers at all.
//
// Goroutine structs are published in the allg list and never freed.
// That will keep the goroutine structs from being collected.
// There is never a time that Gobuf.g's contain the only references
// to a goroutine: the publishing of the goroutine in allg comes first.
// Goroutine pointers are also kept in non-GC-visible places like TLS,
// so I can't see them ever moving. If we did want to start moving data
// in the GC, we'd need to allocate the goroutine structs from an
// alternate arena. Using guintptr doesn't make that problem any worse.
// Note that pollDesc.rg, pollDesc.wg also store g in uintptr form,
// so they would need to be updated too if g's start moving.
type guintptr uintptr
//go:nosplit
func (gp guintptr) ptr() *g { return (*g)(unsafe.Pointer(gp)) }
//go:nosplit
func (gp *guintptr) set(g *g) { *gp = guintptr(unsafe.Pointer(g)) }
//go:nosplit
func (gp *guintptr) cas(old, new guintptr) bool {
return atomic.Casuintptr((*uintptr)(unsafe.Pointer(gp)), uintptr(old), uintptr(new))
}
// setGNoWB performs *gp = new without a write barrier.
// For times when it's impractical to use a guintptr.
//go:nosplit
//go:nowritebarrier
func setGNoWB(gp **g, new *g) {
(*guintptr)(unsafe.Pointer(gp)).set(new)
}
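
// Illustrative sketch (not part of the original file): pushing a G onto an
// intrusive, barrier-free list head using guintptr, in the spirit of the
// scheduler's run queues. examplePush and its head parameter are hypothetical.
//go:nosplit
func examplePush(head *guintptr, gp *g) {
	gp.schedlink = *head // guintptr-to-guintptr copy: no write barrier
	head.set(gp)         // store the new head, again without a barrier
}
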
type puintptr uintptr
//go:nosplit
func (pp puintptr) ptr() *p { return (*p)(unsafe.Pointer(pp)) }
//go:nosplit
func (pp *puintptr) set(p *p) { *pp = puintptr(unsafe.Pointer(p)) }
// muintptr is a *m that is not tracked by the garbage collector.
//
// Because we do free Ms, there are some additional constraints on
// muintptrs:
//
// 1. Never hold an muintptr locally across a safe point.
//
// 2. Any muintptr in the heap must be owned by the M itself so it can
// ensure it is not in use when the last true *m is released.
type muintptr uintptr
//go:nosplit
func (mp muintptr) ptr() *m { return (*m)(unsafe.Pointer(mp)) }
//go:nosplit
func (mp *muintptr) set(m *m) { *mp = muintptr(unsafe.Pointer(m)) }
// setMNoWB performs *mp = new without a write barrier.
// For times when it's impractical to use an muintptr.
//go:nosplit
//go:nowritebarrier
func setMNoWB(mp **m, new *m) {
(*muintptr)(unsafe.Pointer(mp)).set(new)
}
type gobuf struct {
// The offsets of sp, pc, and g are known to (hard-coded in) libmach.
//
// ctxt is unusual with respect to GC: it may be a
// heap-allocated funcval, so GC needs to track it, but it
// needs to be set and cleared from assembly, where it's
// difficult to have write barriers. However, ctxt is really a
// saved, live register, and we only ever exchange it between
// the real register and the gobuf. Hence, we treat it as a
// root during stack scanning, which means assembly that saves
// and restores it doesn't need write barriers. It's still
// typed as a pointer so that any other writes from Go get
// write barriers.
sp uintptr
pc uintptr
g guintptr
ctxt unsafe.Pointer
ret uintptr
lr uintptr
bp uintptr // for framepointer-enabled architectures
}
// sudog represents a g in a wait list, such as for sending/receiving
// on a channel.
//
// sudog is necessary because the g ↔ synchronization object relation
// is many-to-many. A g can be on many wait lists, so there may be
// many sudogs for one g; and many gs may be waiting on the same
// synchronization object, so there may be many sudogs for one object.
//
// sudogs are allocated from a special pool. Use acquireSudog and
// releaseSudog to allocate and free them.
type sudog struct {
// The following fields are protected by the hchan.lock of the
// channel this sudog is blocking on. shrinkstack depends on
// this for sudogs involved in channel ops.
g *g
next *sudog
prev *sudog
elem unsafe.Pointer // data element (may point to stack)
// The following fields are never accessed concurrently.
// For channels, waitlink is only accessed by g.
// For semaphores, all fields (including the ones above)
// are only accessed when holding a semaRoot lock.
acquiretime int64
releasetime int64
ticket uint32
// isSelect indicates g is participating in a select, so
// g.selectDone must be CAS'd to win the wake-up race.
isSelect bool
// success indicates whether communication over channel c
// succeeded. It is true if the goroutine was awoken because a
// value was delivered over channel c, and false if awoken
// because c was closed.
success bool
parent *sudog // semaRoot binary tree
waitlink *sudog // g.waiting list or semaRoot
waittail *sudog // semaRoot
c *hchan // channel
}
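
// Illustrative sketch (not part of the original file): sudogs come from the
// pool via acquireSudog/releaseSudog, never from plain allocation.
// exampleBlockPrep is hypothetical and omits the real enqueueing and gopark
// machinery that channel code performs.
func exampleBlockPrep(c *hchan, elem unsafe.Pointer) *sudog {
	sg := acquireSudog()
	sg.g = getg()
	sg.elem = elem // may point into this goroutine's stack
	sg.c = c
	// A real caller links sg into c's wait queue, parks, and calls
	// releaseSudog(sg) after being woken.
	return sg
}
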
type libcall struct {
fn uintptr
n uintptr // number of parameters
args uintptr // parameters
r1 uintptr // return values
r2 uintptr
err uintptr // error number
}
// Stack describes a Go execution stack.
// The bounds of the stack are exactly [lo, hi),
// with no implicit data structures on either side.
type stack struct {
lo uintptr
hi uintptr
}
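
// Illustrative sketch (not part of the original file): because the bounds are
// the half-open interval [lo, hi), the usable size is simply hi-lo.
// exampleStackSize is hypothetical.
func exampleStackSize(s stack) uintptr {
	return s.hi - s.lo
}
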
// heldLockInfo gives info on a held lock and the rank of that lock
type heldLockInfo struct {
lockAddr uintptr
rank lockRank
}
type g struct {
// Stack parameters.
// stack describes the actual stack memory: [stack.lo, stack.hi).
// stackguard0 is the stack pointer compared in the Go stack growth prologue.
// It is stack.lo+StackGuard normally, but can be StackPreempt to trigger a preemption.
// stackguard1 is the stack pointer compared in the C stack growth prologue.
// It is stack.lo+StackGuard on g0 and gsignal stacks.
// It is ~0 on other goroutine stacks, to trigger a call to morestackc (and crash).
stack stack // offset known to runtime/cgo
stackguard0 uintptr // offset known to liblink
stackguard1 uintptr // offset known to liblink
_panic *_panic // innermost panic - offset known to liblink
_defer *_defer // innermost defer
m *m // current m; offset known to arm liblink
sched gobuf
syscallsp uintptr // if status==Gsyscall, syscallsp = sched.sp to use during gc
syscallpc uintptr // if status==Gsyscall, syscallpc = sched.pc to use during gc
stktopsp uintptr // expected sp at top of stack, to check in traceback
// param is a generic pointer parameter field used to pass
// values in particular contexts where other storage for the
// parameter would be difficult to find. It is currently used
// in three ways:
// 1. When a channel operation wakes up a blocked goroutine, it sets param to
// point to the sudog of the completed blocking operation.
// 2. By gcAssistAlloc1 to signal back to its caller that the goroutine completed
// the GC cycle. It is unsafe to do so in any other way, because the goroutine's
// stack may have moved in the meantime.
// 3. By debugCallWrap to pass parameters to a new goroutine because allocating a
// closure in the runtime is forbidden.
param unsafe.Pointer
atomicstatus uint32
stackLock uint32 // sigprof/scang lock; TODO: fold in to atomicstatus
goid int64
schedlink guintptr
waitsince int64 // approx time when the g became blocked
waitreason waitReason // if status==Gwaiting
preempt bool // preemption signal, duplicates stackguard0 = stackpreempt
preemptStop bool // transition to _Gpreempted on preemption; otherwise, just deschedule
preemptShrink bool // shrink stack at synchronous safe point
// asyncSafePoint is set if g is stopped at an asynchronous
// safe point. This means there are frames on the stack
// without precise pointer information.
asyncSafePoint bool
paniconfault bool // panic (instead of crash) on unexpected fault address
gcscandone bool // g has scanned stack; protected by _Gscan bit in status
throwsplit bool // must not split stack
// activeStackChans indicates that there are unlocked channels
// pointing into this goroutine's stack. If true, stack
// copying needs to acquire channel locks to protect these
// areas of the stack.
activeStackChans bool
// parkingOnChan indicates that the goroutine is about to
// park on a chansend or chanrecv. Used to signal an unsafe point
// for stack shrinking. It's a boolean value, but is updated atomically.
parkingOnChan uint8
raceignore int8 // ignore race detection events
sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine
tracking bool // whether we're tracking this G for sched latency statistics
trackingSeq uint8 // used to decide whether to track this G
runnableStamp int64 // timestamp of when the G last became runnable, only used when tracking
runnableTime int64 // the amount of time spent runnable, cleared when running, only used when tracking
sysexitticks int64 // cputicks when syscall has returned (for tracing)
traceseq uint64 // trace event sequencer
tracelastp puintptr // last P that emitted an event for this goroutine
lockedm muintptr
sig uint32
writebuf []byte
sigcode0 uintptr
sigcode1 uintptr
sigpc uintptr
gopc uintptr // pc of go statement that created this goroutine
ancestors *[]ancestorInfo // ancestor information for the goroutine(s) that created this goroutine (only used if debug.tracebackancestors)
startpc uintptr // pc of goroutine function
racectx uintptr
waiting *sudog // sudog structures this g is waiting on (that have a valid elem ptr); in lock order
cgoCtxt []uintptr // cgo traceback context
labels unsafe.Pointer // profiler labels
timer *timer // cached timer for time.Sleep
selectDone uint32 // are we participating in a select and did someone win the race?
// Per-G GC state
// gcAssistBytes is this G's GC assist credit in terms of
// bytes allocated. If this is positive, then the G has credit
// to allocate gcAssistBytes bytes without assisting. If this
// is negative, then the G must correct this by performing
// scan work. We track this in bytes to make it fast to update
// and check for debt in the malloc hot path. The assist ratio
// determines how this corresponds to scan work debt.
gcAssistBytes int64
}
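
// The sketch below is illustrative only (not part of the runtime): it shows how
// a byte-denominated credit like gcAssistBytes above keeps the allocation hot
// path to a signed subtract-and-compare, with only a negative balance forcing
// the slow assist path. exampleChargeAlloc and its parameters are hypothetical.
func exampleChargeAlloc(credit *int64, size int64, assist func(debt int64)) {
	*credit -= size // allocating consumes credit
	if *credit < 0 {
		assist(-*credit) // in debt: do scan work to earn the credit back
		*credit = 0
	}
}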
// gTrackingPeriod is the number of transitions out of _Grunning between
// latency tracking runs.
const gTrackingPeriod = 8
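
// Hedged illustration (not the runtime's code) of how a small per-G sequence
// counter and gTrackingPeriod can sample roughly one in every gTrackingPeriod
// transitions for latency tracking; exampleShouldTrack is hypothetical.
func exampleShouldTrack(seq *uint8) bool {
	*seq++
	return *seq%gTrackingPeriod == 0
}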
const (
// tlsSlots is the number of pointer-sized slots reserved for TLS on some platforms,
// like Windows.
tlsSlots = 6
tlsSize = tlsSlots * goarch.PtrSize
)
type m struct {
g0 *g // goroutine with scheduling stack
morebuf gobuf // gobuf arg to morestack
divmod uint32 // div/mod denominator for arm - known to liblink
// Fields not known to debuggers.
procid uint64 // for debuggers, but offset not hard-coded
gsignal *g // signal-handling g
goSigStack gsignalStack // Go-allocated signal handling stack
sigmask sigset // storage for saved signal mask
tls [tlsSlots]uintptr // thread-local storage (for x86 extern register)
mstartfn func()
curg *g // current running goroutine
caughtsig guintptr // goroutine running during fatal signal
p puintptr // attached p for executing go code (nil if not executing go code)
nextp puintptr
oldp puintptr // the p that was attached before executing a syscall
id int64
mallocing int32
throwing int32
preemptoff string // if != "", keep curg running on this m
locks int32
dying int32
profilehz int32
spinning bool // m is out of work and is actively looking for work
blocked bool // m is blocked on a note
newSigstack bool // minit on C thread called sigaltstack
printlock int8
incgo bool // m is executing a cgo call
freeWait uint32 // if == 0, safe to free g0 and delete m (atomic)
fastrand uint64
needextram bool
traceback uint8
ncgocall uint64 // number of cgo calls in total
ncgo int32 // number of cgo calls currently in progress
cgoCallersUse uint32 // if non-zero, cgoCallers in use temporarily
cgoCallers *cgoCallers // cgo traceback if crashing in cgo call
park note
alllink *m // on allm
schedlink muintptr
lockedg guintptr
createstack [32]uintptr // stack that created this thread.
lockedExt uint32 // tracking for external LockOSThread
lockedInt uint32 // tracking for internal lockOSThread
nextwaitm muintptr // next m waiting for lock
waitunlockf func(*g, unsafe.Pointer) bool
waitlock unsafe.Pointer
waittraceev byte
waittraceskip int
startingtrace bool
syscalltick uint32
freelink *m // on sched.freem
// these are here because they are too large to be on the stack
// of low-level NOSPLIT functions.
libcall libcall
libcallpc uintptr // for cpu profiler
libcallsp uintptr
libcallg guintptr
syscall libcall // stores syscall parameters on windows
vdsoSP uintptr // SP for traceback while in VDSO call (0 if not in call)
vdsoPC uintptr // PC for traceback while in VDSO call
// preemptGen counts the number of completed preemption
// signals. This is used to detect when a preemption is
// requested, but fails. Accessed atomically.
preemptGen uint32
// Whether this is a pending preemption signal on this M.
// Accessed atomically.
signalPending uint32
dlogPerM
mOS
// Up to 10 locks held by this m, maintained by the lock ranking code.
locksHeldLen int
locksHeld [10]heldLockInfo
}
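
// Hedged sketch of the preemptGen protocol documented on m above: a requester
// records the completed-preemption count, sends the signal, and later checks
// whether the count advanced to distinguish a handled preemption from one that
// was dropped. examplePreemptionCompleted is hypothetical; only atomic.Load is
// a real call (from runtime/internal/atomic, which this file already imports).
func examplePreemptionCompleted(preemptGen *uint32, observedBefore uint32) bool {
	return atomic.Load(preemptGen) != observedBefore
}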
type p struct {
id int32
status uint32 // one of pidle/prunning/...
link puintptr
schedtick uint32 // incremented on every scheduler call
syscalltick uint32 // incremented on every system call
sysmontick sysmontick // last tick observed by sysmon
m muintptr // back-link to associated m (nil if idle)
mcache *mcache
pcache pageCache
raceprocctx uintptr
deferpool []*_defer // pool of available defer structs (see panic.go)
deferpoolbuf [32]*_defer
// Cache of goroutine ids, amortizes accesses to runtime·sched.goidgen.
goidcache uint64
goidcacheend uint64
// Queue of runnable goroutines. Accessed without lock.
runqhead uint32
runqtail uint32
runq [256]guintptr
// runnext, if non-nil, is a runnable G that was ready'd by
// the current G and should be run next instead of what's in
// runq if there's time remaining in the running G's time
// slice. It will inherit the time left in the current time
// slice. If a set of goroutines is locked in a
// communicate-and-wait pattern, this schedules that set as a
// unit and eliminates the (potentially large) scheduling
// latency that otherwise arises from adding the ready'd
// goroutines to the end of the run queue.
//
// Note that while other P's may atomically CAS this to zero,
// only the owner P can CAS it to a valid G.
runnext guintptr
// Available G's (status == Gdead)
gFree struct {
gList
n int32
}
sudogcache []*sudog
sudogbuf [128]*sudog
// Cache of mspan objects from the heap.
mspancache struct {
// We need an explicit length here because this field is used
// in allocation codepaths where write barriers are not allowed,
// and eliminating the write barrier/keeping it eliminated from
// slice updates is tricky, more so than just managing the length
// ourselves.
len int
buf [128]*mspan
}
tracebuf traceBufPtr
// traceSweep indicates the sweep events should be traced.
// This is used to defer the sweep start event until a span
// has actually been swept.
traceSweep bool
// traceSwept and traceReclaimed track the number of bytes
// swept and reclaimed by sweeping in the current sweep loop.
traceSwept, traceReclaimed uintptr
palloc persistentAlloc // per-P to avoid mutex
_ uint32 // Alignment for atomic fields below
// The when field of the first entry on the timer heap.
// This is updated using atomic functions.
// This is 0 if the timer heap is empty.
timer0When uint64
// The earliest known nextwhen field of a timer with
// timerModifiedEarlier status. Because the timer may have been
// modified again, there need not be any timer with this value.
// This is updated using atomic functions.
// This is 0 if there are no timerModifiedEarlier timers.
timerModifiedEarliest uint64
// Per-P GC state
gcAssistTime int64 // Nanoseconds in assistAlloc
gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker (atomic)
// gcMarkWorkerMode is the mode for the next mark worker to run in.
// That is, this is used to communicate with the worker goroutine
// selected for immediate execution by
// gcController.findRunnableGCWorker. When scheduling other goroutines,
// this field must be set to gcMarkWorkerNotWorker.
gcMarkWorkerMode gcMarkWorkerMode
// gcMarkWorkerStartTime is the nanotime() at which the most recent
// mark worker started.
gcMarkWorkerStartTime int64
// gcw is this P's GC work buffer cache. The work buffer is
// filled by write barriers, drained by mutator assists, and
// disposed on certain GC state transitions.
gcw gcWork
// wbBuf is this P's GC write barrier buffer.
//
// TODO: Consider caching this in the running G.
wbBuf wbBuf
runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point
// statsSeq is a counter indicating whether this P is currently
// writing any stats. Its value is even when not, odd when it is.
statsSeq uint32
// Lock for timers. We normally access the timers while running
// on this P, but the scheduler can also do it from a different P.
timersLock mutex
// Actions to take at some time. This is used to implement the
// standard library's time package.
// Must hold timersLock to access.
timers []*timer
// Number of timers in P's heap.
// Modified using atomic instructions.
numTimers uint32
// Number of timerDeleted timers in P's heap.
// Modified using atomic instructions.
deletedTimers uint32
// Race context used while executing timer functions.
timerRaceCtx uintptr
// scannableStackSizeDelta accumulates the amount of stack space held by
// live goroutines (i.e. those eligible for stack scanning).
// Flushed to gcController.scannableStackSize once scannableStackSizeSlack
// or -scannableStackSizeSlack is reached.
scannableStackSizeDelta int64
// preempt is set to indicate that this P should enter the
// scheduler ASAP (regardless of what G is running on it).
preempt bool
// Padding is no longer needed. False sharing is now not a worry because p is large enough
// that its size class is an integer multiple of the cache line size (for any of our architectures).
}
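
// Hedged sketch (not the scheduler) of the local run queue layout described in
// the p struct above: a fixed ring addressed by free-running head/tail counters
// plus a single runnext slot that is consulted first, so a goroutine readied by
// the running G inherits the rest of the time slice. All names here are
// hypothetical; the real code is lock-free and this version elides atomics.
type exampleRunQueue struct {
	head, tail uint32
	ring       [256]int // stand-in for [256]guintptr
	runnext    int      // 0 means empty in this sketch
}

func (q *exampleRunQueue) get() (int, bool) {
	if g := q.runnext; g != 0 {
		q.runnext = 0
		return g, true // runnext takes priority over the ring
	}
	if q.head == q.tail {
		return 0, false // queue is empty
	}
	g := q.ring[q.head%uint32(len(q.ring))]
	q.head++
	return g, true
}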
type schedt struct {
// accessed atomically. keep at top to ensure alignment on 32-bit systems.
goidgen uint64
lastpoll uint64 // time of last network poll, 0 if currently polling
pollUntil uint64 // time to which current poll is sleeping
lock mutex
// When increasing nmidle, nmidlelocked, nmsys, or nmfreed, be
// sure to call checkdead().
midle muintptr // idle m's waiting for work
nmidle int32 // number of idle m's waiting for work
nmidlelocked int32 // number of locked m's waiting for work
mnext int64 // number of m's that have been created and next M ID
maxmcount int32 // maximum number of m's allowed (or die)
nmsys int32 // number of system m's not counted for deadlock
nmfreed int64 // cumulative number of freed m's
ngsys uint32 // number of system goroutines; updated atomically
pidle puintptr // idle p's
npidle uint32
nmspinning uint32 // See "Worker thread parking/unparking" comment in proc.go.
// Global runnable queue.
runq gQueue
runqsize int32
// disable controls selective disabling of the scheduler.
//
// Use schedEnableUser to control this.
//
// disable is protected by sched.lock.
disable struct {
// user disables scheduling of user goroutines.
user bool
runnable gQueue // pending runnable Gs
n int32 // length of runnable
}
// Global cache of dead G's.
gFree struct {
lock mutex
stack gList // Gs with stacks
noStack gList // Gs without stacks
n int32
}
// Central cache of sudog structs.
sudoglock mutex
sudogcache *sudog
// Central pool of available defer structs.
deferlock mutex
deferpool *_defer
// freem is the list of m's waiting to be freed when their
// m.exited is set. Linked through m.freelink.
freem *m
gcwaiting uint32 // gc is waiting to run
stopwait int32
stopnote note
sysmonwait uint32
sysmonnote note
// safepointFn should be called on each P at the next GC
// safepoint if p.runSafePointFn is set.
safePointFn func(*p)
safePointWait int32
safePointNote note
profilehz int32 // cpu profiling rate
procresizetime int64 // nanotime() of last change to gomaxprocs
totaltime int64 // ∫gomaxprocs dt up to procresizetime
// sysmonlock protects sysmon's actions on the runtime.
//
// Acquire and hold this mutex to block sysmon from interacting
// with the rest of the runtime.
sysmonlock mutex
// timeToRun is a distribution of scheduling latencies, defined
// as the sum of time a G spends in the _Grunnable state before
// it transitions to _Grunning.
//
// timeToRun is protected by sched.lock.
timeToRun timeHistogram
}
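
// Hedged sketch tying the g fields runnableStamp/runnableTime to the
// sched.timeToRun histogram above: runnable time accumulates across the
// _Grunnable stretches of a tracked G and is recorded once the G runs. The
// helper and the record callback are hypothetical; the real bookkeeping lives
// in the status-transition code.
func exampleRecordScheduleLatency(runnableTime *int64, becameRunnable, startedRunning int64, record func(int64)) {
	*runnableTime += startedRunning - becameRunnable
	record(*runnableTime)
	*runnableTime = 0 // cleared when the G starts running
}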
// Values for the flags field of a sigTabT.
const (
_SigNotify = 1 << iota // let signal.Notify have signal, even if from kernel
_SigKill // if signal.Notify doesn't take it, exit quietly
_SigThrow // if signal.Notify doesn't take it, exit loudly
_SigPanic // if the signal is from the kernel, panic
_SigDefault // if the signal isn't explicitly requested, don't monitor it
_SigGoExit // cause all runtime procs to exit (only used on Plan 9).
_SigSetStack // Don't explicitly install handler, but add SA_ONSTACK to existing libc handler
_SigUnblock // always unblock; see blockableSig
_SigIgn // _SIG_DFL action is to ignore the signal
)
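
// Hedged illustration of how the _Sig* bits above combine in a per-signal
// flags word; exampleShouldPanic is hypothetical, not the runtime's sigTabT
// handling.
func exampleShouldPanic(flags int32, fromKernel bool) bool {
	return fromKernel && flags&_SigPanic != 0
}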
// Layout of in-memory per-function information prepared by linker
// See https://golang.org/s/go12symtab.
// Keep in sync with linker (../cmd/link/internal/ld/pcln.go:/pclntab)
// and with package debug/gosym and with symtab.go in package runtime.
type _func struct {
entryoff uint32 // start pc, as offset from moduledata.text/pcHeader.textStart
nameoff int32 // function name
args int32 // in/out args size
deferreturn uint32 // offset of start of a deferreturn call instruction from entry, if any.
pcsp uint32
pcfile uint32
pcln uint32
npcdata uint32
cuOffset uint32 // runtime.cutab offset of this function's CU
funcID funcID // set for certain special runtime functions
flag funcFlag
_ [1]byte // pad
nfuncdata uint8 // must be last, must end on a uint32-aligned boundary
}
// Pseudo-Func that is returned for PCs that occur in inlined code.
// A *Func can be either a *_func or a *funcinl, and they are distinguished
// by the first 32-bit word (a funcinl stores ^0 there; see ones below).
type funcinl struct {
ones uint32 // set to ^0 to distinguish from _func
entry uintptr // entry of the real (the "outermost") frame
name string
file string
line int
}
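// Illustrative sketch only, assuming an unsafe.Pointer p known to refer to one
// of the two layouts (the runtime's real accessors, e.g. in symtab.go, perform
// the equivalent check); it shows how the two layouts are disambiguated:
//
//	fi := (*funcinl)(p)
//	if fi.ones == ^uint32(0) {
//		// Inlined pseudo-Func: fi.entry, fi.name, fi.file, fi.line are valid.
//	} else {
//		// Real function: reinterpret p as a *_func instead.
//	}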
// layout of Itab known to compilers
// allocated in non-garbage-collected memory
// Needs to be in sync with
// ../cmd/compile/internal/reflectdata/reflect.go:/^func.WriteTabs.
type itab struct {
inter *interfacetype
_type *_type
hash uint32 // copy of _type.hash. Used for type switches.
_ [4]byte
fun [1]uintptr // variable sized. fun[0]==0 means _type does not implement inter.
}
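// Illustrative sketch (assumed shapes for exposition, not runtime code): an
// interface value pairs an *itab with a data word; calling the i'th interface
// method loads tab.fun[i] and invokes it with the data word as the receiver.
// An itab whose fun[0] == 0 is never used for calls; it caches the negative
// result of a failed type assertion, roughly:
//
//	if tab == nil || tab.fun[0] == 0 {
//		// The concrete type does not implement inter: a comma-ok
//		// assertion reports false; a bare assertion panics.
//	}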
// Lock-free stack node.
// Also known to export_test.go.
type lfnode struct {
next uint64
pushcnt uintptr
}
type forcegcstate struct {
lock mutex
g *g
idle uint32
}
// extendRandom extends the random numbers in r[:n] to the whole slice r.
// Treats n<0 as n==0.
func extendRandom(r []byte, n int) {
if n < 0 {
n = 0
}
for n < len(r) {
// Extend random bits using hash function & time seed
w := n
if w > 16 {
w = 16
}
h := memhash(unsafe.Pointer(&r[n-w]), uintptr(nanotime()), uintptr(w))
for i := 0; i < goarch.PtrSize && n < len(r); i++ {
r[n] = byte(h)
n++
h >>= 8
}
}
}
// A _defer holds an entry on the list of deferred calls.
// If you add a field here, add code to clear it in deferProcStack.
// This struct must match the code in cmd/compile/internal/ssagen/ssa.go:deferstruct
// and cmd/compile/internal/ssagen/ssa.go:(*state).call.
// Some defers will be allocated on the stack and some on the heap.
// All defers are logically part of the stack, so write barriers to
// initialize them are not required. All defers must be manually scanned,
// and for heap defers, marked.
type _defer struct {
started bool
heap bool
// openDefer indicates that this _defer is for a frame with open-coded
// defers. We have only one defer record for the entire frame (which may
// currently have 0, 1, or more defers active).
openDefer bool
sp uintptr // sp at time of defer
pc uintptr // pc at time of defer
fn func() // can be nil for open-coded defers
_panic *_panic // panic that is running defer
link *_defer // next defer on G; can point to either heap or stack!
// If openDefer is true, the fields below record values about the stack
// frame and associated function that has the open-coded defer(s). sp
// above will be the sp for the frame, and pc will be address of the
// deferreturn call in the function.
fd unsafe.Pointer // funcdata for the function associated with the frame
varp uintptr // value of varp for the stack frame
// framepc is the current pc associated with the stack frame. Together
// with sp above (which is the sp associated with the stack frame),
// framepc/sp can be used as a pc/sp pair to continue a stack trace via
// gentraceback().
framepc uintptr
}
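// Rough sketch (hedged) of how a heap- or stack-allocated record is consumed
// at function exit; the authoritative logic is deferreturn in panic.go, which
// additionally handles openDefer frames via the funcdata described above:
//
//	d := gp._defer
//	if d == nil || d.sp != getcallersp() {
//		return // no defers for this frame
//	}
//	fn := d.fn
//	gp._defer = d.link // pop first, so a nested panic sees the right chain
//	freedefer(d)
//	fn()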
// A _panic holds information about an active panic.
//
// A _panic value must only ever live on the stack.
//
// The argp and link fields are stack pointers, but don't need special
// handling during stack growth: because they are pointer-typed and
// _panic values only live on the stack, regular stack pointer
// adjustment takes care of them.
type _panic struct {
argp unsafe.Pointer // pointer to arguments of deferred call run during panic; cannot move - known to liblink
arg any // argument to panic
link *_panic // link to earlier panic
pc uintptr // where to return to in runtime if this panic is bypassed
sp unsafe.Pointer // where to return to in runtime if this panic is bypassed
recovered bool // whether this panic is over
aborted bool // the panic was aborted
goexit bool
}
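// Hedged sketch of how a record is linked when a panic starts (approximately
// what gopanic in panic.go does); the struct is allocated in the panicking
// frame itself, which is why it may only ever live on the stack:
//
//	var p _panic
//	p.arg = e          // the value passed to panic(e)
//	p.link = gp._panic // push onto the G's panic chain
//	gp._panic = (*_panic)(noescape(unsafe.Pointer(&p)))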
// stack traces
type stkframe struct {
fn funcInfo // function being run
pc uintptr // program counter within fn
continpc uintptr // program counter where execution can continue, or 0 if not
lr uintptr // program counter at caller aka link register
sp uintptr // stack pointer at pc
fp uintptr // stack pointer at caller aka frame pointer
varp uintptr // top of local variables
argp uintptr // pointer to function arguments
arglen uintptr // number of bytes at argp
argmap *bitvector // force use of this argmap
}
// ancestorInfo records details of where a goroutine was started.
type ancestorInfo struct {
pcs []uintptr // pcs from the stack of this goroutine
goid int64 // goroutine id of this goroutine; original goroutine possibly dead
gopc uintptr // pc of go statement that created this goroutine
}
const (
_TraceRuntimeFrames = 1 << iota // include frames for internal runtime functions.
_TraceTrap // the initial PC, SP are from a trap, not a return PC from a call
_TraceJumpStack // if traceback is on a systemstack, resume trace at g that called into it
)
// The maximum number of frames we print for a traceback
const _TracebackMaxFrames = 100
// A waitReason explains why a goroutine has been stopped.
// See gopark. Do not re-use waitReasons; add new ones.
type waitReason uint8
const (
waitReasonZero waitReason = iota // ""
waitReasonGCAssistMarking // "GC assist marking"
waitReasonIOWait // "IO wait"
waitReasonChanReceiveNilChan // "chan receive (nil chan)"
waitReasonChanSendNilChan // "chan send (nil chan)"
waitReasonDumpingHeap // "dumping heap"
waitReasonGarbageCollection // "garbage collection"
waitReasonGarbageCollectionScan // "garbage collection scan"
waitReasonPanicWait // "panicwait"
waitReasonSelect // "select"
waitReasonSelectNoCases // "select (no cases)"
waitReasonGCAssistWait // "GC assist wait"
waitReasonGCSweepWait // "GC sweep wait"
waitReasonGCScavengeWait // "GC scavenge wait"
waitReasonChanReceive // "chan receive"
waitReasonChanSend // "chan send"
waitReasonFinalizerWait // "finalizer wait"
waitReasonForceGCIdle // "force gc (idle)"
waitReasonSemacquire // "semacquire"
waitReasonSleep // "sleep"
waitReasonSyncCondWait // "sync.Cond.Wait"
waitReasonTimerGoroutineIdle // "timer goroutine (idle)"
waitReasonTraceReaderBlocked // "trace reader (blocked)"
waitReasonWaitForGCCycle // "wait for GC cycle"
waitReasonGCWorkerIdle // "GC worker (idle)"
waitReasonPreempted // "preempted"
waitReasonDebugCall // "debug call"
)
var waitReasonStrings = [...]string{
waitReasonZero: "",
waitReasonGCAssistMarking: "GC assist marking",
waitReasonIOWait: "IO wait",
waitReasonChanReceiveNilChan: "chan receive (nil chan)",
waitReasonChanSendNilChan: "chan send (nil chan)",
waitReasonDumpingHeap: "dumping heap",
waitReasonGarbageCollection: "garbage collection",
waitReasonGarbageCollectionScan: "garbage collection scan",
waitReasonPanicWait: "panicwait",
waitReasonSelect: "select",
waitReasonSelectNoCases: "select (no cases)",
waitReasonGCAssistWait: "GC assist wait",
waitReasonGCSweepWait: "GC sweep wait",
waitReasonGCScavengeWait: "GC scavenge wait",
waitReasonChanReceive: "chan receive",
waitReasonChanSend: "chan send",
waitReasonFinalizerWait: "finalizer wait",
waitReasonForceGCIdle: "force gc (idle)",
waitReasonSemacquire: "semacquire",
waitReasonSleep: "sleep",
waitReasonSyncCondWait: "sync.Cond.Wait",
waitReasonTimerGoroutineIdle: "timer goroutine (idle)",
waitReasonTraceReaderBlocked: "trace reader (blocked)",
waitReasonWaitForGCCycle: "wait for GC cycle",
waitReasonGCWorkerIdle: "GC worker (idle)",
waitReasonPreempted: "preempted",
waitReasonDebugCall: "debug call",
}
func (w waitReason) String() string {
if w < 0 || w >= waitReason(len(waitReasonStrings)) {
return "unknown wait reason"
}
return waitReasonStrings[w]
}
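// Hedged usage note: blocking call sites hand a waitReason to gopark, and the
// reason resurfaces (via String above) in goroutine dumps and scheduler
// traces. For example, a receive on an empty channel parks with approximately:
//
//	gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanReceive, traceEvGoBlockRecv, 2)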
var (
allm *m
gomaxprocs int32
ncpu int32
forcegc forcegcstate
sched schedt
newprocs int32
// allpLock protects P-less reads and size changes of allp, idlepMask,
// and timerpMask, and all writes to allp.
allpLock mutex
// len(allp) == gomaxprocs; may change at safe points, otherwise
// immutable.
allp []*p
// Bitmask of Ps in _Pidle list, one bit per P. Reads and writes must
// be atomic. Length may change at safe points.
//
// Each P must update only its own bit. In order to maintain
// consistency, a P going idle must update the idle mask simultaneously with
// updates to the idle P list under the sched.lock, otherwise a racing
// pidleget may clear the mask before pidleput sets the mask,
// corrupting the bitmap (a sketch of this ordering follows this var block).
//
// N.B., procresize takes ownership of all Ps in stopTheWorldWithSema.
idlepMask pMask
// Bitmask of Ps that may have a timer, one bit per P. Reads and writes
// must be atomic. Length may change at safe points.
timerpMask pMask
// Pool of GC parked background workers. Entries are type
// *gcBgMarkWorkerNode.
gcBgMarkWorkerPool lfstack
// Total number of gcBgMarkWorker goroutines. Protected by worldsema.
gcBgMarkWorkerCount int32
// Information about what cpu features are available.
// Packages outside the runtime should not use these
// as they are not an external api.
// Set on startup in asm_{386,amd64}.s
processorVersionInfo uint32
isIntel bool
goarm uint8 // set by cmd/link on arm systems
)
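// Hedged sketch of the idlepMask ordering documented above: both sides run
// with sched.lock held (the real code is pidleput and pidleget in proc.go,
// which also maintain timerpMask and sched.npidle):
//
//	// pidleput, with sched.lock held:
//	idlepMask.set(pp.id)
//	pp.link = sched.pidle
//	sched.pidle.set(pp)
//
//	// pidleget, with sched.lock held:
//	pp := sched.pidle.ptr()
//	if pp != nil {
//		sched.pidle = pp.link
//		idlepMask.clear(pp.id)
//	}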
// Set by the linker so the runtime can determine the buildmode.
var (
islibrary bool // -buildmode=c-shared
isarchive bool // -buildmode=c-archive
)
// Must agree with internal/buildcfg.Experiment.FramePointer.
const framepointer_enabled = GOARCH == "amd64" || GOARCH == "arm64"