go/src/runtime/export_test.go

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Export guts for testing.
package runtime
import (
"internal/goarch"
"internal/goos"
"runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
)
var Fadd64 = fadd64
var Fsub64 = fsub64
var Fmul64 = fmul64
var Fdiv64 = fdiv64
var F64to32 = f64to32
var F32to64 = f32to64
var Fcmp64 = fcmp64
var Fintto64 = fintto64
var F64toint = f64toint
var Entersyscall = entersyscall
var Exitsyscall = exitsyscall
var LockedOSThread = lockedOSThread
var Xadduintptr = atomic.Xadduintptr
var Fastlog2 = fastlog2
var Atoi = atoi
var Atoi32 = atoi32
var ParseByteCount = parseByteCount
var Nanotime = nanotime
var NetpollBreak = netpollBreak
var Usleep = usleep
var PhysPageSize = physPageSize
var PhysHugePageSize = physHugePageSize
var NetpollGenericInit = netpollGenericInit
var Memmove = memmove
var MemclrNoHeapPointers = memclrNoHeapPointers
var LockPartialOrder = lockPartialOrder
type LockRank lockRank
func (l LockRank) String() string {
return lockRank(l).String()
}
const PreemptMSupported = preemptMSupported
type LFNode struct {
Next uint64
Pushcnt uintptr
}
func LFStackPush(head *uint64, node *LFNode) {
(*lfstack)(head).push((*lfnode)(unsafe.Pointer(node)))
}
func LFStackPop(head *uint64) *LFNode {
return (*LFNode)(unsafe.Pointer((*lfstack)(head).pop()))
}
func LFNodeValidate(node *LFNode) {
lfnodeValidate((*lfnode)(unsafe.Pointer(node)))
}
func Netpoll(delta int64) {
systemstack(func() {
netpoll(delta)
})
}
func GCMask(x any) (ret []byte) {
systemstack(func() {
ret = getgcmask(x)
})
return
}
func RunSchedLocalQueueTest() {
pp := new(p)
gs := make([]g, len(pp.runq))
Escape(gs) // Ensure gs doesn't move, since we use guintptrs
for i := 0; i < len(pp.runq); i++ {
if g, _ := runqget(pp); g != nil {
throw("runq is not empty initially")
}
for j := 0; j < i; j++ {
runqput(pp, &gs[i], false)
}
for j := 0; j < i; j++ {
if g, _ := runqget(pp); g != &gs[i] {
print("bad element at iter ", i, "/", j, "\n")
throw("bad element")
}
}
if g, _ := runqget(pp); g != nil {
throw("runq is not empty afterwards")
}
}
}
func RunSchedLocalQueueStealTest() {
p1 := new(p)
p2 := new(p)
gs := make([]g, len(p1.runq))
Escape(gs) // Ensure gs doesn't move, since we use guintptrs
for i := 0; i < len(p1.runq); i++ {
for j := 0; j < i; j++ {
gs[j].sig = 0
runqput(p1, &gs[j], false)
}
gp := runqsteal(p2, p1, true)
s := 0
if gp != nil {
s++
gp.sig++
}
for {
gp, _ = runqget(p2)
if gp == nil {
break
}
s++
gp.sig++
}
for {
gp, _ = runqget(p1)
if gp == nil {
break
}
gp.sig++
}
for j := 0; j < i; j++ {
if gs[j].sig != 1 {
print("bad element ", j, "(", gs[j].sig, ") at iter ", i, "\n")
throw("bad element")
}
}
if s != i/2 && s != i/2+1 {
print("bad steal ", s, ", want ", i/2, " or ", i/2+1, ", iter ", i, "\n")
throw("bad steal")
}
}
}
func RunSchedLocalQueueEmptyTest(iters int) {
// Test that runq is not spuriously reported as empty.
// Runq emptiness affects scheduling decisions and spurious emptiness
// can lead to underutilization (both runnable Gs and idle Ps coexist
// for an arbitrarily long time).
done := make(chan bool, 1)
p := new(p)
gs := make([]g, 2)
Escape(gs) // Ensure gs doesn't move, since we use guintptrs
ready := new(uint32)
for i := 0; i < iters; i++ {
*ready = 0
next0 := (i & 1) == 0
next1 := (i & 2) == 0
runqput(p, &gs[0], next0)
go func() {
for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; {
}
if runqempty(p) {
println("next:", next0, next1)
throw("queue is empty")
}
done <- true
}()
for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; {
}
runqput(p, &gs[1], next1)
runqget(p)
<-done
runqget(p)
}
}
var (
StringHash = stringHash
BytesHash = bytesHash
Int32Hash = int32Hash
Int64Hash = int64Hash
MemHash = memhash
MemHash32 = memhash32
MemHash64 = memhash64
EfaceHash = efaceHash
IfaceHash = ifaceHash
)
var UseAeshash = &useAeshash
func MemclrBytes(b []byte) {
s := (*slice)(unsafe.Pointer(&b))
memclrNoHeapPointers(s.array, uintptr(s.len))
}
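// memclrBytesSketch is an illustrative usage sketch for MemclrBytes above:
// the wrapper zeroes the backing array of a byte slice in place via
// memclrNoHeapPointers. The sample data is hypothetical.
func memclrBytesSketch() []byte {
	b := []byte{1, 2, 3, 4}
	MemclrBytes(b)
	return b // now {0, 0, 0, 0}
}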
const HashLoad = hashLoad
// entry point for testing
func GostringW(w []uint16) (s string) {
systemstack(func() {
s = gostringw(&w[0])
})
return
}
var Open = open
var Close = closefd
var Read = read
var Write = write
func Envs() []string { return envs }
func SetEnvs(e []string) { envs = e }
// For benchmarking.
func BenchSetType(n int, x any) {
e := *efaceOf(&x)
t := e._type
var size uintptr
var p unsafe.Pointer
switch t.kind & kindMask {
case kindPtr:
t = (*ptrtype)(unsafe.Pointer(t)).elem
size = t.size
p = e.data
case kindSlice:
slice := *(*struct {
ptr unsafe.Pointer
len, cap uintptr
})(e.data)
t = (*slicetype)(unsafe.Pointer(t)).elem
size = t.size * slice.len
p = slice.ptr
}
allocSize := roundupsize(size)
systemstack(func() {
for i := 0; i < n; i++ {
heapBitsSetType(uintptr(p), allocSize, size, t)
}
})
}
const PtrSize = goarch.PtrSize
var ForceGCPeriod = &forcegcperiod
// SetTracebackEnv is like runtime/debug.SetTraceback, but it raises
// the "environment" traceback level, so later calls to
// debug.SetTraceback (e.g., from testing timeouts) can't lower it.
func SetTracebackEnv(level string) {
setTraceback(level)
traceback_env = traceback_cache
}
var ReadUnaligned32 = readUnaligned32
var ReadUnaligned64 = readUnaligned64
func CountPagesInUse() (pagesInUse, counted uintptr) {
stopTheWorld("CountPagesInUse")
pagesInUse = uintptr(mheap_.pagesInUse.Load())
for _, s := range mheap_.allspans {
if s.state.get() == mSpanInUse {
counted += s.npages
}
}
startTheWorld()
return
}
func Fastrand() uint32 { return fastrand() }
func Fastrand64() uint64 { return fastrand64() }
func Fastrandn(n uint32) uint32 { return fastrandn(n) }
type ProfBuf profBuf
func NewProfBuf(hdrsize, bufwords, tags int) *ProfBuf {
return (*ProfBuf)(newProfBuf(hdrsize, bufwords, tags))
}
func (p *ProfBuf) Write(tag *unsafe.Pointer, now int64, hdr []uint64, stk []uintptr) {
(*profBuf)(p).write(tag, now, hdr, stk)
}
const (
ProfBufBlocking = profBufBlocking
ProfBufNonBlocking = profBufNonBlocking
)
func (p *ProfBuf) Read(mode profBufReadMode) ([]uint64, []unsafe.Pointer, bool) {
return (*profBuf)(p).read(profBufReadMode(mode))
}
func (p *ProfBuf) Close() {
(*profBuf)(p).close()
}
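// profBufSketch is an illustrative usage sketch for the ProfBuf wrappers
// above: write one record, close the buffer, then drain it in non-blocking
// mode. The buffer sizes and sample data are hypothetical.
func profBufSketch() ([]uint64, []unsafe.Pointer, bool) {
	b := NewProfBuf(2, 512, 1) // 2 header words per record, 512 data words, 1 tag slot
	var tag unsafe.Pointer
	b.Write(&tag, Nanotime(), []uint64{0, 0}, []uintptr{1, 2, 3})
	b.Close()
	// After Close, a non-blocking read returns the buffered data and
	// eventually reports eof.
	return b.Read(ProfBufNonBlocking)
}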
func ReadMetricsSlow(memStats *MemStats, samplesp unsafe.Pointer, len, cap int) {
stopTheWorld("ReadMetricsSlow")
// Initialize the metrics beforehand because this could
// allocate and skew the stats.
metricsLock()
initMetrics()
metricsUnlock()
systemstack(func() {
// Read memstats first. It's going to flush
// the mcaches which readMetrics does not do, so
// going the other way around may result in
// inconsistent statistics.
readmemstats_m(memStats)
})
// Read metrics off the system stack.
//
// The only part of readMetrics that could allocate
// and skew the stats is initMetrics.
readMetrics(samplesp, len, cap)
startTheWorld()
}
// ReadMemStatsSlow returns both the runtime-computed MemStats and
// MemStats accumulated by scanning the heap.
func ReadMemStatsSlow() (base, slow MemStats) {
stopTheWorld("ReadMemStatsSlow")
// Run on the system stack to avoid stack growth allocation.
systemstack(func() {
// Make sure stats don't change.
getg().m.mallocing++
readmemstats_m(&base)
// Initialize slow from base and zero the fields we're
// recomputing.
slow = base
slow.Alloc = 0
slow.TotalAlloc = 0
slow.Mallocs = 0
slow.Frees = 0
slow.HeapReleased = 0
var bySize [_NumSizeClasses]struct {
Mallocs, Frees uint64
}
// Add up current allocations in spans.
for _, s := range mheap_.allspans {
if s.state.get() != mSpanInUse {
continue
}
if s.isUnusedUserArenaChunk() {
continue
}
if sizeclass := s.spanclass.sizeclass(); sizeclass == 0 {
slow.Mallocs++
slow.Alloc += uint64(s.elemsize)
} else {
slow.Mallocs += uint64(s.allocCount)
slow.Alloc += uint64(s.allocCount) * uint64(s.elemsize)
bySize[sizeclass].Mallocs += uint64(s.allocCount)
}
}
// Add in frees by just reading the stats for those directly.
var m heapStatsDelta
memstats.heapStats.unsafeRead(&m)
// Collect per-sizeclass free stats.
var smallFree uint64
for i := 0; i < _NumSizeClasses; i++ {
slow.Frees += uint64(m.smallFreeCount[i])
bySize[i].Frees += uint64(m.smallFreeCount[i])
bySize[i].Mallocs += uint64(m.smallFreeCount[i])
smallFree += uint64(m.smallFreeCount[i]) * uint64(class_to_size[i])
}
slow.Frees += uint64(m.tinyAllocCount) + uint64(m.largeFreeCount)
slow.Mallocs += slow.Frees
slow.TotalAlloc = slow.Alloc + uint64(m.largeFree) + smallFree
for i := range slow.BySize {
slow.BySize[i].Mallocs = bySize[i].Mallocs
slow.BySize[i].Frees = bySize[i].Frees
}
for i := mheap_.pages.start; i < mheap_.pages.end; i++ {
chunk := mheap_.pages.tryChunkOf(i)
if chunk == nil {
continue
}
pg := chunk.scavenged.popcntRange(0, pallocChunkPages)
slow.HeapReleased += uint64(pg) * pageSize
}
for _, p := range allp {
pg := sys.OnesCount64(p.pcache.scav)
slow.HeapReleased += uint64(pg) * pageSize
}
getg().m.mallocing--
})
startTheWorld()
return
}
// BlockOnSystemStack switches to the system stack, prints "x\n" to
// stderr, and blocks in a stack containing
// "runtime.blockOnSystemStackInternal".
func BlockOnSystemStack() {
systemstack(blockOnSystemStackInternal)
}
func blockOnSystemStackInternal() {
print("x\n")
lock(&deadlock)
lock(&deadlock)
}
type RWMutex struct {
rw rwmutex
}
func (rw *RWMutex) RLock() {
rw.rw.rlock()
}
func (rw *RWMutex) RUnlock() {
rw.rw.runlock()
}
func (rw *RWMutex) Lock() {
rw.rw.lock()
}
func (rw *RWMutex) Unlock() {
rw.rw.unlock()
}
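// rwMutexSketch is an illustrative usage sketch for the RWMutex wrapper
// above: it mirrors the runtime-internal rwmutex, so a test can exercise
// the read and write paths directly on a zero-valued lock.
func rwMutexSketch() {
	var rw RWMutex
	rw.RLock()
	rw.RUnlock()
	rw.Lock()
	rw.Unlock()
}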
const RuntimeHmapSize = unsafe.Sizeof(hmap{})
func MapBucketsCount(m map[int]int) int {
h := *(**hmap)(unsafe.Pointer(&m))
return 1 << h.B
}
func MapBucketsPointerIsNil(m map[int]int) bool {
h := *(**hmap)(unsafe.Pointer(&m))
return h.buckets == nil
}
func LockOSCounts() (external, internal uint32) {
gp := getg()
if gp.m.lockedExt+gp.m.lockedInt == 0 {
if gp.lockedm != 0 {
panic("lockedm on non-locked goroutine")
}
} else {
if gp.lockedm == 0 {
panic("nil lockedm on locked goroutine")
}
}
return gp.m.lockedExt, gp.m.lockedInt
}
//go:noinline
func TracebackSystemstack(stk []uintptr, i int) int {
if i == 0 {
pc, sp := getcallerpc(), getcallersp()
return gentraceback(pc, sp, 0, getg(), 0, &stk[0], len(stk), nil, nil, _TraceJumpStack)
}
n := 0
systemstack(func() {
n = TracebackSystemstack(stk, i-1)
})
return n
}
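// tracebackSystemstackSketch is an illustrative usage sketch for
// TracebackSystemstack above: capture a traceback that crosses a few
// nested system-stack switches. The depth and buffer size are hypothetical.
func tracebackSystemstackSketch() int {
	stk := make([]uintptr, 64)
	return TracebackSystemstack(stk, 3) // number of PCs recorded
}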
func KeepNArenaHints(n int) {
hint := mheap_.arenaHints
for i := 1; i < n; i++ {
hint = hint.next
if hint == nil {
return
}
}
hint.next = nil
}
// MapNextArenaHint reserves a page at the next arena growth hint,
// preventing the arena from growing there, and returns the range of
// addresses that are no longer viable.
//
// This may fail to reserve memory. If it fails, it still returns the
// address range it attempted to reserve.
func MapNextArenaHint() (start, end uintptr, ok bool) {
hint := mheap_.arenaHints
addr := hint.addr
if hint.down {
start, end = addr-heapArenaBytes, addr
addr -= physPageSize
} else {
start, end = addr, addr+heapArenaBytes
}
got := sysReserve(unsafe.Pointer(addr), physPageSize)
ok = (addr == uintptr(got))
if !ok {
// We were unable to get the requested reservation.
// Release what we did get and fail.
sysFreeOS(got, physPageSize)
}
runtime: use sparse mappings for the heap This replaces the contiguous heap arena mapping with a potentially sparse mapping that can support heap mappings anywhere in the address space. This has several advantages over the current approach: * There is no longer any limit on the size of the Go heap. (Currently it's limited to 512GB.) Hence, this fixes #10460. * It eliminates many failures modes of heap initialization and growing. In particular it eliminates any possibility of panicking with an address space conflict. This can happen for many reasons and even causes a low but steady rate of TSAN test failures because of conflicts with the TSAN runtime. See #16936 and #11993. * It eliminates the notion of "non-reserved" heap, which was added because creating huge address space reservations (particularly on 64-bit) led to huge process VSIZE. This was at best confusing and at worst conflicted badly with ulimit -v. However, the non-reserved heap logic is complicated, can race with other mappings in non-pure Go binaries (e.g., #18976), and requires that the entire heap be either reserved or non-reserved. We currently maintain the latter property, but it's quite difficult to convince yourself of that, and hence difficult to keep correct. This logic is still present, but will be removed in the next CL. * It fixes problems on 32-bit where skipping over parts of the address space leads to mapping huge (and never-to-be-used) metadata structures. See #19831. This also completely rewrites and significantly simplifies mheap.sysAlloc, which has been a source of many bugs. E.g., #21044, #20259, #18651, and #13143 (and maybe #23222). This change also makes it possible to allocate individual objects larger than 512GB. As a result, a few tests that expected huge allocations to fail needed to be changed to make even larger allocations. However, at the moment attempting to allocate a humongous object may cause the program to freeze for several minutes on Linux as we fall back to probing every page with addrspace_free. That logic (and this failure mode) will be removed in the next CL. Fixes #10460. Fixes #22204 (since it rewrites the code involved). This slightly slows down compilebench and the x/benchmarks garbage benchmark. 
name old time/op new time/op delta Template 184ms ± 1% 185ms ± 1% ~ (p=0.065 n=10+9) Unicode 86.9ms ± 3% 86.3ms ± 1% ~ (p=0.631 n=10+10) GoTypes 599ms ± 0% 602ms ± 0% +0.56% (p=0.000 n=10+9) Compiler 2.87s ± 1% 2.89s ± 1% +0.51% (p=0.002 n=9+10) SSA 7.29s ± 1% 7.25s ± 1% ~ (p=0.182 n=10+9) Flate 118ms ± 2% 118ms ± 1% ~ (p=0.113 n=9+9) GoParser 147ms ± 1% 148ms ± 1% +1.07% (p=0.003 n=9+10) Reflect 401ms ± 1% 404ms ± 1% +0.71% (p=0.003 n=10+9) Tar 175ms ± 1% 175ms ± 1% ~ (p=0.604 n=9+10) XML 209ms ± 1% 210ms ± 1% ~ (p=0.052 n=10+10) (https://perf.golang.org/search?q=upload:20171231.4) name old time/op new time/op delta Garbage/benchmem-MB=64-12 2.23ms ± 1% 2.25ms ± 1% +0.84% (p=0.000 n=19+19) (https://perf.golang.org/search?q=upload:20171231.3) Relative to the start of the sparse heap changes (starting at and including "runtime: fix various contiguous bitmap assumptions"), overall slowdown is roughly 1% on GC-intensive benchmarks: name old time/op new time/op delta Template 183ms ± 1% 185ms ± 1% +1.32% (p=0.000 n=9+9) Unicode 84.9ms ± 2% 86.3ms ± 1% +1.65% (p=0.000 n=9+10) GoTypes 595ms ± 1% 602ms ± 0% +1.19% (p=0.000 n=9+9) Compiler 2.86s ± 0% 2.89s ± 1% +0.91% (p=0.000 n=9+10) SSA 7.19s ± 0% 7.25s ± 1% +0.75% (p=0.000 n=8+9) Flate 117ms ± 1% 118ms ± 1% +1.10% (p=0.000 n=10+9) GoParser 146ms ± 2% 148ms ± 1% +1.48% (p=0.002 n=10+10) Reflect 398ms ± 1% 404ms ± 1% +1.51% (p=0.000 n=10+9) Tar 173ms ± 1% 175ms ± 1% +1.17% (p=0.000 n=10+10) XML 208ms ± 1% 210ms ± 1% +0.62% (p=0.011 n=10+10) [Geo mean] 369ms 373ms +1.17% (https://perf.golang.org/search?q=upload:20180101.2) name old time/op new time/op delta Garbage/benchmem-MB=64-12 2.22ms ± 1% 2.25ms ± 1% +1.51% (p=0.000 n=20+19) (https://perf.golang.org/search?q=upload:20180101.3) Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0 Reviewed-on: https://go-review.googlesource.com/85887 Run-TryBot: Austin Clements <austin@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Rick Hudson <rlh@golang.org>
2017-12-19 22:05:23 -08:00
return
}
func GetNextArenaHint() uintptr {
return mheap_.arenaHints.addr
}
type G = g
type Sudog = sudog
func Getg() *G {
return getg()
}
func GIsWaitingOnMutex(gp *G) bool {
return readgstatus(gp) == _Gwaiting && gp.waitreason.isMutexWait()
}
var CasGStatusAlwaysTrack = &casgstatusAlwaysTrack
//go:noinline
func PanicForTesting(b []byte, i int) byte {
return unexportedPanicForTesting(b, i)
}
//go:noinline
func unexportedPanicForTesting(b []byte, i int) byte {
return b[i]
}
func G0StackOverflow() {
systemstack(func() {
stackOverflow(nil)
})
}
func stackOverflow(x *byte) {
var buf [256]byte
stackOverflow(&buf[0])
}
func MapTombstoneCheck(m map[int]int) {
// Make sure emptyOne and emptyRest are distributed correctly.
// We should have a series of filled and emptyOne cells, followed by
// a series of emptyRest cells.
h := *(**hmap)(unsafe.Pointer(&m))
i := any(m)
t := *(**maptype)(unsafe.Pointer(&i))
for x := 0; x < 1<<h.B; x++ {
b0 := (*bmap)(add(h.buckets, uintptr(x)*uintptr(t.bucketsize)))
n := 0
for b := b0; b != nil; b = b.overflow(t) {
for i := 0; i < bucketCnt; i++ {
if b.tophash[i] != emptyRest {
n++
}
}
}
k := 0
for b := b0; b != nil; b = b.overflow(t) {
for i := 0; i < bucketCnt; i++ {
if k < n && b.tophash[i] == emptyRest {
panic("early emptyRest")
}
if k >= n && b.tophash[i] != emptyRest {
panic("late non-emptyRest")
}
if k == n-1 && b.tophash[i] == emptyOne {
panic("last non-emptyRest entry is emptyOne")
}
k++
}
}
}
}
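// mapTombstoneSketch is an illustrative usage sketch for MapTombstoneCheck
// above: populate a map, delete some keys to create tombstones, then verify
// that the emptyOne/emptyRest markers are laid out correctly. The key
// counts are hypothetical.
func mapTombstoneSketch() {
	m := make(map[int]int)
	for i := 0; i < 1000; i++ {
		m[i] = i
	}
	for i := 0; i < 1000; i += 2 {
		delete(m, i)
	}
	MapTombstoneCheck(m)
}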
func RunGetgThreadSwitchTest() {
// Test that getg works correctly with thread switch.
// With gccgo, if we generate getg inlined, the backend
// may cache the address of the TLS variable, which
// will become invalid after a thread switch. This test
// checks that the bad caching doesn't happen.
ch := make(chan int)
go func(ch chan int) {
ch <- 5
LockOSThread()
}(ch)
g1 := getg()
// Block on a receive. This is likely to get us a thread
// switch. If we yield to the sender goroutine, it will
// lock the thread, forcing us to resume on a different
// thread.
<-ch
g2 := getg()
if g1 != g2 {
panic("g1 != g2")
}
// Also test getg after some control flow, as the
// backend is sensitive to control flow.
g3 := getg()
if g1 != g3 {
panic("g1 != g3")
}
}
const (
PageSize = pageSize
PallocChunkPages = pallocChunkPages
PageAlloc64Bit = pageAlloc64Bit
PallocSumBytes = pallocSumBytes
)
// Expose pallocSum for testing.
type PallocSum pallocSum
func PackPallocSum(start, max, end uint) PallocSum { return PallocSum(packPallocSum(start, max, end)) }
func (m PallocSum) Start() uint { return pallocSum(m).start() }
func (m PallocSum) Max() uint { return pallocSum(m).max() }
func (m PallocSum) End() uint { return pallocSum(m).end() }
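// pallocSumSketch is an illustrative usage sketch for the PallocSum
// helpers above: a packed summary reports back the start, max, and end
// values it was built from, as long as they fit in the packed encoding.
// The values here are hypothetical.
func pallocSumSketch() (start, max, end uint) {
	s := PackPallocSum(1, 3, 2)
	return s.Start(), s.Max(), s.End()
}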
// Expose pallocBits for testing.
type PallocBits pallocBits
func (b *PallocBits) Find(npages uintptr, searchIdx uint) (uint, uint) {
return (*pallocBits)(b).find(npages, searchIdx)
}
func (b *PallocBits) AllocRange(i, n uint) { (*pallocBits)(b).allocRange(i, n) }
func (b *PallocBits) Free(i, n uint) { (*pallocBits)(b).free(i, n) }
func (b *PallocBits) Summarize() PallocSum { return PallocSum((*pallocBits)(b).summarize()) }
func (b *PallocBits) PopcntRange(i, n uint) uint { return (*pageBits)(b).popcntRange(i, n) }
// SummarizeSlow is a slow but more obviously correct implementation
// of (*pallocBits).summarize. Used for testing.
func SummarizeSlow(b *PallocBits) PallocSum {
var start, max, end uint
const N = uint(len(b)) * 64
for start < N && (*pageBits)(b).get(start) == 0 {
start++
}
for end < N && (*pageBits)(b).get(N-end-1) == 0 {
end++
}
run := uint(0)
for i := uint(0); i < N; i++ {
if (*pageBits)(b).get(i) == 0 {
run++
} else {
run = 0
}
if run > max {
max = run
}
}
return PackPallocSum(start, max, end)
}
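// summarizeSketch is an illustrative usage sketch for the two summary
// implementations above: the fast Summarize and the slow reference
// implementation are expected to agree for any bit pattern; here a single
// allocated run is used as the input.
func summarizeSketch() bool {
	var b PallocBits
	b.AllocRange(10, 5) // mark pages [10, 15) as allocated
	return b.Summarize() == SummarizeSlow(&b)
}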
// Expose non-trivial helpers for testing.
func FindBitRange64(c uint64, n uint) uint { return findBitRange64(c, n) }
// Given two PallocBits, returns a set of bit ranges where
// they differ.
func DiffPallocBits(a, b *PallocBits) []BitRange {
ba := (*pageBits)(a)
bb := (*pageBits)(b)
var d []BitRange
base, size := uint(0), uint(0)
for i := uint(0); i < uint(len(ba))*64; i++ {
if ba.get(i) != bb.get(i) {
if size == 0 {
base = i
}
size++
} else {
if size != 0 {
d = append(d, BitRange{base, size})
}
size = 0
}
}
if size != 0 {
d = append(d, BitRange{base, size})
}
return d
}
// StringifyPallocBits gets the bits in the bit range r from b,
// and returns a string containing the bits as ASCII 0 and 1
// characters.
func StringifyPallocBits(b *PallocBits, r BitRange) string {
str := ""
for j := r.I; j < r.I+r.N; j++ {
if (*pageBits)(b).get(j) != 0 {
str += "1"
} else {
str += "0"
}
}
return str
}
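// pallocBitsDiffSketch is an illustrative usage sketch for DiffPallocBits
// and StringifyPallocBits above: allocate overlapping ranges in two
// bitmaps, compute the regions where they differ, and render the first
// such region from one of the bitmaps as a string of 0s and 1s.
func pallocBitsDiffSketch() string {
	var a, b PallocBits
	a.AllocRange(0, 4)
	b.AllocRange(2, 4)
	d := DiffPallocBits(&a, &b) // bits 0-1 and 4-5 differ
	if len(d) == 0 {
		return ""
	}
	return StringifyPallocBits(&a, d[0])
}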
// Expose pallocData for testing.
type PallocData pallocData
func (d *PallocData) FindScavengeCandidate(searchIdx uint, min, max uintptr) (uint, uint) {
return (*pallocData)(d).findScavengeCandidate(searchIdx, min, max)
}
func (d *PallocData) AllocRange(i, n uint) { (*pallocData)(d).allocRange(i, n) }
func (d *PallocData) ScavengedSetRange(i, n uint) {
(*pallocData)(d).scavenged.setRange(i, n)
}
func (d *PallocData) PallocBits() *PallocBits {
return (*PallocBits)(&(*pallocData)(d).pallocBits)
}
func (d *PallocData) Scavenged() *PallocBits {
return (*PallocBits)(&(*pallocData)(d).scavenged)
}
// Expose fillAligned for testing.
func FillAligned(x uint64, m uint) uint64 { return fillAligned(x, m) }
// Expose pageCache for testing.
type PageCache pageCache
const PageCachePages = pageCachePages
func NewPageCache(base uintptr, cache, scav uint64) PageCache {
return PageCache(pageCache{base: base, cache: cache, scav: scav})
}
func (c *PageCache) Empty() bool { return (*pageCache)(c).empty() }
func (c *PageCache) Base() uintptr { return (*pageCache)(c).base }
func (c *PageCache) Cache() uint64 { return (*pageCache)(c).cache }
func (c *PageCache) Scav() uint64 { return (*pageCache)(c).scav }
func (c *PageCache) Alloc(npages uintptr) (uintptr, uintptr) {
return (*pageCache)(c).alloc(npages)
}
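// pageCacheSketch is an illustrative usage sketch for the PageCache
// wrappers above: a hand-built cache hands out pages from its local bitmap
// without touching the page allocator. The base address and bit patterns
// are hypothetical.
func pageCacheSketch() (uintptr, uintptr) {
	c := NewPageCache(PageSize*64, ^uint64(0), 0) // 64 free pages, none scavenged
	return c.Alloc(1)                             // one page address and bytes scavenged (0 here)
}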
func (c *PageCache) Flush(s *PageAlloc) {
cp := (*pageCache)(c)
sp := (*pageAlloc)(s)
systemstack(func() {
// None of the tests need any higher-level locking, so we just
// take the lock internally.
lock(sp.mheapLock)
cp.flush(sp)
unlock(sp.mheapLock)
})
}
// Expose chunk index type.
type ChunkIdx chunkIdx
// Expose pageAlloc for testing. Note that because pageAlloc is
// not in the heap, neither is PageAlloc.
type PageAlloc pageAlloc
func (p *PageAlloc) Alloc(npages uintptr) (uintptr, uintptr) {
pp := (*pageAlloc)(p)
var addr, scav uintptr
systemstack(func() {
// None of the tests need any higher-level locking, so we just
// take the lock internally.
lock(pp.mheapLock)
addr, scav = pp.alloc(npages)
unlock(pp.mheapLock)
})
return addr, scav
}
func (p *PageAlloc) AllocToCache() PageCache {
pp := (*pageAlloc)(p)
var c PageCache
systemstack(func() {
// None of the tests need any higher-level locking, so we just
// take the lock internally.
lock(pp.mheapLock)
c = PageCache(pp.allocToCache())
unlock(pp.mheapLock)
})
return c
}
func (p *PageAlloc) Free(base, npages uintptr) {
pp := (*pageAlloc)(p)
systemstack(func() {
// None of the tests need any higher-level locking, so we just
// take the lock internally.
lock(pp.mheapLock)
pp.free(base, npages, true)
unlock(pp.mheapLock)
})
}
func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) {
return ChunkIdx((*pageAlloc)(p).start), ChunkIdx((*pageAlloc)(p).end)
}
func (p *PageAlloc) Scavenge(nbytes uintptr) (r uintptr) {
pp := (*pageAlloc)(p)
systemstack(func() {
r = pp.scavenge(nbytes, nil)
})
return
}
func (p *PageAlloc) InUse() []AddrRange {
ranges := make([]AddrRange, 0, len(p.inUse.ranges))
for _, r := range p.inUse.ranges {
ranges = append(ranges, AddrRange{r})
}
return ranges
}
// Returns nil if the PallocData's L2 is missing.
func (p *PageAlloc) PallocData(i ChunkIdx) *PallocData {
ci := chunkIdx(i)
return (*PallocData)((*pageAlloc)(p).tryChunkOf(ci))
}
// AddrRange is a wrapper around addrRange for testing.
type AddrRange struct {
addrRange
}
// MakeAddrRange creates a new address range.
func MakeAddrRange(base, limit uintptr) AddrRange {
return AddrRange{makeAddrRange(base, limit)}
}
// Base returns the virtual base address of the address range.
func (a AddrRange) Base() uintptr {
return a.addrRange.base.addr()
}
// Limit returns the virtual address of the limit of the address range.
func (a AddrRange) Limit() uintptr {
return a.addrRange.limit.addr()
}
// Equals returns true if the two address ranges are exactly equal.
func (a AddrRange) Equals(b AddrRange) bool {
return a == b
}
// Size returns the size in bytes of the address range.
func (a AddrRange) Size() uintptr {
return a.addrRange.size()
}
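// The following is a minimal illustrative sketch (not part of the runtime
// API) of how a test might exercise the AddrRange wrapper; the concrete
// addresses are arbitrary:
//
//	r := MakeAddrRange(0x1000, 0x3000)
//	_ = r.Base()  // 0x1000
//	_ = r.Limit() // 0x3000
//	_ = r.Size()  // 0x2000 bytes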
// testSysStat is the sysStat passed to test versions of various
// runtime structures. We have to keep track of this because otherwise
// memstats.mappedReady won't line up with other stats in the runtime
// during tests.
var testSysStat = &memstats.other_sys
// AddrRanges is a wrapper around addrRanges for testing.
type AddrRanges struct {
addrRanges
mutable bool
}
// NewAddrRanges creates a new empty addrRanges.
//
// Note that this initializes addrRanges just like in the
// runtime, so its memory is persistentalloc'd. Call this
// function sparingly since the memory it allocates is
// leaked.
//
// This AddrRanges is mutable, so we can test methods like
// Add.
func NewAddrRanges() AddrRanges {
r := addrRanges{}
r.init(testSysStat)
return AddrRanges{r, true}
}
// MakeAddrRanges creates a new addrRanges populated with
// the ranges in a.
//
// The returned AddrRanges is immutable, so methods like
// Add will fail.
func MakeAddrRanges(a ...AddrRange) AddrRanges {
// Methods that manipulate the backing store of addrRanges.ranges should
// not be used on the result from this function (e.g. add) since they may
// trigger reallocation. That would normally be fine, except the new
// backing store won't come from the heap, but from persistentalloc, so
// we'll leak some memory implicitly.
ranges := make([]addrRange, 0, len(a))
total := uintptr(0)
for _, r := range a {
ranges = append(ranges, r.addrRange)
total += r.Size()
}
return AddrRanges{addrRanges{
ranges: ranges,
totalBytes: total,
sysStat: testSysStat,
}, false}
}
// Ranges returns a copy of the ranges described by the
// addrRanges.
func (a *AddrRanges) Ranges() []AddrRange {
result := make([]AddrRange, 0, len(a.addrRanges.ranges))
for _, r := range a.addrRanges.ranges {
result = append(result, AddrRange{r})
}
return result
}
// FindSucc returns the successor to base. See addrRanges.findSucc
// for more details.
func (a *AddrRanges) FindSucc(base uintptr) int {
return a.findSucc(base)
}
// Add adds a new AddrRange to the AddrRanges.
//
// The AddrRange must be mutable (i.e. created by NewAddrRanges),
// otherwise this method will throw.
func (a *AddrRanges) Add(r AddrRange) {
if !a.mutable {
throw("attempt to mutate immutable AddrRanges")
}
a.add(r.addrRange)
}
// TotalBytes returns the totalBytes field of the addrRanges.
func (a *AddrRanges) TotalBytes() uintptr {
return a.addrRanges.totalBytes
}
// BitRange represents a range over a bitmap.
type BitRange struct {
I, N uint // bit index and length in bits
}
// NewPageAlloc creates a new page allocator for testing and
// initializes it with the scav and chunks maps. Each key in these maps
// represents a chunk index and each value is a series of bit ranges to
// set within each bitmap's chunk.
//
// The initialization of the pageAlloc preserves the invariant that if a
// scavenged bit is set the alloc bit is necessarily unset, so some
// of the bits described by scav may be cleared in the final bitmap if
// ranges in chunks overlap with them.
//
// scav is optional, and if nil, the scavenged bitmap will be cleared
// (as opposed to all 1s, which it usually is). Furthermore, every
// chunk index in scav must appear in chunks; ones that do not are
// ignored.
func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc {
p := new(pageAlloc)
// We've got an entry, so initialize the pageAlloc.
p.init(new(mutex), testSysStat)
lockInit(p.mheapLock, lockRankMheap)
p.test = true
for i, init := range chunks {
addr := chunkBase(chunkIdx(i))
// Mark the chunk's existence in the pageAlloc.
systemstack(func() {
lock(p.mheapLock)
p.grow(addr, pallocChunkBytes)
unlock(p.mheapLock)
})
// Initialize the bitmap and update pageAlloc metadata.
chunk := p.chunkOf(chunkIndex(addr))
// Clear all the scavenged bits, which the grow call above set.
chunk.scavenged.clearRange(0, pallocChunkPages)
// Apply scavenge state if applicable.
if scav != nil {
if scvg, ok := scav[i]; ok {
for _, s := range scvg {
// Ignore the case of s.N == 0. setRange doesn't handle
// it and it's a no-op anyway.
if s.N != 0 {
chunk.scavenged.setRange(s.I, s.N)
}
}
}
}
// Apply alloc state.
for _, s := range init {
// Ignore the case of s.N == 0. allocRange doesn't handle
// it and it's a no-op anyway.
if s.N != 0 {
chunk.allocRange(s.I, s.N)
}
}
// Make sure the scavenge index is updated.
//
// This is an inefficient way to do it, but it's also the simplest way.
minPages := physPageSize / pageSize
if minPages < 1 {
minPages = 1
}
_, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, minPages)
if npages != 0 {
p.scav.index.mark(addr, addr+pallocChunkBytes)
}
// Update heap metadata for the allocRange calls above.
systemstack(func() {
lock(p.mheapLock)
p.update(addr, pallocChunkPages, false, false)
unlock(p.mheapLock)
})
}
return (*PageAlloc)(p)
}
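// A hypothetical sketch of typical usage, assuming the exported
// PallocChunkPages constant defined elsewhere in this file. The first map
// marks pages as allocated, the second marks pages as scavenged, and the
// allocator must be torn down with FreePageAlloc:
//
//	p := NewPageAlloc(map[ChunkIdx][]BitRange{
//		BaseChunkIdx: {{0, PallocChunkPages / 2}},
//	}, map[ChunkIdx][]BitRange{
//		BaseChunkIdx: {{PallocChunkPages / 2, PallocChunkPages / 2}},
//	})
//	defer FreePageAlloc(p)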
// FreePageAlloc releases hard OS resources owned by the pageAlloc. Once this
// is called the pageAlloc may no longer be used. The object itself will be
// collected by the garbage collector once it is no longer live.
func FreePageAlloc(pp *PageAlloc) {
p := (*pageAlloc)(pp)
// Free all the mapped space for the summary levels.
if pageAlloc64Bit != 0 {
for l := 0; l < summaryLevels; l++ {
sysFreeOS(unsafe.Pointer(&p.summary[l][0]), uintptr(cap(p.summary[l]))*pallocSumBytes)
}
// Only necessary on 64-bit. This is a global on 32-bit.
sysFreeOS(unsafe.Pointer(&p.scav.index.chunks[0]), uintptr(cap(p.scav.index.chunks)))
} else {
resSize := uintptr(0)
for _, s := range p.summary {
resSize += uintptr(cap(s)) * pallocSumBytes
}
sysFreeOS(unsafe.Pointer(&p.summary[0][0]), alignUp(resSize, physPageSize))
}
// Subtract back out whatever we mapped for the summaries.
// sysUsed adds to p.sysStat and memstats.mappedReady no matter what
// (and in anger should actually be accounted for), and there's no other
// way to figure out how much we actually mapped.
gcController.mappedReady.Add(-int64(p.summaryMappedReady))
testSysStat.add(-int64(p.summaryMappedReady))
// Free the mapped space for chunks.
for i := range p.chunks {
if x := p.chunks[i]; x != nil {
p.chunks[i] = nil
// This memory comes from sysAlloc and will always be page-aligned.
sysFree(unsafe.Pointer(x), unsafe.Sizeof(*p.chunks[0]), testSysStat)
}
}
}
// BaseChunkIdx is a convenient chunkIdx value which works on both
// 64 bit and 32 bit platforms, allowing the tests to share code
// between the two.
//
// This should not be higher than 0x100*pallocChunkBytes to support
// mips and mipsle, which only have 31-bit address spaces.
var BaseChunkIdx = func() ChunkIdx {
var prefix uintptr
if pageAlloc64Bit != 0 {
prefix = 0xc000
} else {
prefix = 0x100
}
baseAddr := prefix * pallocChunkBytes
if goos.IsAix != 0 {
baseAddr += arenaBaseOffset
}
return ChunkIdx(chunkIndex(baseAddr))
}()
// PageBase returns an address given a chunk index and a page index
// relative to that chunk.
func PageBase(c ChunkIdx, pageIdx uint) uintptr {
return chunkBase(chunkIdx(c)) + uintptr(pageIdx)*pageSize
}
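// examplePageAddress is a hypothetical usage sketch (not part of the
// original file): page allocator tests combine BaseChunkIdx and PageBase
// to construct addresses that are valid on both 32-bit and 64-bit
// platforms. Here, the address of page 4 within the base chunk.
func examplePageAddress() uintptr {
	return PageBase(BaseChunkIdx, 4)
}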
type BitsMismatch struct {
Base uintptr
Got, Want uint64
}
func CheckScavengedBitsCleared(mismatches []BitsMismatch) (n int, ok bool) {
ok = true
// Run on the system stack to avoid stack growth allocation.
systemstack(func() {
getg().m.mallocing++
// Lock so that we can safely access the bitmap.
lock(&mheap_.lock)
chunkLoop:
for i := mheap_.pages.start; i < mheap_.pages.end; i++ {
chunk := mheap_.pages.tryChunkOf(i)
if chunk == nil {
continue
}
for j := 0; j < pallocChunkPages/64; j++ {
// Run over each 64-bit bitmap section and ensure
// scavenged is being cleared properly on allocation.
// If a used bit and scavenged bit are both set, that's
// an error, and could indicate a larger problem, or
// an accounting problem.
want := chunk.scavenged[j] &^ chunk.pallocBits[j]
got := chunk.scavenged[j]
if want != got {
ok = false
if n >= len(mismatches) {
break chunkLoop
}
mismatches[n] = BitsMismatch{
Base: chunkBase(i) + uintptr(j)*64*pageSize,
Got: got,
Want: want,
}
n++
}
}
}
unlock(&mheap_.lock)
getg().m.mallocing--
})
return
}
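// exampleCheckScavenged is a hypothetical sketch (not part of the original
// file) of the intended calling pattern: the caller supplies a fixed-size
// buffer, and the first n entries describe any chunks whose scavenged bits
// were not cleared on allocation.
func exampleCheckScavenged() bool {
	var mismatches [128]BitsMismatch
	n, ok := CheckScavengedBitsCleared(mismatches[:])
	_ = mismatches[:n] // details of each failure, if any
	return ok
}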
func PageCachePagesLeaked() (leaked uintptr) {
stopTheWorld("PageCachePagesLeaked")
// Walk over destroyed Ps and look for unflushed caches.
deadp := allp[len(allp):cap(allp)]
for _, p := range deadp {
// Since we're going past len(allp) we may see nil Ps.
// Just ignore them.
if p != nil {
leaked += uintptr(sys.OnesCount64(p.pcache.cache))
}
}
startTheWorld()
return
}
var Semacquire = semacquire
var Semrelease1 = semrelease1
func SemNwait(addr *uint32) uint32 {
root := semtable.rootFor(addr)
return root.nwait.Load()
}
const SemTableSize = semTabSize
// SemTable is a wrapper around semTable exported for testing.
type SemTable struct {
semTable
}
// Enqueue simulates enqueuing a waiter for a semaphore (or lock) at addr.
func (t *SemTable) Enqueue(addr *uint32) {
s := acquireSudog()
s.releasetime = 0
s.acquiretime = 0
s.ticket = 0
t.semTable.rootFor(addr).queue(addr, s, false)
}
// Dequeue simulates dequeuing a waiter for a semaphore (or lock) at addr.
//
// Returns true if there actually was a waiter to be dequeued.
func (t *SemTable) Dequeue(addr *uint32) bool {
s, _ := t.semTable.rootFor(addr).dequeue(addr)
if s != nil {
releaseSudog(s)
return true
}
return false
}
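// exampleSemTable is a hypothetical sketch (not part of the original file)
// showing how Enqueue and Dequeue pair up: a waiter enqueued for an address
// can be dequeued exactly once.
func exampleSemTable() {
	var tab SemTable
	var addr uint32
	tab.Enqueue(&addr)
	if !tab.Dequeue(&addr) {
		panic("expected a waiter for addr")
	}
	if tab.Dequeue(&addr) {
		panic("queue for addr should now be empty")
	}
}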
// mspan wrapper for testing.
type MSpan mspan
// Allocate an mspan for testing.
func AllocMSpan() *MSpan {
var s *mspan
systemstack(func() {
lock(&mheap_.lock)
s = (*mspan)(mheap_.spanalloc.alloc())
unlock(&mheap_.lock)
})
return (*MSpan)(s)
}
// Free an allocated mspan.
func FreeMSpan(s *MSpan) {
systemstack(func() {
lock(&mheap_.lock)
mheap_.spanalloc.free(unsafe.Pointer(s))
unlock(&mheap_.lock)
})
}
func MSpanCountAlloc(ms *MSpan, bits []byte) int {
s := (*mspan)(ms)
s.nelems = uintptr(len(bits) * 8)
s.gcmarkBits = (*gcBits)(unsafe.Pointer(&bits[0]))
result := s.countAlloc()
s.gcmarkBits = nil
return result
}
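// exampleCountAlloc is a hypothetical sketch (not part of the original
// file): back a dummy span with a caller-provided mark bitmap and count
// the marked objects.
func exampleCountAlloc() int {
	s := AllocMSpan()
	defer FreeMSpan(s)
	bits := make([]byte, 8)         // room for 64 objects' mark bits
	bits[0] = 0x3                   // mark objects 0 and 1
	return MSpanCountAlloc(s, bits) // expected to be 2
}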
const (
TimeHistSubBucketBits = timeHistSubBucketBits
TimeHistNumSubBuckets = timeHistNumSubBuckets
TimeHistNumBuckets = timeHistNumBuckets
TimeHistMinBucketBits = timeHistMinBucketBits
TimeHistMaxBucketBits = timeHistMaxBucketBits
)
type TimeHistogram timeHistogram
// Count returns the count for the given bucket, subBucket indices.
// Returns true if the bucket was valid, otherwise returns the count
// for the underflow bucket if bucket < 0, or the count for the
// overflow bucket if the index is out of range, and false.
func (th *TimeHistogram) Count(bucket, subBucket int) (uint64, bool) {
t := (*timeHistogram)(th)
if bucket < 0 {
return t.underflow.Load(), false
}
i := bucket*TimeHistNumSubBuckets + subBucket
if i >= len(t.counts) {
return t.overflow.Load(), false
}
return t.counts[i].Load(), true
}
func (th *TimeHistogram) Record(duration int64) {
(*timeHistogram)(th).record(duration)
}
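// exampleHistogram is a hypothetical sketch (not part of the original
// file): record a single in-range duration, then walk the exported
// bucket space and report whether any tracked count became non-zero.
func exampleHistogram() bool {
	var h TimeHistogram
	h.Record(1e6) // 1ms, well within the tracked range
	for b := 0; b < TimeHistNumBuckets; b++ {
		for sb := 0; sb < TimeHistNumSubBuckets; sb++ {
			if c, ok := h.Count(b, sb); ok && c != 0 {
				return true
			}
		}
	}
	return false
}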
var TimeHistogramMetricsBuckets = timeHistogramMetricsBuckets
func SetIntArgRegs(a int) int {
lock(&finlock)
old := intArgRegs
if a >= 0 {
intArgRegs = a
}
unlock(&finlock)
return old
}
func FinalizerGAsleep() bool {
return fingStatus.Load()&fingWait != 0
}
// For GCTestMoveStackOnNextCall, it's important not to introduce an
// extra layer of call, since then there's a return before the "real"
// next call.
var GCTestMoveStackOnNextCall = gcTestMoveStackOnNextCall
// For GCTestIsReachable, it's important that we do this as a call so
// escape analysis can see through it.
func GCTestIsReachable(ptrs ...unsafe.Pointer) (mask uint64) {
return gcTestIsReachable(ptrs...)
}
// For GCTestPointerClass, it's important that we do this as a call so
// escape analysis can see through it.
//
// This is nosplit because gcTestPointerClass is.
//
//go:nosplit
func GCTestPointerClass(p unsafe.Pointer) string {
return gcTestPointerClass(p)
}
const Raceenabled = raceenabled
const (
GCBackgroundUtilization = gcBackgroundUtilization
GCGoalUtilization = gcGoalUtilization
DefaultHeapMinimum = defaultHeapMinimum
MemoryLimitHeapGoalHeadroom = memoryLimitHeapGoalHeadroom
)
type GCController struct {
gcControllerState
}
func NewGCController(gcPercent int, memoryLimit int64) *GCController {
// Force the controller to escape. We're going to
// do 64-bit atomics on it, and if it gets stack-allocated
// on a 32-bit architecture, it may get allocated unaligned
// space.
g := Escape(new(GCController))
g.gcControllerState.test = true // Mark it as a test copy.
g.init(int32(gcPercent), memoryLimit)
return g
}
func (c *GCController) StartCycle(stackSize, globalsSize uint64, scannableFrac float64, gomaxprocs int) {
trigger, _ := c.trigger()
if c.heapMarked > trigger {
trigger = c.heapMarked
}
c.maxStackScan.Store(stackSize)
c.globalsScan.Store(globalsSize)
c.heapLive.Store(trigger)
c.heapScan.Add(int64(float64(trigger-c.heapMarked) * scannableFrac))
c.startCycle(0, gomaxprocs, gcTrigger{kind: gcTriggerHeap})
}
func (c *GCController) AssistWorkPerByte() float64 {
return c.assistWorkPerByte.Load()
}
func (c *GCController) HeapGoal() uint64 {
return c.heapGoal()
}
func (c *GCController) HeapLive() uint64 {
return c.heapLive.Load()
}
func (c *GCController) HeapMarked() uint64 {
return c.heapMarked
}
func (c *GCController) Triggered() uint64 {
return c.triggered
}
type GCControllerReviseDelta struct {
HeapLive int64
HeapScan int64
HeapScanWork int64
StackScanWork int64
GlobalsScanWork int64
}
func (c *GCController) Revise(d GCControllerReviseDelta) {
c.heapLive.Add(d.HeapLive)
c.heapScan.Add(d.HeapScan)
c.heapScanWork.Add(d.HeapScanWork)
c.stackScanWork.Add(d.StackScanWork)
c.globalsScanWork.Add(d.GlobalsScanWork)
c.revise()
}
func (c *GCController) EndCycle(bytesMarked uint64, assistTime, elapsed int64, gomaxprocs int) {
c.assistTime.Store(assistTime)
c.endCycle(elapsed, gomaxprocs, false)
c.resetLive(bytesMarked)
c.commit(false)
}
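// exampleGCControllerCycle is a hypothetical sketch (not part of the
// original file) of the drive sequence a pacer test uses: construct a test
// copy, start a cycle, feed it allocation and scan work, then end the
// cycle. All of the numeric inputs below are arbitrary.
func exampleGCControllerCycle() uint64 {
	c := NewGCController(100, 1<<62) // GOGC=100, effectively no memory limit
	c.StartCycle(1<<20, 1<<16, 0.5, 4)
	c.Revise(GCControllerReviseDelta{
		HeapLive:     1 << 20,
		HeapScan:     1 << 19,
		HeapScanWork: 1 << 19,
	})
	c.EndCycle(1<<20, 0, 50_000_000 /* 50ms */, 4)
	return c.HeapGoal()
}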
func (c *GCController) AddIdleMarkWorker() bool {
return c.addIdleMarkWorker()
}
func (c *GCController) NeedIdleMarkWorker() bool {
return c.needIdleMarkWorker()
}
func (c *GCController) RemoveIdleMarkWorker() {
c.removeIdleMarkWorker()
}
func (c *GCController) SetMaxIdleMarkWorkers(max int32) {
c.setMaxIdleMarkWorkers(max)
}
var alwaysFalse bool
var escapeSink any
func Escape[T any](x T) T {
if alwaysFalse {
escapeSink = x
}
return x
}
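// escapeToHeapExample is a hypothetical sketch (not part of the original
// file) of the pattern Escape enables: because the compiler cannot prove
// that alwaysFalse stays false, it must assume x may be published through
// escapeSink, so the value is forced to the heap. NewGCController and
// NewGCCPULimiter rely on this to keep 64-bit atomic fields aligned on
// 32-bit platforms.
func escapeToHeapExample() *GCController {
	return Escape(new(GCController)) // heap-allocated, 8-byte aligned
}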
// Acquirem blocks preemption.
func Acquirem() {
acquirem()
}
func Releasem() {
releasem(getg().m)
}
var Timediv = timediv
type PIController struct {
piController
}
func NewPIController(kp, ti, tt, min, max float64) *PIController {
return &PIController{piController{
kp: kp,
ti: ti,
tt: tt,
min: min,
max: max,
}}
}
func (c *PIController) Next(input, setpoint, period float64) (float64, bool) {
return c.piController.next(input, setpoint, period)
}
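// examplePIController is a hypothetical sketch (not part of the original
// file): construct a controller with arbitrary gains and bounds and take a
// single step. The second result reports whether the controller's internal
// state is still usable; it resets itself if its error state overflows.
func examplePIController() (float64, bool) {
	c := NewPIController(0.5, 1.0, 1.0, 0.0, 10.0) // kp, ti, tt, min, max
	return c.Next(5.0, 8.0, 1.0)                   // input, setpoint, period
}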
const (
CapacityPerProc = capacityPerProc
GCCPULimiterUpdatePeriod = gcCPULimiterUpdatePeriod
)
type GCCPULimiter struct {
limiter gcCPULimiterState
}
func NewGCCPULimiter(now int64, gomaxprocs int32) *GCCPULimiter {
// Force the controller to escape. We're going to
// do 64-bit atomics on it, and if it gets stack-allocated
// on a 32-bit architecture, it may get allocated unaligned
// space.
l := Escape(new(GCCPULimiter))
l.limiter.test = true
l.limiter.resetCapacity(now, gomaxprocs)
return l
}
func (l *GCCPULimiter) Fill() uint64 {
return l.limiter.bucket.fill
}
func (l *GCCPULimiter) Capacity() uint64 {
return l.limiter.bucket.capacity
}
func (l *GCCPULimiter) Overflow() uint64 {
return l.limiter.overflow
}
func (l *GCCPULimiter) Limiting() bool {
return l.limiter.limiting()
}
func (l *GCCPULimiter) NeedUpdate(now int64) bool {
return l.limiter.needUpdate(now)
}
func (l *GCCPULimiter) StartGCTransition(enableGC bool, now int64) {
l.limiter.startGCTransition(enableGC, now)
}
func (l *GCCPULimiter) FinishGCTransition(now int64) {
l.limiter.finishGCTransition(now)
}
func (l *GCCPULimiter) Update(now int64) {
l.limiter.update(now)
}
func (l *GCCPULimiter) AddAssistTime(t int64) {
l.limiter.addAssistTime(t)
}
func (l *GCCPULimiter) ResetCapacity(now int64, nprocs int32) {
l.limiter.resetCapacity(now, nprocs)
}
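// exampleGCCPULimiter is a hypothetical sketch (not part of the original
// file) of driving the limiter with a fake clock: mark a GC transition,
// feed in assist time, and let Update flush it into the bucket. The time
// values are arbitrary nanosecond counts.
func exampleGCCPULimiter() bool {
	now := int64(0)
	l := NewGCCPULimiter(now, 4)
	l.StartGCTransition(true, now)
	now += 1e9
	l.FinishGCTransition(now)
	l.AddAssistTime(500e6)
	now += 1e9
	l.Update(now)
	return l.Limiting()
}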
const ScavengePercent = scavengePercent
type Scavenger struct {
Sleep func(int64) int64
Scavenge func(uintptr) (uintptr, int64)
ShouldStop func() bool
GoMaxProcs func() int32
released atomic.Uintptr
scavenger scavengerState
stop chan<- struct{}
done <-chan struct{}
}
func (s *Scavenger) Start() {
if s.Sleep == nil || s.Scavenge == nil || s.ShouldStop == nil || s.GoMaxProcs == nil {
panic("must populate all stubs")
}
// Install hooks.
s.scavenger.sleepStub = s.Sleep
s.scavenger.scavenge = s.Scavenge
s.scavenger.shouldStop = s.ShouldStop
s.scavenger.gomaxprocs = s.GoMaxProcs
// Start up scavenger goroutine, and wait for it to be ready.
stop := make(chan struct{})
s.stop = stop
done := make(chan struct{})
s.done = done
go func() {
// This should match bgscavenge, loosely.
s.scavenger.init()
s.scavenger.park()
for {
select {
case <-stop:
close(done)
return
default:
}
released, workTime := s.scavenger.run()
if released == 0 {
s.scavenger.park()
continue
}
s.released.Add(released)
s.scavenger.sleep(workTime)
}
}()
if !s.BlockUntilParked(1e9 /* 1 second */) {
panic("timed out waiting for scavenger to get ready")
}
}
// BlockUntilParked blocks until the scavenger parks, or until
// timeout is exceeded. Returns true if the scavenger parked.
//
// Note that in testing, parked means something slightly different.
// In anger, the scavenger parks to sleep, too, but in testing,
// it only parks when it actually has no work to do.
func (s *Scavenger) BlockUntilParked(timeout int64) bool {
// Just spin, waiting for it to park.
//
// The actual parking process is racy with respect to
// wakeups, which is fine, but for testing we need something
// a bit more robust.
start := nanotime()
for nanotime()-start < timeout {
lock(&s.scavenger.lock)
parked := s.scavenger.parked
unlock(&s.scavenger.lock)
if parked {
return true
}
Gosched()
}
return false
}
// Released returns how many bytes the scavenger released.
func (s *Scavenger) Released() uintptr {
return s.released.Load()
}
// Wake wakes up a parked scavenger to keep running.
func (s *Scavenger) Wake() {
s.scavenger.wake()
}
// Stop cleans up the scavenger's resources. The scavenger
// must be parked for this to work.
func (s *Scavenger) Stop() {
lock(&s.scavenger.lock)
parked := s.scavenger.parked
unlock(&s.scavenger.lock)
if !parked {
panic("tried to clean up scavenger that is not parked")
}
close(s.stop)
s.Wake()
<-s.done
}
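// exampleScavengerLifecycle is a hypothetical sketch (not part of the
// original file) of the controllable scavenger's lifecycle: every stub
// must be populated before Start, the scavenger can be woken and observed,
// and Stop requires it to be parked. The stub bodies are arbitrary.
func exampleScavengerLifecycle() uintptr {
	s := &Scavenger{
		Sleep:      func(ns int64) int64 { return ns },                    // pretend to sleep exactly as asked
		Scavenge:   func(n uintptr) (uintptr, int64) { return n, 10_000 }, // pretend n bytes took 10µs
		ShouldStop: func() bool { return true },                           // stop work immediately
		GoMaxProcs: func() int32 { return 1 },
	}
	s.Start()
	s.Wake()
	if !s.BlockUntilParked(1e9 /* 1 second */) {
		panic("scavenger did not park")
	}
	s.Stop()
	return s.Released() // total bytes the Scavenge stub reported releasing
}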
type ScavengeIndex struct {
i scavengeIndex
}
func NewScavengeIndex(min, max ChunkIdx) *ScavengeIndex {
s := new(ScavengeIndex)
s.i.chunks = make([]atomic.Uint8, uintptr(1<<heapAddrBits/pallocChunkBytes/8))
s.i.min.Store(int32(min / 8))
s.i.max.Store(int32(max / 8))
return s
}
func (s *ScavengeIndex) Find() (ChunkIdx, uint) {
ci, off := s.i.find()
return ChunkIdx(ci), off
}
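
// Mark notes that the address range [base, limit) may contain pages
// available to scavenge, making the corresponding chunks visible to Find.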
func (s *ScavengeIndex) Mark(base, limit uintptr) {
s.i.mark(base, limit)
}
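
// Clear marks the chunk at index ci as containing no pages available to
// scavenge, typically after it has been searched or scavenged.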
func (s *ScavengeIndex) Clear(ci ChunkIdx) {
s.i.clear(chunkIdx(ci))
}
const GTrackingPeriod = gTrackingPeriod
var ZeroBase = unsafe.Pointer(&zerobase)
const UserArenaChunkBytes = userArenaChunkBytes
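
// UserArena wraps the runtime's internal userArena so that tests can
// allocate from and free user arenas directly.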
type UserArena struct {
arena *userArena
}
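
// NewUserArena returns a new, empty user arena.
//
// A minimal sketch of intended use from a test (T stands in for any
// concrete type):
//
//	a := NewUserArena()
//	var x any = (*T)(nil)
//	a.New(&x) // x now holds a *T pointing into the arena
//	p := x.(*T)
//	a.Free() // chunks are recycled once nothing points into them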
func NewUserArena() *UserArena {
return &UserArena{newUserArena()}
}
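
// New allocates a value from the arena. On entry *out must hold a value of
// some pointer type *T; on return *out holds a *T pointing at a new T
// allocated in the arena. It panics if *out does not hold a pointer type.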
func (a *UserArena) New(out *any) {
i := efaceOf(out)
typ := i._type
if typ.kind&kindMask != kindPtr {
panic("new result of non-ptr type")
}
typ = (*ptrtype)(unsafe.Pointer(typ)).elem
i.data = a.arena.new(typ)
}
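
// Slice allocates an arena-backed store of cap elements for the slice
// pointed to by sl, which must be a pointer to a slice (*[]T).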
func (a *UserArena) Slice(sl any, cap int) {
a.arena.slice(sl, cap)
}
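
// Free frees the arena. Its chunks are quarantined and may only be reused
// once the GC observes that no pointers into them remain.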
func (a *UserArena) Free() {
a.arena.free()
}
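
// GlobalWaitingArenaChunks returns the number of user arena chunks sitting
// on the heap's global quarantine list, waiting to be reused.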
func GlobalWaitingArenaChunks() int {
n := 0
systemstack(func() {
lock(&mheap_.lock)
for s := mheap_.userArena.quarantineList.first; s != nil; s = s.next {
n++
}
unlock(&mheap_.lock)
})
return n
}
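
// UserArenaClone exposes the runtime implementation behind arena cloning:
// if s refers to arena-allocated memory it returns a copy backed by the
// regular Go heap, otherwise it returns s unchanged.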
func UserArenaClone[T any](s T) T {
return arena_heapify(s).(T)
}
var AlignUp = alignUp
// BlockUntilEmptyFinalizerQueue blocks until either the finalizer
// queue is emptied (and the finalizers have executed) or the timeout
// is reached. Returns true if the finalizer queue was emptied.
func BlockUntilEmptyFinalizerQueue(timeout int64) bool {
start := nanotime()
for nanotime()-start < timeout {
lock(&finlock)
// We know the queue has been drained when both finq is nil
// and the finalizer g has stopped executing.
empty := finq == nil
empty = empty && readgstatus(fing) == _Gwaiting && fing.waitreason == waitReasonFinalizerWait
unlock(&finlock)
if empty {
return true
}
Gosched()
}
return false
}
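
// FrameStartLine returns the start line of the function in this frame.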
func FrameStartLine(f *Frame) int {
return f.startLine
}
// PersistentAlloc allocates some memory that lives outside the Go heap.
// This memory will never be freed; use sparingly.
func PersistentAlloc(n uintptr) unsafe.Pointer {
return persistentalloc(n, 0, &memstats.other_sys)
}