2016-03-01 22:57:46 +00:00
|
|
|
// Copyright 2010 The Go Authors. All rights reserved.
|
2010-10-25 17:55:50 -07:00
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
|
|
|
|
|
// Export guts for testing.
|
|
|
|
|
|
|
|
|
|
package runtime
|
|
|
|
|
|
2015-11-02 14:09:24 -05:00
|
|
|
import (
|
2021-06-17 19:10:18 +00:00
|
|
|
"internal/goarch"
|
2021-06-16 21:52:24 +00:00
|
|
|
"internal/goos"
|
2015-11-02 14:09:24 -05:00
|
|
|
"runtime/internal/atomic"
|
2015-11-11 12:39:30 -05:00
|
|
|
"runtime/internal/sys"
|
2015-11-02 14:09:24 -05:00
|
|
|
"unsafe"
|
|
|
|
|
)
|
2014-08-21 21:10:45 +04:00
|
|
|
|
2010-10-25 17:55:50 -07:00
|
|
|
var Fadd64 = fadd64
|
|
|
|
|
var Fsub64 = fsub64
|
|
|
|
|
var Fmul64 = fmul64
|
|
|
|
|
var Fdiv64 = fdiv64
|
|
|
|
|
var F64to32 = f64to32
|
|
|
|
|
var F32to64 = f32to64
|
|
|
|
|
var Fcmp64 = fcmp64
|
|
|
|
|
var Fintto64 = fintto64
|
|
|
|
|
var F64toint = f64toint
|
2011-07-19 11:01:17 -04:00
|
|
|
|
|
|
|
|
var Entersyscall = entersyscall
|
|
|
|
|
var Exitsyscall = exitsyscall
|
2014-08-27 23:32:49 -04:00
|
|
|
var LockedOSThread = lockedOSThread
|
2015-11-02 14:09:24 -05:00
|
|
|
var Xadduintptr = atomic.Xadduintptr
|
2012-04-12 11:49:25 +04:00
|
|
|
|
2015-09-14 14:03:45 -07:00
|
|
|
var Fastlog2 = fastlog2
|
|
|
|
|
|
2016-10-30 01:54:19 +02:00
|
|
|
var Atoi = atoi
|
|
|
|
|
var Atoi32 = atoi32
|
2022-02-15 00:22:20 +00:00
|
|
|
var ParseByteCount = parseByteCount
|
2016-10-30 01:54:19 +02:00
|
|
|
|
2019-03-29 10:43:31 -07:00
|
|
|
var Nanotime = nanotime
|
2019-04-05 15:53:12 -07:00
|
|
|
var NetpollBreak = netpollBreak
|
|
|
|
|
var Usleep = usleep
|
2019-03-29 10:43:31 -07:00
|
|
|
|
2019-08-21 00:24:25 +00:00
|
|
|
var PhysPageSize = physPageSize
|
2019-04-29 21:02:18 +00:00
|
|
|
var PhysHugePageSize = physHugePageSize
|
|
|
|
|
|
2019-10-29 15:35:42 +01:00
|
|
|
var NetpollGenericInit = netpollGenericInit
|
|
|
|
|
|
2019-12-27 12:02:50 -05:00
|
|
|
var Memmove = memmove
|
|
|
|
|
var MemclrNoHeapPointers = memclrNoHeapPointers
|
|
|
|
|
|
2023-03-13 14:02:16 -04:00
|
|
|
const TracebackInnerFrames = tracebackInnerFrames
|
|
|
|
|
const TracebackOuterFrames = tracebackOuterFrames
|
|
|
|
|
|
2021-03-09 16:13:23 -05:00
|
|
|
var LockPartialOrder = lockPartialOrder
|
|
|
|
|
|
|
|
|
|
type LockRank lockRank
|
|
|
|
|
|
|
|
|
|
func (l LockRank) String() string {
|
|
|
|
|
return lockRank(l).String()
|
|
|
|
|
}
|
|
|
|
|
|
2019-04-25 14:10:29 -04:00
|
|
|
const PreemptMSupported = preemptMSupported
|
|
|
|
|
|
2012-04-12 11:49:25 +04:00
|
|
|
type LFNode struct {
|
2014-10-27 15:57:07 -04:00
|
|
|
Next uint64
|
2012-04-12 11:49:25 +04:00
|
|
|
Pushcnt uintptr
|
|
|
|
|
}
|
|
|
|
|
|
2014-08-21 21:10:45 +04:00
|
|
|
func LFStackPush(head *uint64, node *LFNode) {
|
2017-03-07 16:38:29 -05:00
|
|
|
(*lfstack)(head).push((*lfnode)(unsafe.Pointer(node)))
|
2014-08-21 21:10:45 +04:00
|
|
|
}
|
2012-04-12 11:49:25 +04:00
|
|
|
|
2014-08-21 21:10:45 +04:00
|
|
|
func LFStackPop(head *uint64) *LFNode {
|
2017-03-07 16:38:29 -05:00
|
|
|
return (*LFNode)(unsafe.Pointer((*lfstack)(head).pop()))
|
2014-08-21 21:10:45 +04:00
|
|
|
}
|
2022-11-08 17:48:48 -08:00
|
|
|
func LFNodeValidate(node *LFNode) {
|
|
|
|
|
lfnodeValidate((*lfnode)(unsafe.Pointer(node)))
|
|
|
|
|
}
|
2012-05-11 10:50:03 +04:00
|
|
|
|
2019-10-22 00:38:08 -07:00
|
|
|
func Netpoll(delta int64) {
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
netpoll(delta)
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
2021-12-01 12:15:45 -05:00
|
|
|
func GCMask(x any) (ret []byte) {
|
[dev.cc] runtime: delete scalararg, ptrarg; rename onM to systemstack
Scalararg and ptrarg are not "signal safe".
Go code filling them out can be interrupted by a signal,
and then the signal handler runs, and if it also ends up
in Go code that uses scalararg or ptrarg, now the old
values have been smashed.
For the pieces of code that do need to run in a signal handler,
we introduced onM_signalok, which is really just onM
except that the _signalok is meant to convey that the caller
asserts that scalarg and ptrarg will be restored to their old
values after the call (instead of the usual behavior, zeroing them).
Scalararg and ptrarg are also untyped and therefore error-prone.
Go code can always pass a closure instead of using scalararg
and ptrarg; they were only really necessary for C code.
And there's no more C code.
For all these reasons, delete scalararg and ptrarg, converting
the few remaining references to use closures.
Once those are gone, there is no need for a distinction between
onM and onM_signalok, so replace both with a single function
equivalent to the current onM_signalok (that is, it can be called
on any of the curg, g0, and gsignal stacks).
The name onM and the phrase 'm stack' are misnomers,
because on most system an M has two system stacks:
the main thread stack and the signal handling stack.
Correct the misnomer by naming the replacement function systemstack.
Fix a few references to "M stack" in code.
The main motivation for this change is to eliminate scalararg/ptrarg.
Rick and I have already seen them cause problems because
the calling sequence m.ptrarg[0] = p is a heap pointer assignment,
so it gets a write barrier. The write barrier also uses onM, so it has
all the same problems as if it were being invoked by a signal handler.
We worked around this by saving and restoring the old values
and by calling onM_signalok, but there's no point in keeping this nice
home for bugs around any longer.
This CL also changes funcline to return the file name as a result
instead of filling in a passed-in *string. (The *string signature is
left over from when the code was written in and called from C.)
That's arguably an unrelated change, except that once I had done
the ptrarg/scalararg/onM cleanup I started getting false positives
about the *string argument escaping (not allowed in package runtime).
The compiler is wrong, but the easiest fix is to write the code like
Go code instead of like C code. I am a bit worried that the compiler
is wrong because of some use of uninitialized memory in the escape
analysis. If that's the reason, it will go away when we convert the
compiler to Go. (And if not, we'll debug it the next time.)
LGTM=khr
R=r, khr
CC=austin, golang-codereviews, iant, rlh
https://golang.org/cl/174950043
2014-11-12 14:54:31 -05:00
|
|
|
systemstack(func() {
|
2015-04-28 00:28:47 -04:00
|
|
|
ret = getgcmask(x)
|
2014-09-05 14:59:31 -07:00
|
|
|
})
|
|
|
|
|
return
|
|
|
|
|
}
|
2014-07-29 11:01:02 +04:00
|
|
|
|
2014-09-06 10:07:23 -07:00
|
|
|
func RunSchedLocalQueueTest() {
|
2021-02-09 15:48:41 -05:00
|
|
|
pp := new(p)
|
|
|
|
|
gs := make([]g, len(pp.runq))
|
2022-04-25 17:21:58 -04:00
|
|
|
Escape(gs) // Ensure gs doesn't move, since we use guintptrs
|
2021-02-09 15:48:41 -05:00
|
|
|
for i := 0; i < len(pp.runq); i++ {
|
|
|
|
|
if g, _ := runqget(pp); g != nil {
|
2016-03-12 16:41:08 -07:00
|
|
|
throw("runq is not empty initially")
|
|
|
|
|
}
|
|
|
|
|
for j := 0; j < i; j++ {
|
2021-02-09 15:48:41 -05:00
|
|
|
runqput(pp, &gs[i], false)
|
2016-03-12 16:41:08 -07:00
|
|
|
}
|
|
|
|
|
for j := 0; j < i; j++ {
|
2021-02-09 15:48:41 -05:00
|
|
|
if g, _ := runqget(pp); g != &gs[i] {
|
2016-03-12 16:41:08 -07:00
|
|
|
print("bad element at iter ", i, "/", j, "\n")
|
|
|
|
|
throw("bad element")
|
|
|
|
|
}
|
|
|
|
|
}
|
2021-02-09 15:48:41 -05:00
|
|
|
if g, _ := runqget(pp); g != nil {
|
2016-03-12 16:41:08 -07:00
|
|
|
throw("runq is not empty afterwards")
|
|
|
|
|
}
|
|
|
|
|
}
|
2014-09-06 10:07:23 -07:00
|
|
|
}
|
2016-03-12 16:41:08 -07:00
|
|
|
|
2014-09-06 10:07:23 -07:00
|
|
|
func RunSchedLocalQueueStealTest() {
|
2016-03-12 16:41:08 -07:00
|
|
|
p1 := new(p)
|
|
|
|
|
p2 := new(p)
|
|
|
|
|
gs := make([]g, len(p1.runq))
|
2022-04-25 17:21:58 -04:00
|
|
|
Escape(gs) // Ensure gs doesn't move, since we use guintptrs
|
2016-03-12 16:41:08 -07:00
|
|
|
for i := 0; i < len(p1.runq); i++ {
|
|
|
|
|
for j := 0; j < i; j++ {
|
|
|
|
|
gs[j].sig = 0
|
|
|
|
|
runqput(p1, &gs[j], false)
|
|
|
|
|
}
|
|
|
|
|
gp := runqsteal(p2, p1, true)
|
|
|
|
|
s := 0
|
|
|
|
|
if gp != nil {
|
|
|
|
|
s++
|
|
|
|
|
gp.sig++
|
|
|
|
|
}
|
|
|
|
|
for {
|
|
|
|
|
gp, _ = runqget(p2)
|
|
|
|
|
if gp == nil {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
s++
|
|
|
|
|
gp.sig++
|
|
|
|
|
}
|
|
|
|
|
for {
|
|
|
|
|
gp, _ = runqget(p1)
|
|
|
|
|
if gp == nil {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
gp.sig++
|
|
|
|
|
}
|
|
|
|
|
for j := 0; j < i; j++ {
|
|
|
|
|
if gs[j].sig != 1 {
|
|
|
|
|
print("bad element ", j, "(", gs[j].sig, ") at iter ", i, "\n")
|
|
|
|
|
throw("bad element")
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if s != i/2 && s != i/2+1 {
|
|
|
|
|
print("bad steal ", s, ", want ", i/2, " or ", i/2+1, ", iter ", i, "\n")
|
|
|
|
|
throw("bad steal")
|
|
|
|
|
}
|
|
|
|
|
}
|
2014-09-06 10:07:23 -07:00
|
|
|
}
|
2013-09-06 16:23:46 -07:00
|
|
|
|
2016-03-18 16:34:11 +01:00
|
|
|
func RunSchedLocalQueueEmptyTest(iters int) {
|
|
|
|
|
// Test that runq is not spuriously reported as empty.
|
|
|
|
|
// Runq emptiness affects scheduling decisions and spurious emptiness
|
|
|
|
|
// can lead to underutilization (both runnable Gs and idle Ps coexist
|
|
|
|
|
// for arbitrary long time).
|
|
|
|
|
done := make(chan bool, 1)
|
|
|
|
|
p := new(p)
|
|
|
|
|
gs := make([]g, 2)
|
2022-04-25 17:21:58 -04:00
|
|
|
Escape(gs) // Ensure gs doesn't move, since we use guintptrs
|
2016-03-18 16:34:11 +01:00
|
|
|
ready := new(uint32)
|
|
|
|
|
for i := 0; i < iters; i++ {
|
|
|
|
|
*ready = 0
|
|
|
|
|
next0 := (i & 1) == 0
|
|
|
|
|
next1 := (i & 2) == 0
|
|
|
|
|
runqput(p, &gs[0], next0)
|
|
|
|
|
go func() {
|
2021-06-03 18:29:05 -04:00
|
|
|
for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; {
|
2016-03-18 16:34:11 +01:00
|
|
|
}
|
2021-06-03 18:29:05 -04:00
|
|
|
if runqempty(p) {
|
|
|
|
|
println("next:", next0, next1)
|
2016-03-18 16:34:11 +01:00
|
|
|
throw("queue is empty")
|
|
|
|
|
}
|
2021-06-03 18:29:05 -04:00
|
|
|
done <- true
|
2016-03-18 16:34:11 +01:00
|
|
|
}()
|
|
|
|
|
for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; {
|
|
|
|
|
}
|
|
|
|
|
runqput(p, &gs[1], next1)
|
|
|
|
|
runqget(p)
|
|
|
|
|
<-done
|
|
|
|
|
runqget(p)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2017-08-27 14:05:11 +02:00
|
|
|
var (
|
|
|
|
|
StringHash = stringHash
|
|
|
|
|
BytesHash = bytesHash
|
|
|
|
|
Int32Hash = int32Hash
|
|
|
|
|
Int64Hash = int64Hash
|
|
|
|
|
MemHash = memhash
|
|
|
|
|
MemHash32 = memhash32
|
|
|
|
|
MemHash64 = memhash64
|
|
|
|
|
EfaceHash = efaceHash
|
|
|
|
|
IfaceHash = ifaceHash
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
var UseAeshash = &useAeshash
|
2016-10-17 18:41:56 -04:00
|
|
|
|
|
|
|
|
func MemclrBytes(b []byte) {
|
|
|
|
|
s := (*slice)(unsafe.Pointer(&b))
|
|
|
|
|
memclrNoHeapPointers(s.array, uintptr(s.len))
|
|
|
|
|
}
|
2013-09-13 14:19:23 -04:00
|
|
|
|
2021-07-25 23:15:15 +00:00
|
|
|
const HashLoad = hashLoad
|
2014-02-06 17:43:22 -08:00
|
|
|
|
2014-09-05 15:01:09 -07:00
|
|
|
// entry point for testing
|
|
|
|
|
func GostringW(w []uint16) (s string) {
|
[dev.cc] runtime: delete scalararg, ptrarg; rename onM to systemstack
Scalararg and ptrarg are not "signal safe".
Go code filling them out can be interrupted by a signal,
and then the signal handler runs, and if it also ends up
in Go code that uses scalararg or ptrarg, now the old
values have been smashed.
For the pieces of code that do need to run in a signal handler,
we introduced onM_signalok, which is really just onM
except that the _signalok is meant to convey that the caller
asserts that scalarg and ptrarg will be restored to their old
values after the call (instead of the usual behavior, zeroing them).
Scalararg and ptrarg are also untyped and therefore error-prone.
Go code can always pass a closure instead of using scalararg
and ptrarg; they were only really necessary for C code.
And there's no more C code.
For all these reasons, delete scalararg and ptrarg, converting
the few remaining references to use closures.
Once those are gone, there is no need for a distinction between
onM and onM_signalok, so replace both with a single function
equivalent to the current onM_signalok (that is, it can be called
on any of the curg, g0, and gsignal stacks).
The name onM and the phrase 'm stack' are misnomers,
because on most system an M has two system stacks:
the main thread stack and the signal handling stack.
Correct the misnomer by naming the replacement function systemstack.
Fix a few references to "M stack" in code.
The main motivation for this change is to eliminate scalararg/ptrarg.
Rick and I have already seen them cause problems because
the calling sequence m.ptrarg[0] = p is a heap pointer assignment,
so it gets a write barrier. The write barrier also uses onM, so it has
all the same problems as if it were being invoked by a signal handler.
We worked around this by saving and restoring the old values
and by calling onM_signalok, but there's no point in keeping this nice
home for bugs around any longer.
This CL also changes funcline to return the file name as a result
instead of filling in a passed-in *string. (The *string signature is
left over from when the code was written in and called from C.)
That's arguably an unrelated change, except that once I had done
the ptrarg/scalararg/onM cleanup I started getting false positives
about the *string argument escaping (not allowed in package runtime).
The compiler is wrong, but the easiest fix is to write the code like
Go code instead of like C code. I am a bit worried that the compiler
is wrong because of some use of uninitialized memory in the escape
analysis. If that's the reason, it will go away when we convert the
compiler to Go. (And if not, we'll debug it the next time.)
LGTM=khr
R=r, khr
CC=austin, golang-codereviews, iant, rlh
https://golang.org/cl/174950043
2014-11-12 14:54:31 -05:00
|
|
|
systemstack(func() {
|
2014-09-05 15:01:09 -07:00
|
|
|
s = gostringw(&w[0])
|
|
|
|
|
})
|
|
|
|
|
return
|
|
|
|
|
}
|
2014-09-11 16:53:34 -07:00
|
|
|
|
2015-03-02 20:16:48 -08:00
|
|
|
var Open = open
|
2015-04-13 19:37:04 -04:00
|
|
|
var Close = closefd
|
2015-03-02 20:16:48 -08:00
|
|
|
var Read = read
|
|
|
|
|
var Write = write
|
2015-03-03 13:55:22 -05:00
|
|
|
|
|
|
|
|
func Envs() []string { return envs }
|
|
|
|
|
func SetEnvs(e []string) { envs = e }
|
2015-04-16 14:32:18 -07:00
|
|
|
|
2015-05-02 22:59:35 -04:00
|
|
|
// For benchmarking.
|
|
|
|
|
|
2021-12-01 12:15:45 -05:00
|
|
|
func BenchSetType(n int, x any) {
|
2015-10-21 12:12:25 -07:00
|
|
|
e := *efaceOf(&x)
|
2015-05-02 22:59:35 -04:00
|
|
|
t := e._type
|
|
|
|
|
var size uintptr
|
|
|
|
|
var p unsafe.Pointer
|
|
|
|
|
switch t.kind & kindMask {
|
2015-10-12 16:01:51 -07:00
|
|
|
case kindPtr:
|
2015-05-02 22:59:35 -04:00
|
|
|
t = (*ptrtype)(unsafe.Pointer(t)).elem
|
|
|
|
|
size = t.size
|
|
|
|
|
p = e.data
|
2015-10-12 16:01:51 -07:00
|
|
|
case kindSlice:
|
2015-05-02 22:59:35 -04:00
|
|
|
slice := *(*struct {
|
|
|
|
|
ptr unsafe.Pointer
|
|
|
|
|
len, cap uintptr
|
|
|
|
|
})(e.data)
|
|
|
|
|
t = (*slicetype)(unsafe.Pointer(t)).elem
|
|
|
|
|
size = t.size * slice.len
|
|
|
|
|
p = slice.ptr
|
|
|
|
|
}
|
|
|
|
|
allocSize := roundupsize(size)
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
for i := 0; i < n; i++ {
|
|
|
|
|
heapBitsSetType(uintptr(p), allocSize, size, t)
|
|
|
|
|
}
|
|
|
|
|
})
|
|
|
|
|
}
|
2015-05-15 14:23:23 -04:00
|
|
|
|
2021-06-16 23:05:44 +00:00
|
|
|
const PtrSize = goarch.PtrSize
|
2015-07-30 00:46:42 -04:00
|
|
|
|
2015-08-05 11:35:28 -04:00
|
|
|
var ForceGCPeriod = &forcegcperiod
|
2016-01-14 16:43:40 -05:00
|
|
|
|
|
|
|
|
// SetTracebackEnv is like runtime/debug.SetTraceback, but it raises
|
|
|
|
|
// the "environment" traceback level, so later calls to
|
|
|
|
|
// debug.SetTraceback (e.g., from testing timeouts) can't lower it.
|
|
|
|
|
func SetTracebackEnv(level string) {
|
|
|
|
|
setTraceback(level)
|
|
|
|
|
traceback_env = traceback_cache
|
|
|
|
|
}
|
2016-03-17 13:28:04 -07:00
|
|
|
|
|
|
|
|
var ReadUnaligned32 = readUnaligned32
|
|
|
|
|
var ReadUnaligned64 = readUnaligned64
|
runtime: fix pagesInUse accounting
When we grow the heap, we create a temporary "in use" span for the
memory acquired from the OS and then free that span to link it into
the heap. Hence, we (1) increase pagesInUse when we make the temporary
span so that (2) freeing the span will correctly decrease it.
However, currently step (1) increases pagesInUse by the number of
pages requested from the heap, while step (2) decreases it by the
number of pages requested from the OS (the size of the temporary
span). These aren't necessarily the same, since we round up the number
of pages we request from the OS, so steps 1 and 2 don't necessarily
cancel out like they're supposed to. Over time, this can add up and
cause pagesInUse to underflow and wrap around to 2^64. The garbage
collector computes the sweep ratio from this, so if this happens, the
sweep ratio becomes effectively infinite, causing the first allocation
on each P in a sweep cycle to sweep the entire heap. This makes
sweeping effectively STW.
Fix this by increasing pagesInUse in step 1 by the number of pages
requested from the OS, so that the two steps correctly cancel out. We
add a test that checks that the running total matches the actual state
of the heap.
Fixes #15022. For 1.6.x.
Change-Id: Iefd9d6abe37d0d447cbdbdf9941662e4f18eeffc
Reviewed-on: https://go-review.googlesource.com/21280
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
2016-03-29 12:28:24 -04:00
|
|
|
|
|
|
|
|
func CountPagesInUse() (pagesInUse, counted uintptr) {
|
|
|
|
|
stopTheWorld("CountPagesInUse")
|
|
|
|
|
|
runtime: retype mheap.pagesInUse as atomic.Uint64
[git-generate]
cd src/runtime
mv export_test.go export.go
GOROOT=$(dirname $(dirname $PWD)) rf '
add mheap.pagesInUse \
// Proportional sweep \
// \
// These parameters represent a linear function from gcController.heapLive \
// to page sweep count. The proportional sweep system works to \
// stay in the black by keeping the current page sweep count \
// above this line at the current gcController.heapLive. \
// \
// The line has slope sweepPagesPerByte and passes through a \
// basis point at (sweepHeapLiveBasis, pagesSweptBasis). At \
// any given time, the system is at (gcController.heapLive, \
// pagesSwept) in this space. \
// \
// It is important that the line pass through a point we \
// control rather than simply starting at a 0,0 origin \
// because that lets us adjust sweep pacing at any time while \
// accounting for current progress. If we could only adjust \
// the slope, it would create a discontinuity in debt if any \
// progress has already been made. \
pagesInUse_ atomic.Uint64 // pages of spans in stats mSpanInUse
ex {
import "runtime/internal/atomic"
var t mheap
var v, w uint64
var d int64
t.pagesInUse -> t.pagesInUse_.Load()
t.pagesInUse = v -> t.pagesInUse_.Store(v)
atomic.Load64(&t.pagesInUse) -> t.pagesInUse_.Load()
atomic.LoadAcq64(&t.pagesInUse) -> t.pagesInUse_.LoadAcquire()
atomic.Store64(&t.pagesInUse, v) -> t.pagesInUse_.Store(v)
atomic.StoreRel64(&t.pagesInUse, v) -> t.pagesInUse_.StoreRelease(v)
atomic.Cas64(&t.pagesInUse, v, w) -> t.pagesInUse_.CompareAndSwap(v, w)
atomic.Xchg64(&t.pagesInUse, v) -> t.pagesInUse_.Swap(v)
atomic.Xadd64(&t.pagesInUse, d) -> t.pagesInUse_.Add(d)
}
rm mheap.pagesInUse
mv mheap.pagesInUse_ mheap.pagesInUse
'
mv export.go export_test.go
Change-Id: I495d188683dba0778518563c46755b5ad43be298
Reviewed-on: https://go-review.googlesource.com/c/go/+/356549
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Austin Clements <austin@google.com>
2021-10-15 19:22:10 +00:00
|
|
|
pagesInUse = uintptr(mheap_.pagesInUse.Load())
|
runtime: fix pagesInUse accounting
When we grow the heap, we create a temporary "in use" span for the
memory acquired from the OS and then free that span to link it into
the heap. Hence, we (1) increase pagesInUse when we make the temporary
span so that (2) freeing the span will correctly decrease it.
However, currently step (1) increases pagesInUse by the number of
pages requested from the heap, while step (2) decreases it by the
number of pages requested from the OS (the size of the temporary
span). These aren't necessarily the same, since we round up the number
of pages we request from the OS, so steps 1 and 2 don't necessarily
cancel out like they're supposed to. Over time, this can add up and
cause pagesInUse to underflow and wrap around to 2^64. The garbage
collector computes the sweep ratio from this, so if this happens, the
sweep ratio becomes effectively infinite, causing the first allocation
on each P in a sweep cycle to sweep the entire heap. This makes
sweeping effectively STW.
Fix this by increasing pagesInUse in step 1 by the number of pages
requested from the OS, so that the two steps correctly cancel out. We
add a test that checks that the running total matches the actual state
of the heap.
Fixes #15022. For 1.6.x.
Change-Id: Iefd9d6abe37d0d447cbdbdf9941662e4f18eeffc
Reviewed-on: https://go-review.googlesource.com/21280
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
2016-03-29 12:28:24 -04:00
|
|
|
|
2016-10-04 15:51:31 -04:00
|
|
|
for _, s := range mheap_.allspans {
|
runtime: atomically set span state and use as publication barrier
When everything is working correctly, any pointer the garbage
collector encounters can only point into a fully initialized heap
span, since the span must have been initialized before that pointer
could escape the heap allocator and become visible to the GC.
However, in various cases, we try to be defensive against bad
pointers. In findObject, this is just a sanity check: we never expect
to find a bad pointer, but programming errors can lead to them. In
spanOfHeap, we don't necessarily trust the pointer and we're trying to
check if it really does point to the heap, though it should always
point to something. Conservative scanning takes this to a new level,
since it can only guess that a word may be a pointer and verify this.
In all of these cases, we have a problem that the span lookup and
check can race with span initialization, since the span becomes
visible to lookups before it's fully initialized.
Furthermore, we're about to start initializing the span without the
heap lock held, which is going to introduce races where accesses were
previously protected by the heap lock.
To address this, this CL makes accesses to mspan.state atomic, and
ensures that the span is fully initialized before setting the state to
mSpanInUse. All loads are now atomic, and in any case where we don't
trust the pointer, it first atomically loads the span state and checks
that it's mSpanInUse, after which it will have synchronized with span
initialization and can safely check the other span fields.
For #10958, #24543, but a good fix in general.
Change-Id: I518b7c63555b02064b98aa5f802c92b758fef853
Reviewed-on: https://go-review.googlesource.com/c/go/+/203286
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2019-10-23 11:25:38 -04:00
|
|
|
if s.state.get() == mSpanInUse {
|
runtime: fix pagesInUse accounting
When we grow the heap, we create a temporary "in use" span for the
memory acquired from the OS and then free that span to link it into
the heap. Hence, we (1) increase pagesInUse when we make the temporary
span so that (2) freeing the span will correctly decrease it.
However, currently step (1) increases pagesInUse by the number of
pages requested from the heap, while step (2) decreases it by the
number of pages requested from the OS (the size of the temporary
span). These aren't necessarily the same, since we round up the number
of pages we request from the OS, so steps 1 and 2 don't necessarily
cancel out like they're supposed to. Over time, this can add up and
cause pagesInUse to underflow and wrap around to 2^64. The garbage
collector computes the sweep ratio from this, so if this happens, the
sweep ratio becomes effectively infinite, causing the first allocation
on each P in a sweep cycle to sweep the entire heap. This makes
sweeping effectively STW.
Fix this by increasing pagesInUse in step 1 by the number of pages
requested from the OS, so that the two steps correctly cancel out. We
add a test that checks that the running total matches the actual state
of the heap.
Fixes #15022. For 1.6.x.
Change-Id: Iefd9d6abe37d0d447cbdbdf9941662e4f18eeffc
Reviewed-on: https://go-review.googlesource.com/21280
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
2016-03-29 12:28:24 -04:00
|
|
|
counted += s.npages
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
startTheWorld()
|
|
|
|
|
|
|
|
|
|
return
|
|
|
|
|
}
|
2017-01-05 09:36:27 +03:00
|
|
|
|
2017-02-13 12:46:17 -08:00
|
|
|
func Fastrand() uint32 { return fastrand() }
|
2022-04-18 15:23:20 +08:00
|
|
|
func Fastrand64() uint64 { return fastrand64() }
|
2017-02-13 12:46:17 -08:00
|
|
|
func Fastrandn(n uint32) uint32 { return fastrandn(n) }
|
2017-02-09 13:58:48 -05:00
|
|
|
|
|
|
|
|
type ProfBuf profBuf
|
|
|
|
|
|
|
|
|
|
func NewProfBuf(hdrsize, bufwords, tags int) *ProfBuf {
|
|
|
|
|
return (*ProfBuf)(newProfBuf(hdrsize, bufwords, tags))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (p *ProfBuf) Write(tag *unsafe.Pointer, now int64, hdr []uint64, stk []uintptr) {
|
|
|
|
|
(*profBuf)(p).write(tag, now, hdr, stk)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const (
|
|
|
|
|
ProfBufBlocking = profBufBlocking
|
|
|
|
|
ProfBufNonBlocking = profBufNonBlocking
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
func (p *ProfBuf) Read(mode profBufReadMode) ([]uint64, []unsafe.Pointer, bool) {
|
|
|
|
|
return (*profBuf)(p).read(profBufReadMode(mode))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (p *ProfBuf) Close() {
|
|
|
|
|
(*profBuf)(p).close()
|
|
|
|
|
}
|
2017-01-03 10:15:55 -07:00
|
|
|
|
2020-07-01 16:02:42 +00:00
|
|
|
func ReadMetricsSlow(memStats *MemStats, samplesp unsafe.Pointer, len, cap int) {
|
|
|
|
|
stopTheWorld("ReadMetricsSlow")
|
|
|
|
|
|
|
|
|
|
// Initialize the metrics beforehand because this could
|
|
|
|
|
// allocate and skew the stats.
|
2022-06-28 15:17:12 -04:00
|
|
|
metricsLock()
|
2020-07-01 16:02:42 +00:00
|
|
|
initMetrics()
|
2022-06-28 15:17:12 -04:00
|
|
|
metricsUnlock()
|
2020-07-01 16:02:42 +00:00
|
|
|
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
// Read memstats first. It's going to flush
|
|
|
|
|
// the mcaches which readMetrics does not do, so
|
|
|
|
|
// going the other way around may result in
|
|
|
|
|
// inconsistent statistics.
|
|
|
|
|
readmemstats_m(memStats)
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
// Read metrics off the system stack.
|
|
|
|
|
//
|
|
|
|
|
// The only part of readMetrics that could allocate
|
|
|
|
|
// and skew the stats is initMetrics.
|
|
|
|
|
readMetrics(samplesp, len, cap)
|
|
|
|
|
|
|
|
|
|
startTheWorld()
|
|
|
|
|
}
|
|
|
|
|
|
2017-01-03 10:15:55 -07:00
|
|
|
// ReadMemStatsSlow returns both the runtime-computed MemStats and
|
|
|
|
|
// MemStats accumulated by scanning the heap.
|
|
|
|
|
func ReadMemStatsSlow() (base, slow MemStats) {
|
|
|
|
|
stopTheWorld("ReadMemStatsSlow")
|
|
|
|
|
|
|
|
|
|
// Run on the system stack to avoid stack growth allocation.
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
// Make sure stats don't change.
|
|
|
|
|
getg().m.mallocing++
|
|
|
|
|
|
|
|
|
|
readmemstats_m(&base)
|
|
|
|
|
|
|
|
|
|
// Initialize slow from base and zero the fields we're
|
|
|
|
|
// recomputing.
|
|
|
|
|
slow = base
|
|
|
|
|
slow.Alloc = 0
|
|
|
|
|
slow.TotalAlloc = 0
|
|
|
|
|
slow.Mallocs = 0
|
|
|
|
|
slow.Frees = 0
|
2018-10-15 23:00:58 +00:00
|
|
|
slow.HeapReleased = 0
|
2017-01-03 10:15:55 -07:00
|
|
|
var bySize [_NumSizeClasses]struct {
|
|
|
|
|
Mallocs, Frees uint64
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Add up current allocations in spans.
|
|
|
|
|
for _, s := range mheap_.allspans {
|
runtime: atomically set span state and use as publication barrier
When everything is working correctly, any pointer the garbage
collector encounters can only point into a fully initialized heap
span, since the span must have been initialized before that pointer
could escape the heap allocator and become visible to the GC.
However, in various cases, we try to be defensive against bad
pointers. In findObject, this is just a sanity check: we never expect
to find a bad pointer, but programming errors can lead to them. In
spanOfHeap, we don't necessarily trust the pointer and we're trying to
check if it really does point to the heap, though it should always
point to something. Conservative scanning takes this to a new level,
since it can only guess that a word may be a pointer and verify this.
In all of these cases, we have a problem that the span lookup and
check can race with span initialization, since the span becomes
visible to lookups before it's fully initialized.
Furthermore, we're about to start initializing the span without the
heap lock held, which is going to introduce races where accesses were
previously protected by the heap lock.
To address this, this CL makes accesses to mspan.state atomic, and
ensures that the span is fully initialized before setting the state to
mSpanInUse. All loads are now atomic, and in any case where we don't
trust the pointer, it first atomically loads the span state and checks
that it's mSpanInUse, after which it will have synchronized with span
initialization and can safely check the other span fields.
For #10958, #24543, but a good fix in general.
Change-Id: I518b7c63555b02064b98aa5f802c92b758fef853
Reviewed-on: https://go-review.googlesource.com/c/go/+/203286
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2019-10-23 11:25:38 -04:00
|
|
|
if s.state.get() != mSpanInUse {
|
2017-01-03 10:15:55 -07:00
|
|
|
continue
|
|
|
|
|
}
|
runtime: add safe arena support to the runtime
This change adds an API to the runtime for arenas. A later CL can
potentially export it as an experimental API, but for now, just the
runtime implementation will suffice.
The purpose of arenas is to improve efficiency, primarily by allowing
for an application to manually free memory, thereby delaying garbage
collection. It comes with other potential performance benefits, such as
better locality, a better allocation strategy, and better handling of
interior pointers by the GC.
This implementation is based on one by danscales@google.com with a few
significant differences:
* The implementation lives entirely in the runtime (all layers).
* Arena chunks are the minimum of 8 MiB or the heap arena size. This
choice is made because in practice 64 MiB appears to be way too large
of an area for most real-world use-cases.
* Arena chunks are not unmapped, instead they're placed on an evacuation
list and when there are no pointers left pointing into them, they're
allowed to be reused.
* Reusing partially-used arena chunks no longer tries to find one used
by the same P first; it just takes the first one available.
* In order to ensure worst-case fragmentation is never worse than 25%,
only types and slice backing stores whose sizes are 1/4th the size of
a chunk or less may be used. Previously larger sizes, up to the size
of the chunk, were allowed.
* ASAN, MSAN, and the race detector are fully supported.
* Sets arena chunks to fault that were deferred at the end of mark
termination (a non-public patch once did this; I don't see a reason
not to continue that).
For #51317.
Change-Id: I83b1693a17302554cb36b6daa4e9249a81b1644f
Reviewed-on: https://go-review.googlesource.com/c/go/+/423359
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
2022-08-12 21:40:46 +00:00
|
|
|
if s.isUnusedUserArenaChunk() {
|
|
|
|
|
continue
|
|
|
|
|
}
|
2016-02-09 17:53:07 -05:00
|
|
|
if sizeclass := s.spanclass.sizeclass(); sizeclass == 0 {
|
2017-01-03 10:15:55 -07:00
|
|
|
slow.Mallocs++
|
|
|
|
|
slow.Alloc += uint64(s.elemsize)
|
|
|
|
|
} else {
|
|
|
|
|
slow.Mallocs += uint64(s.allocCount)
|
|
|
|
|
slow.Alloc += uint64(s.allocCount) * uint64(s.elemsize)
|
2016-02-09 17:53:07 -05:00
|
|
|
bySize[sizeclass].Mallocs += uint64(s.allocCount)
|
2017-01-03 10:15:55 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-08-04 17:29:03 +00:00
|
|
|
// Add in frees by just reading the stats for those directly.
|
|
|
|
|
var m heapStatsDelta
|
|
|
|
|
memstats.heapStats.unsafeRead(&m)
|
|
|
|
|
|
|
|
|
|
// Collect per-sizeclass free stats.
|
|
|
|
|
var smallFree uint64
|
|
|
|
|
for i := 0; i < _NumSizeClasses; i++ {
|
|
|
|
|
slow.Frees += uint64(m.smallFreeCount[i])
|
|
|
|
|
bySize[i].Frees += uint64(m.smallFreeCount[i])
|
|
|
|
|
bySize[i].Mallocs += uint64(m.smallFreeCount[i])
|
|
|
|
|
smallFree += uint64(m.smallFreeCount[i]) * uint64(class_to_size[i])
|
2017-01-03 10:15:55 -07:00
|
|
|
}
|
2021-04-21 23:50:58 +00:00
|
|
|
slow.Frees += uint64(m.tinyAllocCount) + uint64(m.largeFreeCount)
|
2017-01-03 10:15:55 -07:00
|
|
|
slow.Mallocs += slow.Frees
|
|
|
|
|
|
2020-08-04 17:29:03 +00:00
|
|
|
slow.TotalAlloc = slow.Alloc + uint64(m.largeFree) + smallFree
|
2017-01-03 10:15:55 -07:00
|
|
|
|
|
|
|
|
for i := range slow.BySize {
|
|
|
|
|
slow.BySize[i].Mallocs = bySize[i].Mallocs
|
|
|
|
|
slow.BySize[i].Frees = bySize[i].Frees
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-04 16:12:10 +00:00
|
|
|
for i := mheap_.pages.start; i < mheap_.pages.end; i++ {
|
2020-09-09 16:52:18 +00:00
|
|
|
chunk := mheap_.pages.tryChunkOf(i)
|
|
|
|
|
if chunk == nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
pg := chunk.scavenged.popcntRange(0, pallocChunkPages)
|
2019-09-04 16:12:10 +00:00
|
|
|
slow.HeapReleased += uint64(pg) * pageSize
|
2018-11-26 23:56:35 +00:00
|
|
|
}
|
2019-09-16 21:23:24 +00:00
|
|
|
for _, p := range allp {
|
2019-11-08 16:11:29 -05:00
|
|
|
pg := sys.OnesCount64(p.pcache.scav)
|
2019-09-16 21:23:24 +00:00
|
|
|
slow.HeapReleased += uint64(pg) * pageSize
|
|
|
|
|
}
|
2018-10-15 23:00:58 +00:00
|
|
|
|
2017-01-03 10:15:55 -07:00
|
|
|
getg().m.mallocing--
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
startTheWorld()
|
|
|
|
|
return
|
|
|
|
|
}
|
2017-03-10 10:59:39 -05:00
|
|
|
|
2023-04-26 10:07:02 +02:00
|
|
|
// ShrinkStackAndVerifyFramePointers attempts to shrink the stack of the current goroutine
|
|
|
|
|
// and verifies that unwinding the new stack doesn't crash, even if the old
|
|
|
|
|
// stack has been freed or reused (simulated via poisoning).
|
|
|
|
|
func ShrinkStackAndVerifyFramePointers() {
|
|
|
|
|
before := stackPoisonCopy
|
|
|
|
|
defer func() { stackPoisonCopy = before }()
|
|
|
|
|
stackPoisonCopy = 1
|
|
|
|
|
|
|
|
|
|
gp := getg()
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
shrinkstack(gp)
|
|
|
|
|
})
|
|
|
|
|
// If our new stack contains frame pointers into the old stack, this will
|
|
|
|
|
// crash because the old stack has been poisoned.
|
|
|
|
|
FPCallers(0, make([]uintptr, 1024))
|
|
|
|
|
}
|
|
|
|
|
|
2017-03-10 10:59:39 -05:00
|
|
|
// BlockOnSystemStack switches to the system stack, prints "x\n" to
|
|
|
|
|
// stderr, and blocks in a stack containing
|
|
|
|
|
// "runtime.blockOnSystemStackInternal".
|
|
|
|
|
func BlockOnSystemStack() {
|
|
|
|
|
systemstack(blockOnSystemStackInternal)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func blockOnSystemStackInternal() {
|
|
|
|
|
print("x\n")
|
|
|
|
|
lock(&deadlock)
|
|
|
|
|
lock(&deadlock)
|
|
|
|
|
}
|
2017-06-15 16:42:08 -07:00
|
|
|
|
|
|
|
|
type RWMutex struct {
|
|
|
|
|
rw rwmutex
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (rw *RWMutex) RLock() {
|
|
|
|
|
rw.rw.rlock()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (rw *RWMutex) RUnlock() {
|
|
|
|
|
rw.rw.runlock()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (rw *RWMutex) Lock() {
|
|
|
|
|
rw.rw.lock()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (rw *RWMutex) Unlock() {
|
|
|
|
|
rw.rw.unlock()
|
|
|
|
|
}
|
2017-09-01 12:32:38 -07:00
|
|
|
|
2018-01-29 21:40:57 +01:00
|
|
|
const RuntimeHmapSize = unsafe.Sizeof(hmap{})
|
|
|
|
|
|
2017-09-02 18:46:59 +02:00
|
|
|
func MapBucketsCount(m map[int]int) int {
|
2017-09-01 12:32:38 -07:00
|
|
|
h := *(**hmap)(unsafe.Pointer(&m))
|
|
|
|
|
return 1 << h.B
|
|
|
|
|
}
|
2017-06-14 11:46:35 -04:00
|
|
|
|
2017-09-02 18:46:59 +02:00
|
|
|
func MapBucketsPointerIsNil(m map[int]int) bool {
|
|
|
|
|
h := *(**hmap)(unsafe.Pointer(&m))
|
|
|
|
|
return h.buckets == nil
|
|
|
|
|
}
|
|
|
|
|
|
2017-06-14 11:46:35 -04:00
|
|
|
func LockOSCounts() (external, internal uint32) {
|
2022-07-20 13:18:06 -04:00
|
|
|
gp := getg()
|
|
|
|
|
if gp.m.lockedExt+gp.m.lockedInt == 0 {
|
|
|
|
|
if gp.lockedm != 0 {
|
2017-06-14 11:46:35 -04:00
|
|
|
panic("lockedm on non-locked goroutine")
|
|
|
|
|
}
|
|
|
|
|
} else {
|
2022-07-20 13:18:06 -04:00
|
|
|
if gp.lockedm == 0 {
|
2017-06-14 11:46:35 -04:00
|
|
|
panic("nil lockedm on locked goroutine")
|
|
|
|
|
}
|
|
|
|
|
}
|
2022-07-20 13:18:06 -04:00
|
|
|
return gp.m.lockedExt, gp.m.lockedInt
|
2017-06-14 11:46:35 -04:00
|
|
|
}
|
2017-10-27 15:20:21 -04:00
|
|
|
|
|
|
|
|
//go:noinline
|
|
|
|
|
func TracebackSystemstack(stk []uintptr, i int) int {
|
|
|
|
|
if i == 0 {
|
2018-04-26 14:06:08 -04:00
|
|
|
pc, sp := getcallerpc(), getcallersp()
|
2023-02-14 12:25:11 -05:00
|
|
|
var u unwinder
|
|
|
|
|
u.initAt(pc, sp, 0, getg(), unwindJumpStack) // Don't ignore errors, for testing
|
|
|
|
|
return tracebackPCs(&u, 0, stk)
|
2017-10-27 15:20:21 -04:00
|
|
|
}
|
|
|
|
|
n := 0
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
n = TracebackSystemstack(stk, i-1)
|
|
|
|
|
})
|
|
|
|
|
return n
|
|
|
|
|
}
|
runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
growing. In particular it eliminates any possibility of panicking
with an address space conflict. This can happen for many reasons and
even causes a low but steady rate of TSAN test failures because of
conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
because creating huge address space reservations (particularly on
64-bit) led to huge process VSIZE. This was at best confusing and at
worst conflicted badly with ulimit -v. However, the non-reserved
heap logic is complicated, can race with other mappings in non-pure
Go binaries (e.g., #18976), and requires that the entire heap be
either reserved or non-reserved. We currently maintain the latter
property, but it's quite difficult to convince yourself of that, and
hence difficult to keep correct. This logic is still present, but
will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
space leads to mapping huge (and never-to-be-used) metadata
structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
#20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name old time/op new time/op delta
Template 184ms ± 1% 185ms ± 1% ~ (p=0.065 n=10+9)
Unicode 86.9ms ± 3% 86.3ms ± 1% ~ (p=0.631 n=10+10)
GoTypes 599ms ± 0% 602ms ± 0% +0.56% (p=0.000 n=10+9)
Compiler 2.87s ± 1% 2.89s ± 1% +0.51% (p=0.002 n=9+10)
SSA 7.29s ± 1% 7.25s ± 1% ~ (p=0.182 n=10+9)
Flate 118ms ± 2% 118ms ± 1% ~ (p=0.113 n=9+9)
GoParser 147ms ± 1% 148ms ± 1% +1.07% (p=0.003 n=9+10)
Reflect 401ms ± 1% 404ms ± 1% +0.71% (p=0.003 n=10+9)
Tar 175ms ± 1% 175ms ± 1% ~ (p=0.604 n=9+10)
XML 209ms ± 1% 210ms ± 1% ~ (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name old time/op new time/op delta
Garbage/benchmem-MB=64-12 2.23ms ± 1% 2.25ms ± 1% +0.84% (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name old time/op new time/op delta
Template 183ms ± 1% 185ms ± 1% +1.32% (p=0.000 n=9+9)
Unicode 84.9ms ± 2% 86.3ms ± 1% +1.65% (p=0.000 n=9+10)
GoTypes 595ms ± 1% 602ms ± 0% +1.19% (p=0.000 n=9+9)
Compiler 2.86s ± 0% 2.89s ± 1% +0.91% (p=0.000 n=9+10)
SSA 7.19s ± 0% 7.25s ± 1% +0.75% (p=0.000 n=8+9)
Flate 117ms ± 1% 118ms ± 1% +1.10% (p=0.000 n=10+9)
GoParser 146ms ± 2% 148ms ± 1% +1.48% (p=0.002 n=10+10)
Reflect 398ms ± 1% 404ms ± 1% +1.51% (p=0.000 n=10+9)
Tar 173ms ± 1% 175ms ± 1% +1.17% (p=0.000 n=10+10)
XML 208ms ± 1% 210ms ± 1% +0.62% (p=0.011 n=10+10)
[Geo mean] 369ms 373ms +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name old time/op new time/op delta
Garbage/benchmem-MB=64-12 2.22ms ± 1% 2.25ms ± 1% +1.51% (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
2017-12-19 22:05:23 -08:00
|
|
|
|
|
|
|
|
func KeepNArenaHints(n int) {
|
|
|
|
|
hint := mheap_.arenaHints
|
|
|
|
|
for i := 1; i < n; i++ {
|
|
|
|
|
hint = hint.next
|
|
|
|
|
if hint == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
hint.next = nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// MapNextArenaHint reserves a page at the next arena growth hint,
|
|
|
|
|
// preventing the arena from growing there, and returns the range of
|
|
|
|
|
// addresses that are no longer viable.
|
2022-11-01 10:00:25 -04:00
|
|
|
//
|
|
|
|
|
// This may fail to reserve memory. If it fails, it still returns the
|
|
|
|
|
// address range it attempted to reserve.
|
|
|
|
|
func MapNextArenaHint() (start, end uintptr, ok bool) {
|
runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
growing. In particular it eliminates any possibility of panicking
with an address space conflict. This can happen for many reasons and
even causes a low but steady rate of TSAN test failures because of
conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
because creating huge address space reservations (particularly on
64-bit) led to huge process VSIZE. This was at best confusing and at
worst conflicted badly with ulimit -v. However, the non-reserved
heap logic is complicated, can race with other mappings in non-pure
Go binaries (e.g., #18976), and requires that the entire heap be
either reserved or non-reserved. We currently maintain the latter
property, but it's quite difficult to convince yourself of that, and
hence difficult to keep correct. This logic is still present, but
will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
space leads to mapping huge (and never-to-be-used) metadata
structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
#20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name old time/op new time/op delta
Template 184ms ± 1% 185ms ± 1% ~ (p=0.065 n=10+9)
Unicode 86.9ms ± 3% 86.3ms ± 1% ~ (p=0.631 n=10+10)
GoTypes 599ms ± 0% 602ms ± 0% +0.56% (p=0.000 n=10+9)
Compiler 2.87s ± 1% 2.89s ± 1% +0.51% (p=0.002 n=9+10)
SSA 7.29s ± 1% 7.25s ± 1% ~ (p=0.182 n=10+9)
Flate 118ms ± 2% 118ms ± 1% ~ (p=0.113 n=9+9)
GoParser 147ms ± 1% 148ms ± 1% +1.07% (p=0.003 n=9+10)
Reflect 401ms ± 1% 404ms ± 1% +0.71% (p=0.003 n=10+9)
Tar 175ms ± 1% 175ms ± 1% ~ (p=0.604 n=9+10)
XML 209ms ± 1% 210ms ± 1% ~ (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name old time/op new time/op delta
Garbage/benchmem-MB=64-12 2.23ms ± 1% 2.25ms ± 1% +0.84% (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name old time/op new time/op delta
Template 183ms ± 1% 185ms ± 1% +1.32% (p=0.000 n=9+9)
Unicode 84.9ms ± 2% 86.3ms ± 1% +1.65% (p=0.000 n=9+10)
GoTypes 595ms ± 1% 602ms ± 0% +1.19% (p=0.000 n=9+9)
Compiler 2.86s ± 0% 2.89s ± 1% +0.91% (p=0.000 n=9+10)
SSA 7.19s ± 0% 7.25s ± 1% +0.75% (p=0.000 n=8+9)
Flate 117ms ± 1% 118ms ± 1% +1.10% (p=0.000 n=10+9)
GoParser 146ms ± 2% 148ms ± 1% +1.48% (p=0.002 n=10+10)
Reflect 398ms ± 1% 404ms ± 1% +1.51% (p=0.000 n=10+9)
Tar 173ms ± 1% 175ms ± 1% +1.17% (p=0.000 n=10+10)
XML 208ms ± 1% 210ms ± 1% +0.62% (p=0.011 n=10+10)
[Geo mean] 369ms 373ms +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name old time/op new time/op delta
Garbage/benchmem-MB=64-12 2.22ms ± 1% 2.25ms ± 1% +1.51% (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
2017-12-19 22:05:23 -08:00
|
|
|
hint := mheap_.arenaHints
|
|
|
|
|
addr := hint.addr
|
|
|
|
|
if hint.down {
|
|
|
|
|
start, end = addr-heapArenaBytes, addr
|
|
|
|
|
addr -= physPageSize
|
|
|
|
|
} else {
|
|
|
|
|
start, end = addr, addr+heapArenaBytes
|
|
|
|
|
}
|
2022-11-01 10:00:25 -04:00
|
|
|
got := sysReserve(unsafe.Pointer(addr), physPageSize)
|
|
|
|
|
ok = (addr == uintptr(got))
|
|
|
|
|
if !ok {
|
|
|
|
|
// We were unable to get the requested reservation.
|
|
|
|
|
// Release what we did get and fail.
|
|
|
|
|
sysFreeOS(got, physPageSize)
|
|
|
|
|
}
|
runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
growing. In particular it eliminates any possibility of panicking
with an address space conflict. This can happen for many reasons and
even causes a low but steady rate of TSAN test failures because of
conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
because creating huge address space reservations (particularly on
64-bit) led to huge process VSIZE. This was at best confusing and at
worst conflicted badly with ulimit -v. However, the non-reserved
heap logic is complicated, can race with other mappings in non-pure
Go binaries (e.g., #18976), and requires that the entire heap be
either reserved or non-reserved. We currently maintain the latter
property, but it's quite difficult to convince yourself of that, and
hence difficult to keep correct. This logic is still present, but
will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
space leads to mapping huge (and never-to-be-used) metadata
structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
#20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name old time/op new time/op delta
Template 184ms ± 1% 185ms ± 1% ~ (p=0.065 n=10+9)
Unicode 86.9ms ± 3% 86.3ms ± 1% ~ (p=0.631 n=10+10)
GoTypes 599ms ± 0% 602ms ± 0% +0.56% (p=0.000 n=10+9)
Compiler 2.87s ± 1% 2.89s ± 1% +0.51% (p=0.002 n=9+10)
SSA 7.29s ± 1% 7.25s ± 1% ~ (p=0.182 n=10+9)
Flate 118ms ± 2% 118ms ± 1% ~ (p=0.113 n=9+9)
GoParser 147ms ± 1% 148ms ± 1% +1.07% (p=0.003 n=9+10)
Reflect 401ms ± 1% 404ms ± 1% +0.71% (p=0.003 n=10+9)
Tar 175ms ± 1% 175ms ± 1% ~ (p=0.604 n=9+10)
XML 209ms ± 1% 210ms ± 1% ~ (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name old time/op new time/op delta
Garbage/benchmem-MB=64-12 2.23ms ± 1% 2.25ms ± 1% +0.84% (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name old time/op new time/op delta
Template 183ms ± 1% 185ms ± 1% +1.32% (p=0.000 n=9+9)
Unicode 84.9ms ± 2% 86.3ms ± 1% +1.65% (p=0.000 n=9+10)
GoTypes 595ms ± 1% 602ms ± 0% +1.19% (p=0.000 n=9+9)
Compiler 2.86s ± 0% 2.89s ± 1% +0.91% (p=0.000 n=9+10)
SSA 7.19s ± 0% 7.25s ± 1% +0.75% (p=0.000 n=8+9)
Flate 117ms ± 1% 118ms ± 1% +1.10% (p=0.000 n=10+9)
GoParser 146ms ± 2% 148ms ± 1% +1.48% (p=0.002 n=10+10)
Reflect 398ms ± 1% 404ms ± 1% +1.51% (p=0.000 n=10+9)
Tar 173ms ± 1% 175ms ± 1% +1.17% (p=0.000 n=10+10)
XML 208ms ± 1% 210ms ± 1% +0.62% (p=0.011 n=10+10)
[Geo mean] 369ms 373ms +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name old time/op new time/op delta
Garbage/benchmem-MB=64-12 2.22ms ± 1% 2.25ms ± 1% +1.51% (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
2017-12-19 22:05:23 -08:00
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func GetNextArenaHint() uintptr {
|
|
|
|
|
return mheap_.arenaHints.addr
|
|
|
|
|
}
|
2018-03-09 08:24:10 -08:00
|
|
|
|
|
|
|
|
type G = g
|
2018-04-26 21:43:19 -04:00
|
|
|
|
2020-01-14 19:13:47 +00:00
|
|
|
type Sudog = sudog
|
|
|
|
|
|
2018-04-26 21:43:19 -04:00
|
|
|
func Getg() *G {
|
|
|
|
|
return getg()
|
|
|
|
|
}
|
2018-06-28 16:45:28 -07:00
|
|
|
|
2022-09-28 14:44:56 -04:00
|
|
|
func Goid() uint64 {
|
|
|
|
|
return getg().goid
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-31 21:34:23 +00:00
|
|
|
func GIsWaitingOnMutex(gp *G) bool {
|
|
|
|
|
return readgstatus(gp) == _Gwaiting && gp.waitreason.isMutexWait()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
var CasGStatusAlwaysTrack = &casgstatusAlwaysTrack
|
|
|
|
|
|
2018-06-28 16:45:28 -07:00
|
|
|
//go:noinline
|
|
|
|
|
func PanicForTesting(b []byte, i int) byte {
|
|
|
|
|
return unexportedPanicForTesting(b, i)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//go:noinline
|
|
|
|
|
func unexportedPanicForTesting(b []byte, i int) byte {
|
|
|
|
|
return b[i]
|
|
|
|
|
}
|
2018-06-25 18:00:43 -04:00
|
|
|
|
|
|
|
|
func G0StackOverflow() {
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
stackOverflow(nil)
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func stackOverflow(x *byte) {
|
|
|
|
|
var buf [256]byte
|
|
|
|
|
stackOverflow(&buf[0])
|
|
|
|
|
}
|
2018-10-15 17:24:21 -07:00
|
|
|
|
|
|
|
|
func MapTombstoneCheck(m map[int]int) {
|
|
|
|
|
// Make sure emptyOne and emptyRest are distributed correctly.
|
|
|
|
|
// We should have a series of filled and emptyOne cells, followed by
|
|
|
|
|
// a series of emptyRest cells.
|
|
|
|
|
h := *(**hmap)(unsafe.Pointer(&m))
|
2021-12-01 12:15:45 -05:00
|
|
|
i := any(m)
|
2018-10-15 17:24:21 -07:00
|
|
|
t := *(**maptype)(unsafe.Pointer(&i))
|
|
|
|
|
|
|
|
|
|
for x := 0; x < 1<<h.B; x++ {
|
|
|
|
|
b0 := (*bmap)(add(h.buckets, uintptr(x)*uintptr(t.bucketsize)))
|
|
|
|
|
n := 0
|
|
|
|
|
for b := b0; b != nil; b = b.overflow(t) {
|
|
|
|
|
for i := 0; i < bucketCnt; i++ {
|
|
|
|
|
if b.tophash[i] != emptyRest {
|
|
|
|
|
n++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
k := 0
|
|
|
|
|
for b := b0; b != nil; b = b.overflow(t) {
|
|
|
|
|
for i := 0; i < bucketCnt; i++ {
|
|
|
|
|
if k < n && b.tophash[i] == emptyRest {
|
|
|
|
|
panic("early emptyRest")
|
|
|
|
|
}
|
|
|
|
|
if k >= n && b.tophash[i] != emptyRest {
|
|
|
|
|
panic("late non-emptyRest")
|
|
|
|
|
}
|
|
|
|
|
if k == n-1 && b.tophash[i] == emptyOne {
|
|
|
|
|
panic("last non-emptyRest entry is emptyOne")
|
|
|
|
|
}
|
|
|
|
|
k++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-02-09 00:13:37 +00:00
|
|
|
|
2019-07-16 11:33:10 -04:00
|
|
|
func RunGetgThreadSwitchTest() {
|
|
|
|
|
// Test that getg works correctly with thread switch.
|
|
|
|
|
// With gccgo, if we generate getg inlined, the backend
|
|
|
|
|
// may cache the address of the TLS variable, which
|
|
|
|
|
// will become invalid after a thread switch. This test
|
|
|
|
|
// checks that the bad caching doesn't happen.
|
|
|
|
|
|
|
|
|
|
ch := make(chan int)
|
|
|
|
|
go func(ch chan int) {
|
|
|
|
|
ch <- 5
|
|
|
|
|
LockOSThread()
|
|
|
|
|
}(ch)
|
|
|
|
|
|
|
|
|
|
g1 := getg()
|
|
|
|
|
|
|
|
|
|
// Block on a receive. This is likely to get us a thread
|
|
|
|
|
// switch. If we yield to the sender goroutine, it will
|
|
|
|
|
// lock the thread, forcing us to resume on a different
|
|
|
|
|
// thread.
|
|
|
|
|
<-ch
|
|
|
|
|
|
|
|
|
|
g2 := getg()
|
|
|
|
|
if g1 != g2 {
|
|
|
|
|
panic("g1 != g2")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Also test getg after some control flow, as the
|
|
|
|
|
// backend is sensitive to control flow.
|
|
|
|
|
g3 := getg()
|
|
|
|
|
if g1 != g3 {
|
|
|
|
|
panic("g1 != g3")
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-08-12 19:08:39 +00:00
|
|
|
|
|
|
|
|
const (
|
2019-08-21 00:24:25 +00:00
|
|
|
PageSize = pageSize
|
2019-08-12 19:08:39 +00:00
|
|
|
PallocChunkPages = pallocChunkPages
|
2019-11-18 19:23:39 +00:00
|
|
|
PageAlloc64Bit = pageAlloc64Bit
|
2020-01-28 19:59:19 +00:00
|
|
|
PallocSumBytes = pallocSumBytes
|
2019-08-12 19:08:39 +00:00
|
|
|
)
|
|
|
|
|
|
2019-09-25 15:55:29 +00:00
|
|
|
// Expose pallocSum for testing.
|
|
|
|
|
type PallocSum pallocSum
|
|
|
|
|
|
|
|
|
|
func PackPallocSum(start, max, end uint) PallocSum { return PallocSum(packPallocSum(start, max, end)) }
|
|
|
|
|
func (m PallocSum) Start() uint { return pallocSum(m).start() }
|
|
|
|
|
func (m PallocSum) Max() uint { return pallocSum(m).max() }
|
|
|
|
|
func (m PallocSum) End() uint { return pallocSum(m).end() }
|
|
|
|
|
|
2019-08-12 19:08:39 +00:00
|
|
|
// Expose pallocBits for testing.
|
|
|
|
|
type PallocBits pallocBits
|
|
|
|
|
|
|
|
|
|
func (b *PallocBits) Find(npages uintptr, searchIdx uint) (uint, uint) {
|
|
|
|
|
return (*pallocBits)(b).find(npages, searchIdx)
|
|
|
|
|
}
|
2019-09-10 18:53:51 +00:00
|
|
|
func (b *PallocBits) AllocRange(i, n uint) { (*pallocBits)(b).allocRange(i, n) }
|
|
|
|
|
func (b *PallocBits) Free(i, n uint) { (*pallocBits)(b).free(i, n) }
|
|
|
|
|
func (b *PallocBits) Summarize() PallocSum { return PallocSum((*pallocBits)(b).summarize()) }
|
|
|
|
|
func (b *PallocBits) PopcntRange(i, n uint) uint { return (*pageBits)(b).popcntRange(i, n) }
|
2019-09-25 15:55:29 +00:00
|
|
|
|
|
|
|
|
// SummarizeSlow is a slow but more obviously correct implementation
|
|
|
|
|
// of (*pallocBits).summarize. Used for testing.
|
|
|
|
|
func SummarizeSlow(b *PallocBits) PallocSum {
|
|
|
|
|
var start, max, end uint
|
|
|
|
|
|
|
|
|
|
const N = uint(len(b)) * 64
|
|
|
|
|
for start < N && (*pageBits)(b).get(start) == 0 {
|
|
|
|
|
start++
|
|
|
|
|
}
|
|
|
|
|
for end < N && (*pageBits)(b).get(N-end-1) == 0 {
|
|
|
|
|
end++
|
|
|
|
|
}
|
|
|
|
|
run := uint(0)
|
|
|
|
|
for i := uint(0); i < N; i++ {
|
|
|
|
|
if (*pageBits)(b).get(i) == 0 {
|
|
|
|
|
run++
|
|
|
|
|
} else {
|
|
|
|
|
run = 0
|
|
|
|
|
}
|
|
|
|
|
if run > max {
|
|
|
|
|
max = run
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return PackPallocSum(start, max, end)
|
|
|
|
|
}
|
2019-08-12 19:08:39 +00:00
|
|
|
|
|
|
|
|
// Expose non-trivial helpers for testing.
|
|
|
|
|
func FindBitRange64(c uint64, n uint) uint { return findBitRange64(c, n) }
|
|
|
|
|
|
|
|
|
|
// Given two PallocBits, returns a set of bit ranges where
|
|
|
|
|
// they differ.
|
|
|
|
|
func DiffPallocBits(a, b *PallocBits) []BitRange {
|
|
|
|
|
ba := (*pageBits)(a)
|
|
|
|
|
bb := (*pageBits)(b)
|
|
|
|
|
|
|
|
|
|
var d []BitRange
|
|
|
|
|
base, size := uint(0), uint(0)
|
|
|
|
|
for i := uint(0); i < uint(len(ba))*64; i++ {
|
|
|
|
|
if ba.get(i) != bb.get(i) {
|
|
|
|
|
if size == 0 {
|
|
|
|
|
base = i
|
|
|
|
|
}
|
|
|
|
|
size++
|
|
|
|
|
} else {
|
|
|
|
|
if size != 0 {
|
|
|
|
|
d = append(d, BitRange{base, size})
|
|
|
|
|
}
|
|
|
|
|
size = 0
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if size != 0 {
|
|
|
|
|
d = append(d, BitRange{base, size})
|
|
|
|
|
}
|
|
|
|
|
return d
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// StringifyPallocBits gets the bits in the bit range r from b,
|
|
|
|
|
// and returns a string containing the bits as ASCII 0 and 1
|
|
|
|
|
// characters.
|
|
|
|
|
func StringifyPallocBits(b *PallocBits, r BitRange) string {
|
|
|
|
|
str := ""
|
|
|
|
|
for j := r.I; j < r.I+r.N; j++ {
|
|
|
|
|
if (*pageBits)(b).get(j) != 0 {
|
|
|
|
|
str += "1"
|
|
|
|
|
} else {
|
|
|
|
|
str += "0"
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return str
|
|
|
|
|
}
|
|
|
|
|
|
2019-08-21 00:24:25 +00:00
|
|
|
// Expose pallocData for testing.
|
|
|
|
|
type PallocData pallocData
|
|
|
|
|
|
|
|
|
|
func (d *PallocData) FindScavengeCandidate(searchIdx uint, min, max uintptr) (uint, uint) {
|
|
|
|
|
return (*pallocData)(d).findScavengeCandidate(searchIdx, min, max)
|
|
|
|
|
}
|
|
|
|
|
func (d *PallocData) AllocRange(i, n uint) { (*pallocData)(d).allocRange(i, n) }
|
|
|
|
|
func (d *PallocData) ScavengedSetRange(i, n uint) {
|
|
|
|
|
(*pallocData)(d).scavenged.setRange(i, n)
|
|
|
|
|
}
|
|
|
|
|
func (d *PallocData) PallocBits() *PallocBits {
|
|
|
|
|
return (*PallocBits)(&(*pallocData)(d).pallocBits)
|
|
|
|
|
}
|
|
|
|
|
func (d *PallocData) Scavenged() *PallocBits {
|
|
|
|
|
return (*PallocBits)(&(*pallocData)(d).scavenged)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Expose fillAligned for testing.
|
|
|
|
|
func FillAligned(x uint64, m uint) uint64 { return fillAligned(x, m) }
|
|
|
|
|
|
2019-09-18 17:51:16 +00:00
|
|
|
// Expose pageCache for testing.
|
|
|
|
|
type PageCache pageCache
|
|
|
|
|
|
|
|
|
|
const PageCachePages = pageCachePages
|
|
|
|
|
|
|
|
|
|
func NewPageCache(base uintptr, cache, scav uint64) PageCache {
|
|
|
|
|
return PageCache(pageCache{base: base, cache: cache, scav: scav})
|
|
|
|
|
}
|
|
|
|
|
func (c *PageCache) Empty() bool { return (*pageCache)(c).empty() }
|
|
|
|
|
func (c *PageCache) Base() uintptr { return (*pageCache)(c).base }
|
|
|
|
|
func (c *PageCache) Cache() uint64 { return (*pageCache)(c).cache }
|
|
|
|
|
func (c *PageCache) Scav() uint64 { return (*pageCache)(c).scav }
|
|
|
|
|
func (c *PageCache) Alloc(npages uintptr) (uintptr, uintptr) {
|
|
|
|
|
return (*pageCache)(c).alloc(npages)
|
|
|
|
|
}
|
|
|
|
|
func (c *PageCache) Flush(s *PageAlloc) {
|
2020-08-21 11:59:55 -04:00
|
|
|
cp := (*pageCache)(c)
|
|
|
|
|
sp := (*pageAlloc)(s)
|
|
|
|
|
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
// None of the tests need any higher-level locking, so we just
|
|
|
|
|
// take the lock internally.
|
|
|
|
|
lock(sp.mheapLock)
|
|
|
|
|
cp.flush(sp)
|
|
|
|
|
unlock(sp.mheapLock)
|
|
|
|
|
})
|
2019-09-18 17:51:16 +00:00
|
|
|
}
|
|
|
|
|
|
2019-08-14 16:32:12 +00:00
|
|
|
// Expose chunk index type.
|
|
|
|
|
type ChunkIdx chunkIdx
|
|
|
|
|
|
|
|
|
|
// Expose pageAlloc for testing. Note that because pageAlloc is
|
|
|
|
|
// not in the heap, so is PageAlloc.
|
|
|
|
|
type PageAlloc pageAlloc
|
|
|
|
|
|
2019-09-10 18:53:51 +00:00
|
|
|
func (p *PageAlloc) Alloc(npages uintptr) (uintptr, uintptr) {
|
2020-08-21 11:59:55 -04:00
|
|
|
pp := (*pageAlloc)(p)
|
|
|
|
|
|
|
|
|
|
var addr, scav uintptr
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
// None of the tests need any higher-level locking, so we just
|
|
|
|
|
// take the lock internally.
|
|
|
|
|
lock(pp.mheapLock)
|
|
|
|
|
addr, scav = pp.alloc(npages)
|
|
|
|
|
unlock(pp.mheapLock)
|
|
|
|
|
})
|
|
|
|
|
return addr, scav
|
2019-09-10 18:53:51 +00:00
|
|
|
}
|
2019-09-18 17:51:16 +00:00
|
|
|
func (p *PageAlloc) AllocToCache() PageCache {
|
2020-08-21 11:59:55 -04:00
|
|
|
pp := (*pageAlloc)(p)
|
|
|
|
|
|
|
|
|
|
var c PageCache
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
// None of the tests need any higher-level locking, so we just
|
|
|
|
|
// take the lock internally.
|
|
|
|
|
lock(pp.mheapLock)
|
|
|
|
|
c = PageCache(pp.allocToCache())
|
|
|
|
|
unlock(pp.mheapLock)
|
|
|
|
|
})
|
|
|
|
|
return c
|
2019-09-18 17:51:16 +00:00
|
|
|
}
|
2019-09-10 18:53:51 +00:00
|
|
|
func (p *PageAlloc) Free(base, npages uintptr) {
|
2020-08-21 11:59:55 -04:00
|
|
|
pp := (*pageAlloc)(p)
|
|
|
|
|
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
// None of the tests need any higher-level locking, so we just
|
|
|
|
|
// take the lock internally.
|
|
|
|
|
lock(pp.mheapLock)
|
runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not
using heuristics based on density.
The motivation behind this change is twofold:
1. In default Linux configurations, khugepaged can recoalesce hugepages
even after the scavenger breaks them up, resulting in significant
overheads for small heaps when their heaps shrink.
2. The Go runtime already has some heuristics about this, but those
heuristics appear to have bit-rotted and result in haphazard
hugepage management. Unlucky (but otherwise fairly dense) regions of
memory end up not backed by huge pages while sparse regions end up
accidentally marked MADV_HUGEPAGE and are not later broken up by the
scavenger, because it already got the memory it needed from more
dense sections (this is more likely to happen with small heaps that
go idle).
In this change, the runtime uses a new policy:
1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.
This policy is intended to try and back memory that is a good candidate
for huge pages (high occupancy) with huge pages, and give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also occupancy in the near future.
It's generally true that by the end of a GC cycle the heap gets quite
dense (from the perspective of the page allocator).
Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging in order to
break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger avoids
releasing memory in dense page chunks. All this together means the
scavenger will now more generally release memory on a ~1 GC cycle delay.
Notably this has implications for scavenging to maintain the memory
limit and the runtime/debug.FreeOSMemory API. This change makes it so
that in these cases all memory is visible to the scavenger regardless of
sparseness and delays the page allocator in re-marking this memory with
MADV_NOHUGEPAGE for around 1 GC cycle to mitigate churn.
The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.
Fixes #55328.
Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-09-23 16:32:34 +00:00
|
|
|
pp.free(base, npages)
|
2020-08-21 11:59:55 -04:00
|
|
|
unlock(pp.mheapLock)
|
|
|
|
|
})
|
2019-09-10 18:53:51 +00:00
|
|
|
}
|
2019-08-14 16:32:12 +00:00
|
|
|
func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) {
|
|
|
|
|
return ChunkIdx((*pageAlloc)(p).start), ChunkIdx((*pageAlloc)(p).end)
|
|
|
|
|
}
|
runtime: don't hold the heap lock while scavenging
This change modifies the scavenger to no longer hold the heap lock while
actively scavenging pages. To achieve this, the change also:
* Reverses the locking behavior of the (*pageAlloc).scavenge API, to
only acquire the heap lock when necessary.
* Introduces a new lock on the scavenger-related fields in a pageAlloc
so that access to those fields doesn't require the heap lock. There
are a few places in the scavenge path, notably reservation, that
requires synchronization. The heap lock is far too heavy handed for
this case.
* Changes the scavenger to marks pages that are actively being scavenged
as allocated, and "frees" them back to the page allocator the usual
way.
* Lifts the heap-growth scavenging code out of mheap.grow, where the
heap lock is held, and into allocSpan, just after the lock is
released. Releasing the lock during mheap.grow is not feasible if we
want to ensure that allocation always makes progress (post-growth,
another allocator could come in and take all that space, forcing the
goroutine that just grew the heap to do so again).
This change means that the scavenger now must do more work for each
scavenge, but it is also now much more scalable. Although in theory it's
not great by always taking the locked paths in the page allocator, it
takes advantage of some properties of the allocator:
* Most of the time, the scavenger will be working with one page at a
time. The page allocator's locked path is optimized for this case.
* On the allocation path, it doesn't need to do the find operation at
all; it can go straight to setting bits for the range and updating the
summary structure.
Change-Id: Ie941d5e7c05dcc96476795c63fef74bcafc2a0f1
Reviewed-on: https://go-review.googlesource.com/c/go/+/353974
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
2021-10-04 20:36:49 +00:00
|
|
|
func (p *PageAlloc) Scavenge(nbytes uintptr) (r uintptr) {
|
2019-11-21 17:05:14 +00:00
|
|
|
pp := (*pageAlloc)(p)
|
2019-08-21 00:24:25 +00:00
|
|
|
systemstack(func() {
|
runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not
using heuristics based on density.
The motivation behind this change is twofold:
1. In default Linux configurations, khugepaged can recoalesce hugepages
even after the scavenger breaks them up, resulting in significant
overheads for small heaps when their heaps shrink.
2. The Go runtime already has some heuristics about this, but those
heuristics appear to have bit-rotted and result in haphazard
hugepage management. Unlucky (but otherwise fairly dense) regions of
memory end up not backed by huge pages while sparse regions end up
accidentally marked MADV_HUGEPAGE and are not later broken up by the
scavenger, because it already got the memory it needed from more
dense sections (this is more likely to happen with small heaps that
go idle).
In this change, the runtime uses a new policy:
1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.
This policy is intended to try and back memory that is a good candidate
for huge pages (high occupancy) with huge pages, and give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also occupancy in the near future.
It's generally true that by the end of a GC cycle the heap gets quite
dense (from the perspective of the page allocator).
Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging in order to
break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger avoids
releasing memory in dense page chunks. All this together means the
scavenger will now more generally release memory on a ~1 GC cycle delay.
Notably this has implications for scavenging to maintain the memory
limit and the runtime/debug.FreeOSMemory API. This change makes it so
that in these cases all memory is visible to the scavenger regardless of
sparseness and delays the page allocator in re-marking this memory with
MADV_NOHUGEPAGE for around 1 GC cycle to mitigate churn.
The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.
Fixes #55328.
Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-09-23 16:32:34 +00:00
|
|
|
r = pp.scavenge(nbytes, nil, true)
|
2019-08-21 00:24:25 +00:00
|
|
|
})
|
|
|
|
|
return
|
2019-08-14 16:32:12 +00:00
|
|
|
}
|
2019-11-15 23:30:30 +00:00
|
|
|
func (p *PageAlloc) InUse() []AddrRange {
|
|
|
|
|
ranges := make([]AddrRange, 0, len(p.inUse.ranges))
|
|
|
|
|
for _, r := range p.inUse.ranges {
|
2020-07-14 20:27:27 +00:00
|
|
|
ranges = append(ranges, AddrRange{r})
|
2019-11-15 23:30:30 +00:00
|
|
|
}
|
|
|
|
|
return ranges
|
|
|
|
|
}
|
2019-08-14 16:32:12 +00:00
|
|
|
|
2019-11-14 23:58:50 +00:00
|
|
|
// Returns nil if the PallocData's L2 is missing.
|
|
|
|
|
func (p *PageAlloc) PallocData(i ChunkIdx) *PallocData {
|
|
|
|
|
ci := chunkIdx(i)
|
2020-09-09 16:52:18 +00:00
|
|
|
return (*PallocData)((*pageAlloc)(p).tryChunkOf(ci))
|
2019-11-14 23:58:50 +00:00
|
|
|
}
|
|
|
|
|
|
2020-07-14 20:27:27 +00:00
|
|
|
// AddrRange is a wrapper around addrRange for testing.
|
2019-11-15 23:30:30 +00:00
|
|
|
type AddrRange struct {
|
2020-07-14 20:27:27 +00:00
|
|
|
addrRange
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// MakeAddrRange creates a new address range.
|
|
|
|
|
func MakeAddrRange(base, limit uintptr) AddrRange {
|
|
|
|
|
return AddrRange{makeAddrRange(base, limit)}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Base returns the virtual base address of the address range.
|
|
|
|
|
func (a AddrRange) Base() uintptr {
|
|
|
|
|
return a.addrRange.base.addr()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Base returns the virtual address of the limit of the address range.
|
|
|
|
|
func (a AddrRange) Limit() uintptr {
|
|
|
|
|
return a.addrRange.limit.addr()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Equals returns true if the two address ranges are exactly equal.
|
|
|
|
|
func (a AddrRange) Equals(b AddrRange) bool {
|
|
|
|
|
return a == b
|
2019-11-15 23:30:30 +00:00
|
|
|
}
|
|
|
|
|
|
2020-07-14 21:41:12 +00:00
|
|
|
// Size returns the size in bytes of the address range.
|
|
|
|
|
func (a AddrRange) Size() uintptr {
|
|
|
|
|
return a.addrRange.size()
|
|
|
|
|
}
|
|
|
|
|
|
runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
2022-03-15 02:48:18 +00:00
|
|
|
// testSysStat is the sysStat passed to test versions of various
|
|
|
|
|
// runtime structures. We do actually have to keep track of this
|
|
|
|
|
// because otherwise memstats.mappedReady won't actually line up
|
|
|
|
|
// with other stats in the runtime during tests.
|
|
|
|
|
var testSysStat = &memstats.other_sys
|
|
|
|
|
|
2020-07-14 21:39:52 +00:00
|
|
|
// AddrRanges is a wrapper around addrRanges for testing.
|
|
|
|
|
type AddrRanges struct {
|
|
|
|
|
addrRanges
|
2020-07-14 21:41:12 +00:00
|
|
|
mutable bool
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// NewAddrRanges creates a new empty addrRanges.
|
|
|
|
|
//
|
|
|
|
|
// Note that this initializes addrRanges just like in the
|
|
|
|
|
// runtime, so its memory is persistentalloc'd. Call this
|
|
|
|
|
// function sparingly since the memory it allocates is
|
|
|
|
|
// leaked.
|
|
|
|
|
//
|
|
|
|
|
// This AddrRanges is mutable, so we can test methods like
|
|
|
|
|
// Add.
|
|
|
|
|
func NewAddrRanges() AddrRanges {
|
|
|
|
|
r := addrRanges{}
|
runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
2022-03-15 02:48:18 +00:00
|
|
|
r.init(testSysStat)
|
2020-07-14 21:41:12 +00:00
|
|
|
return AddrRanges{r, true}
|
2020-07-14 21:39:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// MakeAddrRanges creates a new addrRanges populated with
|
|
|
|
|
// the ranges in a.
|
2020-07-14 21:41:12 +00:00
|
|
|
//
|
|
|
|
|
// The returned AddrRanges is immutable, so methods like
|
|
|
|
|
// Add will fail.
|
2020-07-14 21:39:52 +00:00
|
|
|
func MakeAddrRanges(a ...AddrRange) AddrRanges {
|
|
|
|
|
// Methods that manipulate the backing store of addrRanges.ranges should
|
|
|
|
|
// not be used on the result from this function (e.g. add) since they may
|
2020-07-14 21:41:12 +00:00
|
|
|
// trigger reallocation. That would normally be fine, except the new
|
|
|
|
|
// backing store won't come from the heap, but from persistentalloc, so
|
|
|
|
|
// we'll leak some memory implicitly.
|
2020-07-14 21:39:52 +00:00
|
|
|
ranges := make([]addrRange, 0, len(a))
|
2020-07-14 21:41:12 +00:00
|
|
|
total := uintptr(0)
|
2020-07-14 21:39:52 +00:00
|
|
|
for _, r := range a {
|
|
|
|
|
ranges = append(ranges, r.addrRange)
|
2020-07-14 21:41:12 +00:00
|
|
|
total += r.Size()
|
2020-07-14 21:39:52 +00:00
|
|
|
}
|
2020-07-14 21:41:12 +00:00
|
|
|
return AddrRanges{addrRanges{
|
|
|
|
|
ranges: ranges,
|
|
|
|
|
totalBytes: total,
|
runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
2022-03-15 02:48:18 +00:00
|
|
|
sysStat: testSysStat,
|
2020-07-14 21:41:12 +00:00
|
|
|
}, false}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Ranges returns a copy of the ranges described by the
|
|
|
|
|
// addrRanges.
|
|
|
|
|
func (a *AddrRanges) Ranges() []AddrRange {
|
|
|
|
|
result := make([]AddrRange, 0, len(a.addrRanges.ranges))
|
|
|
|
|
for _, r := range a.addrRanges.ranges {
|
|
|
|
|
result = append(result, AddrRange{r})
|
|
|
|
|
}
|
|
|
|
|
return result
|
2020-07-14 21:39:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// FindSucc returns the successor to base. See addrRanges.findSucc
|
|
|
|
|
// for more details.
|
|
|
|
|
func (a *AddrRanges) FindSucc(base uintptr) int {
|
|
|
|
|
return a.findSucc(base)
|
|
|
|
|
}
|
|
|
|
|
|
2020-07-14 21:41:12 +00:00
|
|
|
// Add adds a new AddrRange to the AddrRanges.
|
|
|
|
|
//
|
|
|
|
|
// The AddrRange must be mutable (i.e. created by NewAddrRanges),
|
|
|
|
|
// otherwise this method will throw.
|
|
|
|
|
func (a *AddrRanges) Add(r AddrRange) {
|
|
|
|
|
if !a.mutable {
|
|
|
|
|
throw("attempt to mutate immutable AddrRanges")
|
|
|
|
|
}
|
|
|
|
|
a.add(r.addrRange)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// TotalBytes returns the totalBytes field of the addrRanges.
|
|
|
|
|
func (a *AddrRanges) TotalBytes() uintptr {
|
|
|
|
|
return a.addrRanges.totalBytes
|
|
|
|
|
}
|
|
|
|
|
|
2019-08-12 19:08:39 +00:00
|
|
|
// BitRange represents a range over a bitmap.
|
|
|
|
|
type BitRange struct {
|
|
|
|
|
I, N uint // bit index and length in bits
|
|
|
|
|
}
|
2019-08-14 16:32:12 +00:00
|
|
|
|
|
|
|
|
// NewPageAlloc creates a new page allocator for testing and
|
2019-08-21 00:24:25 +00:00
|
|
|
// initializes it with the scav and chunks maps. Each key in these maps
|
|
|
|
|
// represents a chunk index and each value is a series of bit ranges to
|
|
|
|
|
// set within each bitmap's chunk.
|
|
|
|
|
//
|
|
|
|
|
// The initialization of the pageAlloc preserves the invariant that if a
|
|
|
|
|
// scavenged bit is set the alloc bit is necessarily unset, so some
|
|
|
|
|
// of the bits described by scav may be cleared in the final bitmap if
|
|
|
|
|
// ranges in chunks overlap with them.
|
|
|
|
|
//
|
|
|
|
|
// scav is optional, and if nil, the scavenged bitmap will be cleared
|
|
|
|
|
// (as opposed to all 1s, which it usually is). Furthermore, every
|
|
|
|
|
// chunk index in scav must appear in chunks; ones that do not are
|
|
|
|
|
// ignored.
|
|
|
|
|
func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc {
|
2019-08-14 16:32:12 +00:00
|
|
|
p := new(pageAlloc)
|
|
|
|
|
|
|
|
|
|
// We've got an entry, so initialize the pageAlloc.
|
runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not
using heuristics based on density.
The motivation behind this change is twofold:
1. In default Linux configurations, khugepaged can recoalesce hugepages
even after the scavenger breaks them up, resulting in significant
overheads for small heaps when their heaps shrink.
2. The Go runtime already has some heuristics about this, but those
heuristics appear to have bit-rotted and result in haphazard
hugepage management. Unlucky (but otherwise fairly dense) regions of
memory end up not backed by huge pages while sparse regions end up
accidentally marked MADV_HUGEPAGE and are not later broken up by the
scavenger, because it already got the memory it needed from more
dense sections (this is more likely to happen with small heaps that
go idle).
In this change, the runtime uses a new policy:
1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.
This policy is intended to try and back memory that is a good candidate
for huge pages (high occupancy) with huge pages, and give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also occupancy in the near future.
It's generally true that by the end of a GC cycle the heap gets quite
dense (from the perspective of the page allocator).
Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging in order to
break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger avoids
releasing memory in dense page chunks. All this together means the
scavenger will now more generally release memory on a ~1 GC cycle delay.
Notably this has implications for scavenging to maintain the memory
limit and the runtime/debug.FreeOSMemory API. This change makes it so
that in these cases all memory is visible to the scavenger regardless of
sparseness and delays the page allocator in re-marking this memory with
MADV_NOHUGEPAGE for around 1 GC cycle to mitigate churn.
The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.
Fixes #55328.
Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-09-23 16:32:34 +00:00
|
|
|
p.init(new(mutex), testSysStat, true)
|
runtime: static lock ranking for the runtime (enabled by GOEXPERIMENT)
I took some of the infrastructure from Austin's lock logging CR
https://go-review.googlesource.com/c/go/+/192704 (with deadlock
detection from the logs), and developed a setup to give static lock
ranking for runtime locks.
Static lock ranking establishes a documented total ordering among locks,
and then reports an error if the total order is violated. This can
happen if a deadlock happens (by acquiring a sequence of locks in
different orders), or if just one side of a possible deadlock happens.
Lock ordering deadlocks cannot happen as long as the lock ordering is
followed.
Along the way, I found a deadlock involving the new timer code, which Ian fixed
via https://go-review.googlesource.com/c/go/+/207348, as well as two other
potential deadlocks.
See the constants at the top of runtime/lockrank.go to show the static
lock ranking that I ended up with, along with some comments. This is
great documentation of the current intended lock ordering when acquiring
multiple locks in the runtime.
I also added an array lockPartialOrder[] which shows and enforces the
current partial ordering among locks (which is embedded within the total
ordering). This is more specific about the dependencies among locks.
I don't try to check the ranking within a lock class with multiple locks
that can be acquired at the same time (i.e. check the ranking when
multiple hchan locks are acquired).
Currently, I am doing a lockInit() call to set the lock rank of most
locks. Any lock that is not otherwise initialized is assumed to be a
leaf lock (a very high rank lock), so that eliminates the need to do
anything for a bunch of locks (including all architecture-dependent
locks). For two locks, root.lock and notifyList.lock (only in the
runtime/sema.go file), it is not as easy to do lock initialization, so
instead, I am passing the lock rank with the lock calls.
For Windows compilation, I needed to increase the StackGuard size from
896 to 928 because of the new lock-rank checking functions.
Checking of the static lock ranking is enabled by setting
GOEXPERIMENT=staticlockranking before doing a run.
To make sure that the static lock ranking code has no overhead in memory
or CPU when not enabled by GOEXPERIMENT, I changed 'go build/install' so
that it defines a build tag (with the same name) whenever any experiment
has been baked into the toolchain (by checking Expstring()). This allows
me to avoid increasing the size of the 'mutex' type when static lock
ranking is not enabled.
Fixes #38029
Change-Id: I154217ff307c47051f8dae9c2a03b53081acd83a
Reviewed-on: https://go-review.googlesource.com/c/go/+/207619
Reviewed-by: Dan Scales <danscales@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Dan Scales <danscales@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-11-13 17:34:47 -08:00
|
|
|
lockInit(p.mheapLock, lockRankMheap)
|
2019-08-14 16:32:12 +00:00
|
|
|
for i, init := range chunks {
|
|
|
|
|
addr := chunkBase(chunkIdx(i))
|
|
|
|
|
|
|
|
|
|
// Mark the chunk's existence in the pageAlloc.
|
2020-08-21 11:59:55 -04:00
|
|
|
systemstack(func() {
|
|
|
|
|
lock(p.mheapLock)
|
|
|
|
|
p.grow(addr, pallocChunkBytes)
|
|
|
|
|
unlock(p.mheapLock)
|
|
|
|
|
})
|
2019-08-14 16:32:12 +00:00
|
|
|
|
|
|
|
|
// Initialize the bitmap and update pageAlloc metadata.
|
runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not
using heuristics based on density.
The motivation behind this change is twofold:
1. In default Linux configurations, khugepaged can recoalesce hugepages
even after the scavenger breaks them up, resulting in significant
overheads for small heaps when their heaps shrink.
2. The Go runtime already has some heuristics about this, but those
heuristics appear to have bit-rotted and result in haphazard
hugepage management. Unlucky (but otherwise fairly dense) regions of
memory end up not backed by huge pages while sparse regions end up
accidentally marked MADV_HUGEPAGE and are not later broken up by the
scavenger, because it already got the memory it needed from more
dense sections (this is more likely to happen with small heaps that
go idle).
In this change, the runtime uses a new policy:
1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.
This policy is intended to try and back memory that is a good candidate
for huge pages (high occupancy) with huge pages, and give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also occupancy in the near future.
It's generally true that by the end of a GC cycle the heap gets quite
dense (from the perspective of the page allocator).
Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging in order to
break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger avoids
releasing memory in dense page chunks. All this together means the
scavenger will now more generally release memory on a ~1 GC cycle delay.
Notably this has implications for scavenging to maintain the memory
limit and the runtime/debug.FreeOSMemory API. This change makes it so
that in these cases all memory is visible to the scavenger regardless of
sparseness and delays the page allocator in re-marking this memory with
MADV_NOHUGEPAGE for around 1 GC cycle to mitigate churn.
The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.
Fixes #55328.
Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-09-23 16:32:34 +00:00
|
|
|
ci := chunkIndex(addr)
|
|
|
|
|
chunk := p.chunkOf(ci)
|
2019-08-21 00:24:25 +00:00
|
|
|
|
|
|
|
|
// Clear all the scavenged bits which grow set.
|
|
|
|
|
chunk.scavenged.clearRange(0, pallocChunkPages)
|
|
|
|
|
|
runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not
using heuristics based on density.
The motivation behind this change is twofold:
1. In default Linux configurations, khugepaged can recoalesce hugepages
even after the scavenger breaks them up, resulting in significant
overheads for small heaps when their heaps shrink.
2. The Go runtime already has some heuristics about this, but those
heuristics appear to have bit-rotted and result in haphazard
hugepage management. Unlucky (but otherwise fairly dense) regions of
memory end up not backed by huge pages while sparse regions end up
accidentally marked MADV_HUGEPAGE and are not later broken up by the
scavenger, because it already got the memory it needed from more
dense sections (this is more likely to happen with small heaps that
go idle).
In this change, the runtime uses a new policy:
1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.
This policy is intended to try and back memory that is a good candidate
for huge pages (high occupancy) with huge pages, and give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also occupancy in the near future.
It's generally true that by the end of a GC cycle the heap gets quite
dense (from the perspective of the page allocator).
Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging in order to
break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger avoids
releasing memory in dense page chunks. All this together means the
scavenger will now more generally release memory on a ~1 GC cycle delay.
Notably this has implications for scavenging to maintain the memory
limit and the runtime/debug.FreeOSMemory API. This change makes it so
that in these cases all memory is visible to the scavenger regardless of
sparseness and delays the page allocator in re-marking this memory with
MADV_NOHUGEPAGE for around 1 GC cycle to mitigate churn.
The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.
Fixes #55328.
Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-09-23 16:32:34 +00:00
|
|
|
// Simulate the allocation and subsequent free of all pages in
|
|
|
|
|
// the chunk for the scavenge index. This sets the state equivalent
|
|
|
|
|
// with all pages within the index being free.
|
|
|
|
|
p.scav.index.alloc(ci, pallocChunkPages)
|
|
|
|
|
p.scav.index.free(ci, 0, pallocChunkPages)
|
|
|
|
|
|
2019-08-21 00:24:25 +00:00
|
|
|
// Apply scavenge state if applicable.
|
|
|
|
|
if scav != nil {
|
|
|
|
|
if scvg, ok := scav[i]; ok {
|
|
|
|
|
for _, s := range scvg {
|
|
|
|
|
// Ignore the case of s.N == 0. setRange doesn't handle
|
|
|
|
|
// it and it's a no-op anyway.
|
|
|
|
|
if s.N != 0 {
|
|
|
|
|
chunk.scavenged.setRange(s.I, s.N)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Apply alloc state.
|
2019-08-14 16:32:12 +00:00
|
|
|
for _, s := range init {
|
|
|
|
|
// Ignore the case of s.N == 0. allocRange doesn't handle
|
|
|
|
|
// it and it's a no-op anyway.
|
|
|
|
|
if s.N != 0 {
|
|
|
|
|
chunk.allocRange(s.I, s.N)
|
|
|
|
|
|
runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not
using heuristics based on density.
The motivation behind this change is twofold:
1. In default Linux configurations, khugepaged can recoalesce hugepages
even after the scavenger breaks them up, resulting in significant
overheads for small heaps when their heaps shrink.
2. The Go runtime already has some heuristics about this, but those
heuristics appear to have bit-rotted and result in haphazard
hugepage management. Unlucky (but otherwise fairly dense) regions of
memory end up not backed by huge pages while sparse regions end up
accidentally marked MADV_HUGEPAGE and are not later broken up by the
scavenger, because it already got the memory it needed from more
dense sections (this is more likely to happen with small heaps that
go idle).
In this change, the runtime uses a new policy:
1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.
This policy is intended to try and back memory that is a good candidate
for huge pages (high occupancy) with huge pages, and give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also occupancy in the near future.
It's generally true that by the end of a GC cycle the heap gets quite
dense (from the perspective of the page allocator).
Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging in order to
break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger avoids
releasing memory in dense page chunks. All this together means the
scavenger will now more generally release memory on a ~1 GC cycle delay.
Notably this has implications for scavenging to maintain the memory
limit and the runtime/debug.FreeOSMemory API. This change makes it so
that in these cases all memory is visible to the scavenger regardless of
sparseness and delays the page allocator in re-marking this memory with
MADV_NOHUGEPAGE for around 1 GC cycle to mitigate churn.
The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.
Fixes #55328.
Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-09-23 16:32:34 +00:00
|
|
|
// Make sure the scavenge index is updated.
|
|
|
|
|
p.scav.index.alloc(ci, s.N)
|
|
|
|
|
}
|
runtime: redesign scavenging algorithm
Currently the runtime's scavenging algorithm involves running from the
top of the heap address space to the bottom (or as far as it gets) once
per GC cycle. Once it treads some ground, it doesn't tread it again
until the next GC cycle.
This works just fine for the background scavenger, for heap-growth
scavenging, and for debug.FreeOSMemory. However, it breaks down in the
face of a memory limit for small heaps in the tens of MiB. Basically,
because the scavenger never retreads old ground, it's completely
oblivious to new memory it could scavenge, and that it really *should*
in the face of a memory limit.
Also, every time some thread goes to scavenge in the runtime, it
reserves what could be a considerable amount of address space, hiding it
from other scavengers.
This change modifies and simplifies the implementation overall. It's
less code with complexities that are much better encapsulated. The
current implementation iterates optimistically over the address space
looking for memory to scavenge, keeping track of what it last saw. The
new implementation does the same, but instead of directly iterating over
pages, it iterates over chunks. It maintains an index of chunks (as a
bitmap over the address space) that indicate which chunks may contain
scavenge work. The page allocator populates this index, while scavengers
consume it and iterate over it optimistically.
This has a two key benefits:
1. Scavenging is much simpler: find a candidate chunk, and check it,
essentially just using the scavengeOne fast path. There's no need for
the complexity of iterating beyond one chunk, because the index is
lock-free and already maintains that information.
2. If pages are freed to the page allocator (always guaranteed to be
unscavenged), the page allocator immediately notifies all scavengers
of the new source of work, avoiding the hiding issues of the old
implementation.
One downside of the new implementation, however, is that it's
potentially more expensive to find pages to scavenge. In the past, if
a single page would become free high up in the address space, the
runtime's scavengers would ignore it. Now that scavengers won't, one or
more scavengers may need to iterate potentially across the whole heap to
find the next source of work. For the background scavenger, this just
means a potentially less reactive scavenger -- overall it should still
use the same amount of CPU. It means worse overheads for memory limit
scavenging, but that's not exactly something with a baseline yet.
In practice, this shouldn't be too bad, hopefully since the chunk index
is extremely compact. For a 48-bit address space, the index is only 8
MiB in size at worst, but even just one physical page in the index is
able to support up to 128 GiB heaps, provided they aren't terribly
sparse. On 32-bit platforms, the index is only 128 bytes in size.
For #48409.
Change-Id: I72b7e74365046b18c64a6417224c5d85511194fb
Reviewed-on: https://go-review.googlesource.com/c/go/+/399474
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-04-10 20:34:17 +00:00
|
|
|
}
|
|
|
|
|
|
2019-08-14 16:32:12 +00:00
|
|
|
// Update heap metadata for the allocRange calls above.
|
2020-08-21 11:59:55 -04:00
|
|
|
systemstack(func() {
|
|
|
|
|
lock(p.mheapLock)
|
|
|
|
|
p.update(addr, pallocChunkPages, false, false)
|
|
|
|
|
unlock(p.mheapLock)
|
|
|
|
|
})
|
2019-08-14 16:32:12 +00:00
|
|
|
}
|
2020-08-21 11:59:55 -04:00
|
|
|
|
2019-08-14 16:32:12 +00:00
|
|
|
return (*PageAlloc)(p)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// FreePageAlloc releases hard OS resources owned by the pageAlloc. Once this
|
|
|
|
|
// is called the pageAlloc may no longer be used. The object itself will be
|
|
|
|
|
// collected by the garbage collector once it is no longer live.
|
|
|
|
|
func FreePageAlloc(pp *PageAlloc) {
|
|
|
|
|
p := (*pageAlloc)(pp)
|
|
|
|
|
|
|
|
|
|
// Free all the mapped space for the summary levels.
|
|
|
|
|
if pageAlloc64Bit != 0 {
|
|
|
|
|
for l := 0; l < summaryLevels; l++ {
|
runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
2022-03-15 02:48:18 +00:00
|
|
|
sysFreeOS(unsafe.Pointer(&p.summary[l][0]), uintptr(cap(p.summary[l]))*pallocSumBytes)
|
2019-08-14 16:32:12 +00:00
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
resSize := uintptr(0)
|
|
|
|
|
for _, s := range p.summary {
|
|
|
|
|
resSize += uintptr(cap(s)) * pallocSumBytes
|
|
|
|
|
}
|
runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
2022-03-15 02:48:18 +00:00
|
|
|
sysFreeOS(unsafe.Pointer(&p.summary[0][0]), alignUp(resSize, physPageSize))
|
2019-08-14 16:32:12 +00:00
|
|
|
}
|
runtime: redesign scavenging algorithm
Currently the runtime's scavenging algorithm involves running from the
top of the heap address space to the bottom (or as far as it gets) once
per GC cycle. Once it treads some ground, it doesn't tread it again
until the next GC cycle.
This works just fine for the background scavenger, for heap-growth
scavenging, and for debug.FreeOSMemory. However, it breaks down in the
face of a memory limit for small heaps in the tens of MiB. Basically,
because the scavenger never retreads old ground, it's completely
oblivious to new memory it could scavenge, and that it really *should*
in the face of a memory limit.
Also, every time some thread goes to scavenge in the runtime, it
reserves what could be a considerable amount of address space, hiding it
from other scavengers.
This change modifies and simplifies the implementation overall. It's
less code with complexities that are much better encapsulated. The
current implementation iterates optimistically over the address space
looking for memory to scavenge, keeping track of what it last saw. The
new implementation does the same, but instead of directly iterating over
pages, it iterates over chunks. It maintains an index of chunks (as a
bitmap over the address space) that indicate which chunks may contain
scavenge work. The page allocator populates this index, while scavengers
consume it and iterate over it optimistically.
This has a two key benefits:
1. Scavenging is much simpler: find a candidate chunk, and check it,
essentially just using the scavengeOne fast path. There's no need for
the complexity of iterating beyond one chunk, because the index is
lock-free and already maintains that information.
2. If pages are freed to the page allocator (always guaranteed to be
unscavenged), the page allocator immediately notifies all scavengers
of the new source of work, avoiding the hiding issues of the old
implementation.
One downside of the new implementation, however, is that it's
potentially more expensive to find pages to scavenge. In the past, if
a single page would become free high up in the address space, the
runtime's scavengers would ignore it. Now that scavengers won't, one or
more scavengers may need to iterate potentially across the whole heap to
find the next source of work. For the background scavenger, this just
means a potentially less reactive scavenger -- overall it should still
use the same amount of CPU. It means worse overheads for memory limit
scavenging, but that's not exactly something with a baseline yet.
In practice, this shouldn't be too bad, hopefully since the chunk index
is extremely compact. For a 48-bit address space, the index is only 8
MiB in size at worst, but even just one physical page in the index is
able to support up to 128 GiB heaps, provided they aren't terribly
sparse. On 32-bit platforms, the index is only 128 bytes in size.
For #48409.
Change-Id: I72b7e74365046b18c64a6417224c5d85511194fb
Reviewed-on: https://go-review.googlesource.com/c/go/+/399474
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-04-10 20:34:17 +00:00
|
|
|
|
runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not
using heuristics based on density.
The motivation behind this change is twofold:
1. In default Linux configurations, khugepaged can recoalesce hugepages
even after the scavenger breaks them up, resulting in significant
overheads for small heaps when their heaps shrink.
2. The Go runtime already has some heuristics about this, but those
heuristics appear to have bit-rotted and result in haphazard
hugepage management. Unlucky (but otherwise fairly dense) regions of
memory end up not backed by huge pages while sparse regions end up
accidentally marked MADV_HUGEPAGE and are not later broken up by the
scavenger, because it already got the memory it needed from more
dense sections (this is more likely to happen with small heaps that
go idle).
In this change, the runtime uses a new policy:
1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.
This policy is intended to try and back memory that is a good candidate
for huge pages (high occupancy) with huge pages, and give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also occupancy in the near future.
It's generally true that by the end of a GC cycle the heap gets quite
dense (from the perspective of the page allocator).
Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging in order to
break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger avoids
releasing memory in dense page chunks. All this together means the
scavenger will now more generally release memory on a ~1 GC cycle delay.
Notably this has implications for scavenging to maintain the memory
limit and the runtime/debug.FreeOSMemory API. This change makes it so
that in these cases all memory is visible to the scavenger regardless of
sparseness and delays the page allocator in re-marking this memory with
MADV_NOHUGEPAGE for around 1 GC cycle to mitigate churn.
The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.
Fixes #55328.
Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-09-23 16:32:34 +00:00
|
|
|
// Free extra data structures.
|
|
|
|
|
sysFreeOS(unsafe.Pointer(&p.scav.index.chunks[0]), uintptr(cap(p.scav.index.chunks))*unsafe.Sizeof(atomicScavChunkData{}))
|
|
|
|
|
|
runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
2022-03-15 02:48:18 +00:00
|
|
|
// Subtract back out whatever we mapped for the summaries.
|
|
|
|
|
// sysUsed adds to p.sysStat and memstats.mappedReady no matter what
|
|
|
|
|
// (and in anger should actually be accounted for), and there's no other
|
|
|
|
|
// way to figure out how much we actually mapped.
|
2022-04-01 22:34:45 +00:00
|
|
|
gcController.mappedReady.Add(-int64(p.summaryMappedReady))
|
runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
2022-03-15 02:48:18 +00:00
|
|
|
testSysStat.add(-int64(p.summaryMappedReady))
|
2019-08-14 16:32:12 +00:00
|
|
|
|
|
|
|
|
// Free the mapped space for chunks.
|
2019-11-14 23:58:50 +00:00
|
|
|
for i := range p.chunks {
|
|
|
|
|
if x := p.chunks[i]; x != nil {
|
|
|
|
|
p.chunks[i] = nil
|
|
|
|
|
// This memory comes from sysAlloc and will always be page-aligned.
|
runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
2022-03-15 02:48:18 +00:00
|
|
|
sysFree(unsafe.Pointer(x), unsafe.Sizeof(*p.chunks[0]), testSysStat)
|
2019-11-14 23:58:50 +00:00
|
|
|
}
|
|
|
|
|
}
|
2019-08-14 16:32:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// BaseChunkIdx is a convenient chunkIdx value which works on both
|
|
|
|
|
// 64 bit and 32 bit platforms, allowing the tests to share code
|
|
|
|
|
// between the two.
|
2019-11-07 22:42:38 +00:00
|
|
|
//
|
|
|
|
|
// This should not be higher than 0x100*pallocChunkBytes to support
|
|
|
|
|
// mips and mipsle, which only have 31-bit address spaces.
|
2021-08-23 17:27:40 +00:00
|
|
|
var BaseChunkIdx = func() ChunkIdx {
|
|
|
|
|
var prefix uintptr
|
|
|
|
|
if pageAlloc64Bit != 0 {
|
|
|
|
|
prefix = 0xc000
|
|
|
|
|
} else {
|
|
|
|
|
prefix = 0x100
|
|
|
|
|
}
|
|
|
|
|
baseAddr := prefix * pallocChunkBytes
|
|
|
|
|
if goos.IsAix != 0 {
|
|
|
|
|
baseAddr += arenaBaseOffset
|
|
|
|
|
}
|
|
|
|
|
return ChunkIdx(chunkIndex(baseAddr))
|
|
|
|
|
}()
|
2019-08-14 16:32:12 +00:00
|
|
|
|
|
|
|
|
// PageBase returns an address given a chunk index and a page index
|
|
|
|
|
// relative to that chunk.
|
|
|
|
|
func PageBase(c ChunkIdx, pageIdx uint) uintptr {
|
|
|
|
|
return chunkBase(chunkIdx(c)) + uintptr(pageIdx)*pageSize
|
|
|
|
|
}
|
2019-10-17 17:42:15 +00:00
|
|
|
|
|
|
|
|
type BitsMismatch struct {
|
|
|
|
|
Base uintptr
|
|
|
|
|
Got, Want uint64
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func CheckScavengedBitsCleared(mismatches []BitsMismatch) (n int, ok bool) {
|
|
|
|
|
ok = true
|
|
|
|
|
|
|
|
|
|
// Run on the system stack to avoid stack growth allocation.
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
getg().m.mallocing++
|
|
|
|
|
|
|
|
|
|
// Lock so that we can safely access the bitmap.
|
|
|
|
|
lock(&mheap_.lock)
|
|
|
|
|
chunkLoop:
|
|
|
|
|
for i := mheap_.pages.start; i < mheap_.pages.end; i++ {
|
2020-09-09 16:52:18 +00:00
|
|
|
chunk := mheap_.pages.tryChunkOf(i)
|
|
|
|
|
if chunk == nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
2019-10-17 17:42:15 +00:00
|
|
|
for j := 0; j < pallocChunkPages/64; j++ {
|
|
|
|
|
// Run over each 64-bit bitmap section and ensure
|
|
|
|
|
// scavenged is being cleared properly on allocation.
|
|
|
|
|
// If a used bit and scavenged bit are both set, that's
|
|
|
|
|
// an error, and could indicate a larger problem, or
|
|
|
|
|
// an accounting problem.
|
|
|
|
|
want := chunk.scavenged[j] &^ chunk.pallocBits[j]
|
|
|
|
|
got := chunk.scavenged[j]
|
|
|
|
|
if want != got {
|
|
|
|
|
ok = false
|
|
|
|
|
if n >= len(mismatches) {
|
|
|
|
|
break chunkLoop
|
|
|
|
|
}
|
|
|
|
|
mismatches[n] = BitsMismatch{
|
|
|
|
|
Base: chunkBase(i) + uintptr(j)*64*pageSize,
|
|
|
|
|
Got: got,
|
|
|
|
|
Want: want,
|
|
|
|
|
}
|
|
|
|
|
n++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
unlock(&mheap_.lock)
|
|
|
|
|
|
|
|
|
|
getg().m.mallocing--
|
|
|
|
|
})
|
|
|
|
|
return
|
|
|
|
|
}
|
2019-09-16 21:23:24 +00:00
|
|
|
|
|
|
|
|
func PageCachePagesLeaked() (leaked uintptr) {
|
|
|
|
|
stopTheWorld("PageCachePagesLeaked")
|
|
|
|
|
|
|
|
|
|
// Walk over destroyed Ps and look for unflushed caches.
|
|
|
|
|
deadp := allp[len(allp):cap(allp)]
|
|
|
|
|
for _, p := range deadp {
|
|
|
|
|
// Since we're going past len(allp) we may see nil Ps.
|
|
|
|
|
// Just ignore them.
|
|
|
|
|
if p != nil {
|
2019-11-08 16:11:29 -05:00
|
|
|
leaked += uintptr(sys.OnesCount64(p.pcache.cache))
|
2019-09-16 21:23:24 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
startTheWorld()
|
|
|
|
|
return
|
|
|
|
|
}
|
2019-11-08 10:30:24 -08:00
|
|
|
|
|
|
|
|
var Semacquire = semacquire
|
|
|
|
|
var Semrelease1 = semrelease1
|
|
|
|
|
|
|
|
|
|
func SemNwait(addr *uint32) uint32 {
|
runtime: write much more direct test for semaphore waiter scalability
This test originally existed as two tests in test/locklinear.go, but
this checked against actual locks and was flaky. The test was checking
a property of a deep part of the runtime but from a much higher level,
and it's easy for nondeterminism due to scheduling to completely mess
that up, especially on an oversubscribed system.
That test was then moved to the sync package with a more rigorous
testing methodology, but it could still flake pretty easily.
Finally, this CL makes semtable more testable, exports it in
export_test.go, then writes a very direct scalability test for exactly
the situation the original test described. As far as I can tell, this is
much, much more stable, because it's single-threaded and is just
checking exactly the algorithm we need to check.
Don't bother trying to bring in a test that checks for O(log n) behavior
on the other kind of iteration. It'll be perpetually flaky because the
underlying data structure is a treap, so it's only _expected_ to be
O(log n), but it's very easy for it to get unlucky without a large
number of iterations that's too much for a simple test.
Fixes #53381.
Change-Id: Ia1cd2d2b0e36d552d5a8ae137077260a16016602
Reviewed-on: https://go-review.googlesource.com/c/go/+/412875
Reviewed-by: Michael Pratt <mpratt@google.com>
2022-06-16 20:33:35 +00:00
|
|
|
root := semtable.rootFor(addr)
|
2022-08-26 10:27:57 +08:00
|
|
|
return root.nwait.Load()
|
2019-11-08 10:30:24 -08:00
|
|
|
}
|
2020-03-06 14:01:26 -08:00
|
|
|
|
runtime: write much more direct test for semaphore waiter scalability
This test originally existed as two tests in test/locklinear.go, but
this checked against actual locks and was flaky. The test was checking
a property of a deep part of the runtime but from a much higher level,
and it's easy for nondeterminism due to scheduling to completely mess
that up, especially on an oversubscribed system.
That test was then moved to the sync package with a more rigorous
testing methodology, but it could still flake pretty easily.
Finally, this CL makes semtable more testable, exports it in
export_test.go, then writes a very direct scalability test for exactly
the situation the original test described. As far as I can tell, this is
much, much more stable, because it's single-threaded and is just
checking exactly the algorithm we need to check.
Don't bother trying to bring in a test that checks for O(log n) behavior
on the other kind of iteration. It'll be perpetually flaky because the
underlying data structure is a treap, so it's only _expected_ to be
O(log n), but it's very easy for it to get unlucky without a large
number of iterations that's too much for a simple test.
Fixes #53381.
Change-Id: Ia1cd2d2b0e36d552d5a8ae137077260a16016602
Reviewed-on: https://go-review.googlesource.com/c/go/+/412875
Reviewed-by: Michael Pratt <mpratt@google.com>
2022-06-16 20:33:35 +00:00
|
|
|
const SemTableSize = semTabSize
|
|
|
|
|
|
|
|
|
|
// SemTable is a wrapper around semTable exported for testing.
|
|
|
|
|
type SemTable struct {
|
|
|
|
|
semTable
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Enqueue simulates enqueuing a waiter for a semaphore (or lock) at addr.
|
|
|
|
|
func (t *SemTable) Enqueue(addr *uint32) {
|
|
|
|
|
s := acquireSudog()
|
|
|
|
|
s.releasetime = 0
|
|
|
|
|
s.acquiretime = 0
|
|
|
|
|
s.ticket = 0
|
|
|
|
|
t.semTable.rootFor(addr).queue(addr, s, false)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Dequeue simulates dequeuing a waiter for a semaphore (or lock) at addr.
|
|
|
|
|
//
|
|
|
|
|
// Returns true if there actually was a waiter to be dequeued.
|
|
|
|
|
func (t *SemTable) Dequeue(addr *uint32) bool {
|
|
|
|
|
s, _ := t.semTable.rootFor(addr).dequeue(addr)
|
|
|
|
|
if s != nil {
|
|
|
|
|
releaseSudog(s)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-16 16:22:28 +00:00
|
|
|
// mspan wrapper for testing.
|
|
|
|
|
type MSpan mspan
|
|
|
|
|
|
|
|
|
|
// Allocate an mspan for testing.
|
|
|
|
|
func AllocMSpan() *MSpan {
|
|
|
|
|
var s *mspan
|
|
|
|
|
systemstack(func() {
|
2020-09-16 17:08:55 +00:00
|
|
|
lock(&mheap_.lock)
|
2020-09-16 16:22:28 +00:00
|
|
|
s = (*mspan)(mheap_.spanalloc.alloc())
|
2020-09-16 17:08:55 +00:00
|
|
|
unlock(&mheap_.lock)
|
2020-09-16 16:22:28 +00:00
|
|
|
})
|
|
|
|
|
return (*MSpan)(s)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Free an allocated mspan.
|
|
|
|
|
func FreeMSpan(s *MSpan) {
|
|
|
|
|
systemstack(func() {
|
2020-09-16 17:08:55 +00:00
|
|
|
lock(&mheap_.lock)
|
2020-09-16 16:22:28 +00:00
|
|
|
mheap_.spanalloc.free(unsafe.Pointer(s))
|
2020-09-16 17:08:55 +00:00
|
|
|
unlock(&mheap_.lock)
|
2020-09-16 16:22:28 +00:00
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func MSpanCountAlloc(ms *MSpan, bits []byte) int {
|
|
|
|
|
s := (*mspan)(ms)
|
2020-08-21 20:20:12 -07:00
|
|
|
s.nelems = uintptr(len(bits) * 8)
|
|
|
|
|
s.gcmarkBits = (*gcBits)(unsafe.Pointer(&bits[0]))
|
2020-09-16 16:22:28 +00:00
|
|
|
result := s.countAlloc()
|
|
|
|
|
s.gcmarkBits = nil
|
|
|
|
|
return result
|
2020-03-18 18:46:04 +00:00
|
|
|
}
|
2020-08-06 20:36:49 +00:00
|
|
|
|
|
|
|
|
const (
|
2022-08-30 03:13:36 +00:00
|
|
|
TimeHistSubBucketBits = timeHistSubBucketBits
|
|
|
|
|
TimeHistNumSubBuckets = timeHistNumSubBuckets
|
|
|
|
|
TimeHistNumBuckets = timeHistNumBuckets
|
|
|
|
|
TimeHistMinBucketBits = timeHistMinBucketBits
|
|
|
|
|
TimeHistMaxBucketBits = timeHistMaxBucketBits
|
2020-08-06 20:36:49 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
type TimeHistogram timeHistogram
|
|
|
|
|
|
|
|
|
|
// Counts returns the counts for the given bucket, subBucket indices.
|
|
|
|
|
// Returns true if the bucket was valid, otherwise returns the counts
|
2022-08-30 03:13:36 +00:00
|
|
|
// for the overflow bucket if bucket > 0 or the underflow bucket if
|
|
|
|
|
// bucket < 0, and false.
|
|
|
|
|
func (th *TimeHistogram) Count(bucket, subBucket int) (uint64, bool) {
|
2020-08-06 20:36:49 +00:00
|
|
|
t := (*timeHistogram)(th)
|
2022-08-30 03:13:36 +00:00
|
|
|
if bucket < 0 {
|
2022-07-25 15:58:23 -04:00
|
|
|
return t.underflow.Load(), false
|
2020-08-06 20:36:49 +00:00
|
|
|
}
|
2022-08-30 03:13:36 +00:00
|
|
|
i := bucket*TimeHistNumSubBuckets + subBucket
|
|
|
|
|
if i >= len(t.counts) {
|
|
|
|
|
return t.overflow.Load(), false
|
|
|
|
|
}
|
2022-07-25 15:58:23 -04:00
|
|
|
return t.counts[i].Load(), true
|
2020-08-06 20:36:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (th *TimeHistogram) Record(duration int64) {
|
|
|
|
|
(*timeHistogram)(th).record(duration)
|
|
|
|
|
}
|
2020-10-22 16:55:55 +00:00
|
|
|
|
2022-01-21 06:52:43 +00:00
|
|
|
var TimeHistogramMetricsBuckets = timeHistogramMetricsBuckets
|
|
|
|
|
|
2020-10-22 16:55:55 +00:00
|
|
|
func SetIntArgRegs(a int) int {
|
|
|
|
|
lock(&finlock)
|
|
|
|
|
old := intArgRegs
|
2021-03-09 21:13:34 +00:00
|
|
|
if a >= 0 {
|
|
|
|
|
intArgRegs = a
|
|
|
|
|
}
|
2020-10-22 16:55:55 +00:00
|
|
|
unlock(&finlock)
|
|
|
|
|
return old
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func FinalizerGAsleep() bool {
|
2022-04-13 21:14:22 +08:00
|
|
|
return fingStatus.Load()&fingWait != 0
|
2020-10-22 16:55:55 +00:00
|
|
|
}
|
2021-03-24 10:45:20 -04:00
|
|
|
|
|
|
|
|
// For GCTestMoveStackOnNextCall, it's important not to introduce an
|
|
|
|
|
// extra layer of call, since then there's a return before the "real"
|
|
|
|
|
// next call.
|
|
|
|
|
var GCTestMoveStackOnNextCall = gcTestMoveStackOnNextCall
|
|
|
|
|
|
|
|
|
|
// For GCTestIsReachable, it's important that we do this as a call so
|
|
|
|
|
// escape analysis can see through it.
|
|
|
|
|
func GCTestIsReachable(ptrs ...unsafe.Pointer) (mask uint64) {
|
|
|
|
|
return gcTestIsReachable(ptrs...)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// For GCTestPointerClass, it's important that we do this as a call so
|
|
|
|
|
// escape analysis can see through it.
|
|
|
|
|
//
|
|
|
|
|
// This is nosplit because gcTestPointerClass is.
|
|
|
|
|
//
|
|
|
|
|
//go:nosplit
|
|
|
|
|
func GCTestPointerClass(p unsafe.Pointer) string {
|
|
|
|
|
return gcTestPointerClass(p)
|
|
|
|
|
}
|
2021-04-28 23:07:38 -07:00
|
|
|
|
|
|
|
|
const Raceenabled = raceenabled
|
2021-10-01 15:11:51 -04:00
|
|
|
|
|
|
|
|
const (
|
runtime: set the heap goal from the memory limit
This change makes the memory limit functional by including it in the
heap goal calculation. Specifically, we derive a heap goal from the
memory limit, and compare that to the GOGC-based goal. If the goal based
on the memory limit is lower, we prefer that.
To derive the memory limit goal, the heap goal calculation now takes
a few additional parameters as input. As a result, the heap goal, in the
presence of a memory limit, may change dynamically. The consequences of
this are that different parts of the runtime can have different views of
the heap goal; this is OK. What's important is that all of the runtime
is able to observe the correct heap goal for the moment it's doing
something that affects it, like anything that should trigger a GC cycle.
On the topic of triggering a GC cycle, this change also allows any
manually managed memory allocation from the page heap to trigger a GC.
So, specifically workbufs, unrolled GC scan programs, and goroutine
stacks. The reason for this is that now non-heap memory can effect the
trigger or the heap goal.
Most sources of non-heap memory only change slowly, like GC pointer
bitmaps, or change in response to explicit function calls like
GOMAXPROCS. Note also that unrolled GC scan programs and workbufs are
really only relevant during a GC cycle anyway, so they won't actually
ever trigger a GC. Our primary target here is goroutine stacks.
Goroutine stacks can increase quickly, and this is currently totally
independent of the GC cycle. Thus, if for example a goroutine begins to
recurse suddenly and deeply, then even though the heap goal and trigger
react, we might not notice until its too late. As a result, we need to
trigger a GC cycle.
We do this trigger in allocManual instead of in stackalloc because it's
far more general. We ultimately care about memory that's mapped
read/write and not returned to the OS, which is much more the domain of
the page heap than the stack allocator. Furthermore, there may be new
sources of memory manual allocation in the future (e.g. arenas) that
need to trigger a GC if necessary. As such, I'm inclined to leave the
trigger in allocManual as an extra defensive measure.
It's worth noting that because goroutine stacks do not behave quite as
predictably as other non-heap memory, there is the potential for the
heap goal to swing wildly. Fortunately, goroutine stacks that haven't
been set up to shrink by the last GC cycle will not shrink until after
the next one. This reduces the amount of possible churn in the heap goal
because it means that shrinkage only happens once per goroutine, per GC
cycle. After all the goroutines that should shrink did, then goroutine
stacks will only grow. The shrink mechanism is analagous to sweeping,
which is incremental and thus tends toward a steady amount of heap
memory used. As a result, in practice, I expect this to be a non-issue.
Note that if the memory limit is not set, this change should be a no-op.
For #48409.
Change-Id: Ie06d10175e5e36f9fb6450e26ed8acd3d30c681c
Reviewed-on: https://go-review.googlesource.com/c/go/+/394221
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
2022-03-21 21:27:06 +00:00
|
|
|
GCBackgroundUtilization = gcBackgroundUtilization
|
|
|
|
|
GCGoalUtilization = gcGoalUtilization
|
|
|
|
|
DefaultHeapMinimum = defaultHeapMinimum
|
|
|
|
|
MemoryLimitHeapGoalHeadroom = memoryLimitHeapGoalHeadroom
|
2021-10-01 15:11:51 -04:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
type GCController struct {
|
|
|
|
|
gcControllerState
|
|
|
|
|
}
|
|
|
|
|
|
runtime: set the heap goal from the memory limit
This change makes the memory limit functional by including it in the
heap goal calculation. Specifically, we derive a heap goal from the
memory limit, and compare that to the GOGC-based goal. If the goal based
on the memory limit is lower, we prefer that.
To derive the memory limit goal, the heap goal calculation now takes
a few additional parameters as input. As a result, the heap goal, in the
presence of a memory limit, may change dynamically. The consequences of
this are that different parts of the runtime can have different views of
the heap goal; this is OK. What's important is that all of the runtime
is able to observe the correct heap goal for the moment it's doing
something that affects it, like anything that should trigger a GC cycle.
On the topic of triggering a GC cycle, this change also allows any
manually managed memory allocation from the page heap to trigger a GC.
So, specifically workbufs, unrolled GC scan programs, and goroutine
stacks. The reason for this is that now non-heap memory can effect the
trigger or the heap goal.
Most sources of non-heap memory only change slowly, like GC pointer
bitmaps, or change in response to explicit function calls like
GOMAXPROCS. Note also that unrolled GC scan programs and workbufs are
really only relevant during a GC cycle anyway, so they won't actually
ever trigger a GC. Our primary target here is goroutine stacks.
Goroutine stacks can increase quickly, and this is currently totally
independent of the GC cycle. Thus, if for example a goroutine begins to
recurse suddenly and deeply, then even though the heap goal and trigger
react, we might not notice until its too late. As a result, we need to
trigger a GC cycle.
We do this trigger in allocManual instead of in stackalloc because it's
far more general. We ultimately care about memory that's mapped
read/write and not returned to the OS, which is much more the domain of
the page heap than the stack allocator. Furthermore, there may be new
sources of memory manual allocation in the future (e.g. arenas) that
need to trigger a GC if necessary. As such, I'm inclined to leave the
trigger in allocManual as an extra defensive measure.
It's worth noting that because goroutine stacks do not behave quite as
predictably as other non-heap memory, there is the potential for the
heap goal to swing wildly. Fortunately, goroutine stacks that haven't
been set up to shrink by the last GC cycle will not shrink until after
the next one. This reduces the amount of possible churn in the heap goal
because it means that shrinkage only happens once per goroutine, per GC
cycle. After all the goroutines that should shrink did, then goroutine
stacks will only grow. The shrink mechanism is analagous to sweeping,
which is incremental and thus tends toward a steady amount of heap
memory used. As a result, in practice, I expect this to be a non-issue.
Note that if the memory limit is not set, this change should be a no-op.
For #48409.
Change-Id: Ie06d10175e5e36f9fb6450e26ed8acd3d30c681c
Reviewed-on: https://go-review.googlesource.com/c/go/+/394221
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
2022-03-21 21:27:06 +00:00
|
|
|
func NewGCController(gcPercent int, memoryLimit int64) *GCController {
|
2021-10-01 15:11:51 -04:00
|
|
|
// Force the controller to escape. We're going to
|
|
|
|
|
// do 64-bit atomics on it, and if it gets stack-allocated
|
|
|
|
|
// on a 32-bit architecture, it may get allocated unaligned
|
|
|
|
|
// space.
|
2022-04-25 17:21:58 -04:00
|
|
|
g := Escape(new(GCController))
|
2021-10-29 22:42:50 +00:00
|
|
|
g.gcControllerState.test = true // Mark it as a test copy.
|
runtime: set the heap goal from the memory limit
This change makes the memory limit functional by including it in the
heap goal calculation. Specifically, we derive a heap goal from the
memory limit, and compare that to the GOGC-based goal. If the goal based
on the memory limit is lower, we prefer that.
To derive the memory limit goal, the heap goal calculation now takes
a few additional parameters as input. As a result, the heap goal, in the
presence of a memory limit, may change dynamically. The consequences of
this are that different parts of the runtime can have different views of
the heap goal; this is OK. What's important is that all of the runtime
is able to observe the correct heap goal for the moment it's doing
something that affects it, like anything that should trigger a GC cycle.
On the topic of triggering a GC cycle, this change also allows any
manually managed memory allocation from the page heap to trigger a GC.
So, specifically workbufs, unrolled GC scan programs, and goroutine
stacks. The reason for this is that now non-heap memory can effect the
trigger or the heap goal.
Most sources of non-heap memory only change slowly, like GC pointer
bitmaps, or change in response to explicit function calls like
GOMAXPROCS. Note also that unrolled GC scan programs and workbufs are
really only relevant during a GC cycle anyway, so they won't actually
ever trigger a GC. Our primary target here is goroutine stacks.
Goroutine stacks can increase quickly, and this is currently totally
independent of the GC cycle. Thus, if for example a goroutine begins to
recurse suddenly and deeply, then even though the heap goal and trigger
react, we might not notice until its too late. As a result, we need to
trigger a GC cycle.
We do this trigger in allocManual instead of in stackalloc because it's
far more general. We ultimately care about memory that's mapped
read/write and not returned to the OS, which is much more the domain of
the page heap than the stack allocator. Furthermore, there may be new
sources of memory manual allocation in the future (e.g. arenas) that
need to trigger a GC if necessary. As such, I'm inclined to leave the
trigger in allocManual as an extra defensive measure.
It's worth noting that because goroutine stacks do not behave quite as
predictably as other non-heap memory, there is the potential for the
heap goal to swing wildly. Fortunately, goroutine stacks that haven't
been set up to shrink by the last GC cycle will not shrink until after
the next one. This reduces the amount of possible churn in the heap goal
because it means that shrinkage only happens once per goroutine, per GC
cycle. After all the goroutines that should shrink did, then goroutine
stacks will only grow. The shrink mechanism is analagous to sweeping,
which is incremental and thus tends toward a steady amount of heap
memory used. As a result, in practice, I expect this to be a non-issue.
Note that if the memory limit is not set, this change should be a no-op.
For #48409.
Change-Id: Ie06d10175e5e36f9fb6450e26ed8acd3d30c681c
Reviewed-on: https://go-review.googlesource.com/c/go/+/394221
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
2022-03-21 21:27:06 +00:00
|
|
|
g.init(int32(gcPercent), memoryLimit)
|
2021-10-01 15:11:51 -04:00
|
|
|
return g
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (c *GCController) StartCycle(stackSize, globalsSize uint64, scannableFrac float64, gomaxprocs int) {
|
runtime: check the heap goal and trigger dynamically
As it stands, the heap goal and the trigger are set once by
gcController.commit, and then read out of gcController. However with the
coming memory limit we need the GC to be able to respond to changes in
non-heap memory. The simplest way of achieving this is to compute the
heap goal and its associated trigger dynamically.
In order to make this easier to implement, the GC trigger is now based
on the heap goal, as opposed to the status quo of computing both
simultaneously. In many cases we just want the heap goal anyway, not
both, but we definitely need the goal to compute the trigger, because
the trigger's bounds are entirely based on the goal (the initial runway
is not). A consequence of this is that we can't rely on the trigger to
enforce a minimum heap size anymore, and we need to lift that up
directly to the goal. Specifically, we need to lift up any part of the
calculation that *could* put the trigger ahead of the goal. Luckily this
is just the heap minimum and minimum sweep distance. In the first case,
the pacer may behave slightly differently, as the heap minimum is no
longer the minimum trigger, but the actual minimum heap goal. In the
second case it should be the same, as we ensure the additional runway
for sweeping is added to both the goal *and* the trigger, as before, by
computing that in gcControllerState.commit.
There's also another place we update the heap goal: if a GC starts and
we triggered beyond the goal, we always ensure there's some runway.
That calculation uses the current trigger, which violates the rule of
keeping the goal based on the trigger. Notice, however, that using the
precomputed trigger for this isn't even quite correct: due to a bug, or
something else, we might trigger a GC beyond the precomputed trigger.
So this change also adds a "triggered" field to gcControllerState that
tracks the point at which a GC actually triggered. This is independent
of the precomputed trigger, so it's fine for the heap goal calculation
to rely on it. It also turns out, there's more than just that one place
where we really should be using the actual trigger point, so this change
fixes those up too.
Also, because the heap minimum is set by the goal and not the trigger,
the maximum trigger calculation now happens *after* the goal is set, so
the maximum trigger actually does what I originally intended (and what
the comment says): at small heaps, the pacer picks 95% of the runway as
the maximum trigger. Currently, the pacer picks a small trigger based
on a not-yet-rounded-up heap goal, so the trigger gets rounded up to the
goal, and as per the "ensure there's some runway" check, the runway ends
up at always being 64 KiB. That check is supposed to be for exceptional
circumstances, not the status quo. There's a test introduced in the last
CL that needs to be updated to accomodate this slight change in
behavior.
So, this all sounds like a lot that changed, but what we're talking about
here are really, really tight corner cases that arise from situations
outside of our control, like pathologically bad behavior on the part of
an OS or CPU. Even in these corner cases, it's very unlikely that users
will notice any difference at all. What's more important, I think, is
that the pacer behaves more closely to what all the comments describe,
and what the original intent was.
Another note: at first, one might think that computing the heap goal and
trigger dynamically introduces some raciness, but not in this CL: the heap
goal and trigger are completely static.
Allocation outside of a GC cycle may now be a bit slower than before, as
the GC trigger check is now significantly more complex. However, note
that this executes basically just as often as gcController.revise, and
that makes up for a vanishingly small part of any CPU profile. The next
CL cleans up the floating point multiplications on this path
nonetheless, just to be safe.
For #48409.
Change-Id: I280f5ad607a86756d33fb8449ad08555cbee93f9
Reviewed-on: https://go-review.googlesource.com/c/go/+/397014
Run-TryBot: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-03-21 21:27:06 +00:00
|
|
|
trigger, _ := c.trigger()
|
runtime: set the heap goal from the memory limit
This change makes the memory limit functional by including it in the
heap goal calculation. Specifically, we derive a heap goal from the
memory limit, and compare that to the GOGC-based goal. If the goal based
on the memory limit is lower, we prefer that.
To derive the memory limit goal, the heap goal calculation now takes
a few additional parameters as input. As a result, the heap goal, in the
presence of a memory limit, may change dynamically. The consequences of
this are that different parts of the runtime can have different views of
the heap goal; this is OK. What's important is that all of the runtime
is able to observe the correct heap goal for the moment it's doing
something that affects it, like anything that should trigger a GC cycle.
On the topic of triggering a GC cycle, this change also allows any
manually managed memory allocation from the page heap to trigger a GC.
So, specifically workbufs, unrolled GC scan programs, and goroutine
stacks. The reason for this is that now non-heap memory can effect the
trigger or the heap goal.
Most sources of non-heap memory only change slowly, like GC pointer
bitmaps, or change in response to explicit function calls like
GOMAXPROCS. Note also that unrolled GC scan programs and workbufs are
really only relevant during a GC cycle anyway, so they won't actually
ever trigger a GC. Our primary target here is goroutine stacks.
Goroutine stacks can increase quickly, and this is currently totally
independent of the GC cycle. Thus, if for example a goroutine begins to
recurse suddenly and deeply, then even though the heap goal and trigger
react, we might not notice until its too late. As a result, we need to
trigger a GC cycle.
We do this trigger in allocManual instead of in stackalloc because it's
far more general. We ultimately care about memory that's mapped
read/write and not returned to the OS, which is much more the domain of
the page heap than the stack allocator. Furthermore, there may be new
sources of memory manual allocation in the future (e.g. arenas) that
need to trigger a GC if necessary. As such, I'm inclined to leave the
trigger in allocManual as an extra defensive measure.
It's worth noting that because goroutine stacks do not behave quite as
predictably as other non-heap memory, there is the potential for the
heap goal to swing wildly. Fortunately, goroutine stacks that haven't
been set up to shrink by the last GC cycle will not shrink until after
the next one. This reduces the amount of possible churn in the heap goal
because it means that shrinkage only happens once per goroutine, per GC
cycle. After all the goroutines that should shrink did, then goroutine
stacks will only grow. The shrink mechanism is analagous to sweeping,
which is incremental and thus tends toward a steady amount of heap
memory used. As a result, in practice, I expect this to be a non-issue.
Note that if the memory limit is not set, this change should be a no-op.
For #48409.
Change-Id: Ie06d10175e5e36f9fb6450e26ed8acd3d30c681c
Reviewed-on: https://go-review.googlesource.com/c/go/+/394221
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Michael Pratt <mpratt@google.com>
2022-03-21 21:27:06 +00:00
|
|
|
if c.heapMarked > trigger {
|
|
|
|
|
trigger = c.heapMarked
|
|
|
|
|
}
|
2022-07-15 14:27:48 -04:00
|
|
|
c.maxStackScan.Store(stackSize)
|
2022-07-15 14:33:16 -04:00
|
|
|
c.globalsScan.Store(globalsSize)
|
2022-07-15 14:21:02 -04:00
|
|
|
c.heapLive.Store(trigger)
|
2022-07-15 16:56:03 -04:00
|
|
|
c.heapScan.Add(int64(float64(trigger-c.heapMarked) * scannableFrac))
|
runtime: reduce max idle mark workers during periodic GC cycles
This change reduces the maximum number of idle mark workers during
periodic (currently every 2 minutes) GC cycles to 1.
Idle mark workers soak up all available and unused Ps, up to GOMAXPROCS.
While this provides some throughput and latency benefit in general, it
can cause what appear to be massive CPU utilization spikes in otherwise
idle applications. This is mostly an issue for *very* idle applications,
ones idle enough to trigger periodic GC cycles. This spike also tends to
interact poorly with auto-scaling systems, as the system might assume
the load average is very low and suddenly see a massive burst in
activity.
The result of this change is not to bring down this 100% (of GOMAXPROCS)
CPU utilization spike to 0%, but rather
min(25% + 1/GOMAXPROCS*100%, 100%)
Idle mark workers also do incur a small latency penalty as they must be
descheduled for other work that might pop up. Luckily the runtime is
pretty good about getting idle mark workers off of Ps, so in general
the latency benefit from shorter GC cycles outweighs this cost. But, the
cost is still non-zero and may be more significant in idle applications
that aren't invoking assists and write barriers quite as often.
We can't completely eliminate idle mark workers because they're
currently necessary for GC progress in some circumstances. Namely,
they're critical for progress when all we have is fractional workers. If
a fractional worker meets its quota, and all user goroutines are blocked
directly or indirectly on a GC cycle (via runtime.GOMAXPROCS, or
runtime.GC), the program may deadlock without GC workers, since the
fractional worker will go to sleep with nothing to wake it.
Fixes #37116.
For #44163.
Change-Id: Ib74793bb6b88d1765c52d445831310b0d11ef423
Reviewed-on: https://go-review.googlesource.com/c/go/+/393394
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-03-16 15:47:57 +00:00
|
|
|
c.startCycle(0, gomaxprocs, gcTrigger{kind: gcTriggerHeap})
|
2021-10-01 15:11:51 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (c *GCController) AssistWorkPerByte() float64 {
|
|
|
|
|
return c.assistWorkPerByte.Load()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (c *GCController) HeapGoal() uint64 {
|
runtime: check the heap goal and trigger dynamically
As it stands, the heap goal and the trigger are set once by
gcController.commit, and then read out of gcController. However with the
coming memory limit we need the GC to be able to respond to changes in
non-heap memory. The simplest way of achieving this is to compute the
heap goal and its associated trigger dynamically.
In order to make this easier to implement, the GC trigger is now based
on the heap goal, as opposed to the status quo of computing both
simultaneously. In many cases we just want the heap goal anyway, not
both, but we definitely need the goal to compute the trigger, because
the trigger's bounds are entirely based on the goal (the initial runway
is not). A consequence of this is that we can't rely on the trigger to
enforce a minimum heap size anymore, and we need to lift that up
directly to the goal. Specifically, we need to lift up any part of the
calculation that *could* put the trigger ahead of the goal. Luckily this
is just the heap minimum and minimum sweep distance. In the first case,
the pacer may behave slightly differently, as the heap minimum is no
longer the minimum trigger, but the actual minimum heap goal. In the
second case it should be the same, as we ensure the additional runway
for sweeping is added to both the goal *and* the trigger, as before, by
computing that in gcControllerState.commit.
There's also another place we update the heap goal: if a GC starts and
we triggered beyond the goal, we always ensure there's some runway.
That calculation uses the current trigger, which violates the rule of
keeping the goal based on the trigger. Notice, however, that using the
precomputed trigger for this isn't even quite correct: due to a bug, or
something else, we might trigger a GC beyond the precomputed trigger.
So this change also adds a "triggered" field to gcControllerState that
tracks the point at which a GC actually triggered. This is independent
of the precomputed trigger, so it's fine for the heap goal calculation
to rely on it. It also turns out, there's more than just that one place
where we really should be using the actual trigger point, so this change
fixes those up too.
Also, because the heap minimum is set by the goal and not the trigger,
the maximum trigger calculation now happens *after* the goal is set, so
the maximum trigger actually does what I originally intended (and what
the comment says): at small heaps, the pacer picks 95% of the runway as
the maximum trigger. Currently, the pacer picks a small trigger based
on a not-yet-rounded-up heap goal, so the trigger gets rounded up to the
goal, and as per the "ensure there's some runway" check, the runway ends
up at always being 64 KiB. That check is supposed to be for exceptional
circumstances, not the status quo. There's a test introduced in the last
CL that needs to be updated to accomodate this slight change in
behavior.
So, this all sounds like a lot that changed, but what we're talking about
here are really, really tight corner cases that arise from situations
outside of our control, like pathologically bad behavior on the part of
an OS or CPU. Even in these corner cases, it's very unlikely that users
will notice any difference at all. What's more important, I think, is
that the pacer behaves more closely to what all the comments describe,
and what the original intent was.
Another note: at first, one might think that computing the heap goal and
trigger dynamically introduces some raciness, but not in this CL: the heap
goal and trigger are completely static.
Allocation outside of a GC cycle may now be a bit slower than before, as
the GC trigger check is now significantly more complex. However, note
that this executes basically just as often as gcController.revise, and
that makes up for a vanishingly small part of any CPU profile. The next
CL cleans up the floating point multiplications on this path
nonetheless, just to be safe.
For #48409.
Change-Id: I280f5ad607a86756d33fb8449ad08555cbee93f9
Reviewed-on: https://go-review.googlesource.com/c/go/+/397014
Run-TryBot: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-03-21 21:27:06 +00:00
|
|
|
return c.heapGoal()
|
2021-10-01 15:11:51 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (c *GCController) HeapLive() uint64 {
|
2022-07-15 14:21:02 -04:00
|
|
|
return c.heapLive.Load()
|
2021-10-01 15:11:51 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (c *GCController) HeapMarked() uint64 {
|
|
|
|
|
return c.heapMarked
|
|
|
|
|
}
|
|
|
|
|
|
runtime: check the heap goal and trigger dynamically
As it stands, the heap goal and the trigger are set once by
gcController.commit, and then read out of gcController. However with the
coming memory limit we need the GC to be able to respond to changes in
non-heap memory. The simplest way of achieving this is to compute the
heap goal and its associated trigger dynamically.
In order to make this easier to implement, the GC trigger is now based
on the heap goal, as opposed to the status quo of computing both
simultaneously. In many cases we just want the heap goal anyway, not
both, but we definitely need the goal to compute the trigger, because
the trigger's bounds are entirely based on the goal (the initial runway
is not). A consequence of this is that we can't rely on the trigger to
enforce a minimum heap size anymore, and we need to lift that up
directly to the goal. Specifically, we need to lift up any part of the
calculation that *could* put the trigger ahead of the goal. Luckily this
is just the heap minimum and minimum sweep distance. In the first case,
the pacer may behave slightly differently, as the heap minimum is no
longer the minimum trigger, but the actual minimum heap goal. In the
second case it should be the same, as we ensure the additional runway
for sweeping is added to both the goal *and* the trigger, as before, by
computing that in gcControllerState.commit.
There's also another place we update the heap goal: if a GC starts and
we triggered beyond the goal, we always ensure there's some runway.
That calculation uses the current trigger, which violates the rule of
keeping the goal based on the trigger. Notice, however, that using the
precomputed trigger for this isn't even quite correct: due to a bug, or
something else, we might trigger a GC beyond the precomputed trigger.
So this change also adds a "triggered" field to gcControllerState that
tracks the point at which a GC actually triggered. This is independent
of the precomputed trigger, so it's fine for the heap goal calculation
to rely on it. It also turns out, there's more than just that one place
where we really should be using the actual trigger point, so this change
fixes those up too.
Also, because the heap minimum is set by the goal and not the trigger,
the maximum trigger calculation now happens *after* the goal is set, so
the maximum trigger actually does what I originally intended (and what
the comment says): at small heaps, the pacer picks 95% of the runway as
the maximum trigger. Currently, the pacer picks a small trigger based
on a not-yet-rounded-up heap goal, so the trigger gets rounded up to the
goal, and as per the "ensure there's some runway" check, the runway ends
up at always being 64 KiB. That check is supposed to be for exceptional
circumstances, not the status quo. There's a test introduced in the last
CL that needs to be updated to accomodate this slight change in
behavior.
So, this all sounds like a lot that changed, but what we're talking about
here are really, really tight corner cases that arise from situations
outside of our control, like pathologically bad behavior on the part of
an OS or CPU. Even in these corner cases, it's very unlikely that users
will notice any difference at all. What's more important, I think, is
that the pacer behaves more closely to what all the comments describe,
and what the original intent was.
Another note: at first, one might think that computing the heap goal and
trigger dynamically introduces some raciness, but not in this CL: the heap
goal and trigger are completely static.
Allocation outside of a GC cycle may now be a bit slower than before, as
the GC trigger check is now significantly more complex. However, note
that this executes basically just as often as gcController.revise, and
that makes up for a vanishingly small part of any CPU profile. The next
CL cleans up the floating point multiplications on this path
nonetheless, just to be safe.
For #48409.
Change-Id: I280f5ad607a86756d33fb8449ad08555cbee93f9
Reviewed-on: https://go-review.googlesource.com/c/go/+/397014
Run-TryBot: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-03-21 21:27:06 +00:00
|
|
|
func (c *GCController) Triggered() uint64 {
|
|
|
|
|
return c.triggered
|
2021-10-01 15:11:51 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
type GCControllerReviseDelta struct {
|
|
|
|
|
HeapLive int64
|
|
|
|
|
HeapScan int64
|
|
|
|
|
HeapScanWork int64
|
|
|
|
|
StackScanWork int64
|
|
|
|
|
GlobalsScanWork int64
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (c *GCController) Revise(d GCControllerReviseDelta) {
|
2022-07-15 14:21:02 -04:00
|
|
|
c.heapLive.Add(d.HeapLive)
|
2022-07-15 16:56:03 -04:00
|
|
|
c.heapScan.Add(d.HeapScan)
|
runtime: implement GC pacer redesign
This change implements the GC pacer redesign outlined in #44167 and the
accompanying design document, behind a GOEXPERIMENT flag that is on by
default.
In addition to adding the new pacer, this CL also includes code to track
and account for stack and globals scan work in the pacer and in the
assist credit system.
The new pacer also deviates slightly from the document in that it
increases the bound on the minimum trigger ratio from 0.6 (scaled by
GOGC) to 0.7. The logic behind this change is that the new pacer much
more consistently hits the goal (good!) leading to slightly less
frequent GC cycles, but _longer_ ones (in this case, bad!). It turns out
that the cost of having the GC on hurts throughput significantly (per
byte of memory used), though tail latencies can improve by up to 10%! To
be conservative, this change moves the value to 0.7 where there is a
small improvement to both throughput and latency, given the memory use.
Because the new pacer accounts for the two most significant sources of
scan work after heap objects, it is now also safer to reduce the minimum
heap size without leading to very poor amortization. This change thus
decreases the minimum heap size to 512 KiB, which corresponds to the
fact that the runtime has around 200 KiB of scannable globals always
there, up-front, providing a baseline.
Benchmark results: https://perf.golang.org/search?q=upload:20211001.6
tile38's KNearest benchmark shows a memory increase, but throughput (and
latency) per byte of memory used is better.
gopher-lua showed an increase in both CPU time and memory usage, but
subsequent attempts to reproduce this behavior are inconsistent.
Sometimes the overall performance is better, sometimes it's worse. This
suggests that the benchmark is fairly noisy in a way not captured by the
benchmarking framework itself.
biogo-igor is the only benchmark to show a significant performance loss.
This benchmark exhibits a very high GC rate, with relatively little work
to do in each cycle. The idle mark workers are quite active. In the new
pacer, mark phases are longer, mark assists are fewer, and some of that
time in mark assists has shifted to idle workers. Linux perf indicates
that the difference in CPU time can be mostly attributed to write-barrier
slow path related calls, which in turn indicates that the write barrier
being on for longer is the primary culprit. This also explains the memory
increase, as a longer mark phase leads to more memory allocated black,
surviving an extra cycle and contributing to the heap goal.
For #44167.
Change-Id: I8ac7cfef7d593e4a642c9b2be43fb3591a8ec9c4
Reviewed-on: https://go-review.googlesource.com/c/go/+/309869
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
2021-04-13 03:07:27 +00:00
|
|
|
c.heapScanWork.Add(d.HeapScanWork)
|
|
|
|
|
c.stackScanWork.Add(d.StackScanWork)
|
|
|
|
|
c.globalsScanWork.Add(d.GlobalsScanWork)
|
2021-10-01 15:11:51 -04:00
|
|
|
c.revise()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (c *GCController) EndCycle(bytesMarked uint64, assistTime, elapsed int64, gomaxprocs int) {
|
runtime: add GC CPU utilization limiter
This change adds a GC CPU utilization limiter to the GC. It disables
assists to ensure GC CPU utilization remains under 50%. It uses a leaky
bucket mechanism that will only fill if GC CPU utilization exceeds 50%.
Once the bucket begins to overflow, GC assists are limited until the
bucket empties, at the risk of GC overshoot. The limiter is primarily
updated by assists. The scheduler may also update it, but only if the
GC is on and a few milliseconds have passed since the last update. This
second case exists to ensure that if the limiter is on, and no assists
are happening, we're still updating the limiter regularly.
The purpose of this limiter is to mitigate GC death spirals, opting to
use more memory instead.
This change turns the limiter on always. In practice, 50% overall GC CPU
utilization is very difficult to hit unless you're trying; even the most
allocation-heavy applications with complex heaps still need to do
something with that memory. Note that small GOGC values (i.e.
single-digit, or low teens) are more likely to trigger the limiter,
which means the GOGC tradeoff may no longer be respected. Even so, it
should still be relatively rare.
This change also introduces the feature flag for code to support the
memory limit feature.
For #48409.
Change-Id: Ia30f914e683e491a00900fd27868446c65e5d3c2
Reviewed-on: https://go-review.googlesource.com/c/go/+/353989
Reviewed-by: Michael Pratt <mpratt@google.com>
2021-10-01 22:52:12 -04:00
|
|
|
c.assistTime.Store(assistTime)
|
2022-02-14 22:36:25 +00:00
|
|
|
c.endCycle(elapsed, gomaxprocs, false)
|
2021-10-01 15:11:51 -04:00
|
|
|
c.resetLive(bytesMarked)
|
runtime: check the heap goal and trigger dynamically
As it stands, the heap goal and the trigger are set once by
gcController.commit, and then read out of gcController. However with the
coming memory limit we need the GC to be able to respond to changes in
non-heap memory. The simplest way of achieving this is to compute the
heap goal and its associated trigger dynamically.
In order to make this easier to implement, the GC trigger is now based
on the heap goal, as opposed to the status quo of computing both
simultaneously. In many cases we just want the heap goal anyway, not
both, but we definitely need the goal to compute the trigger, because
the trigger's bounds are entirely based on the goal (the initial runway
is not). A consequence of this is that we can't rely on the trigger to
enforce a minimum heap size anymore, and we need to lift that up
directly to the goal. Specifically, we need to lift up any part of the
calculation that *could* put the trigger ahead of the goal. Luckily this
is just the heap minimum and minimum sweep distance. In the first case,
the pacer may behave slightly differently, as the heap minimum is no
longer the minimum trigger, but the actual minimum heap goal. In the
second case it should be the same, as we ensure the additional runway
for sweeping is added to both the goal *and* the trigger, as before, by
computing that in gcControllerState.commit.
There's also another place we update the heap goal: if a GC starts and
we triggered beyond the goal, we always ensure there's some runway.
That calculation uses the current trigger, which violates the rule of
keeping the goal based on the trigger. Notice, however, that using the
precomputed trigger for this isn't even quite correct: due to a bug, or
something else, we might trigger a GC beyond the precomputed trigger.
So this change also adds a "triggered" field to gcControllerState that
tracks the point at which a GC actually triggered. This is independent
of the precomputed trigger, so it's fine for the heap goal calculation
to rely on it. It also turns out, there's more than just that one place
where we really should be using the actual trigger point, so this change
fixes those up too.
Also, because the heap minimum is set by the goal and not the trigger,
the maximum trigger calculation now happens *after* the goal is set, so
the maximum trigger actually does what I originally intended (and what
the comment says): at small heaps, the pacer picks 95% of the runway as
the maximum trigger. Currently, the pacer picks a small trigger based
on a not-yet-rounded-up heap goal, so the trigger gets rounded up to the
goal, and as per the "ensure there's some runway" check, the runway ends
up at always being 64 KiB. That check is supposed to be for exceptional
circumstances, not the status quo. There's a test introduced in the last
CL that needs to be updated to accomodate this slight change in
behavior.
So, this all sounds like a lot that changed, but what we're talking about
here are really, really tight corner cases that arise from situations
outside of our control, like pathologically bad behavior on the part of
an OS or CPU. Even in these corner cases, it's very unlikely that users
will notice any difference at all. What's more important, I think, is
that the pacer behaves more closely to what all the comments describe,
and what the original intent was.
Another note: at first, one might think that computing the heap goal and
trigger dynamically introduces some raciness, but not in this CL: the heap
goal and trigger are completely static.
Allocation outside of a GC cycle may now be a bit slower than before, as
the GC trigger check is now significantly more complex. However, note
that this executes basically just as often as gcController.revise, and
that makes up for a vanishingly small part of any CPU profile. The next
CL cleans up the floating point multiplications on this path
nonetheless, just to be safe.
For #48409.
Change-Id: I280f5ad607a86756d33fb8449ad08555cbee93f9
Reviewed-on: https://go-review.googlesource.com/c/go/+/397014
Run-TryBot: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-03-21 21:27:06 +00:00
|
|
|
c.commit(false)
|
2021-10-01 15:11:51 -04:00
|
|
|
}
|
|
|
|
|
|
runtime: reduce max idle mark workers during periodic GC cycles
This change reduces the maximum number of idle mark workers during
periodic (currently every 2 minutes) GC cycles to 1.
Idle mark workers soak up all available and unused Ps, up to GOMAXPROCS.
While this provides some throughput and latency benefit in general, it
can cause what appear to be massive CPU utilization spikes in otherwise
idle applications. This is mostly an issue for *very* idle applications,
ones idle enough to trigger periodic GC cycles. This spike also tends to
interact poorly with auto-scaling systems, as the system might assume
the load average is very low and suddenly see a massive burst in
activity.
The result of this change is not to bring down this 100% (of GOMAXPROCS)
CPU utilization spike to 0%, but rather
min(25% + 1/GOMAXPROCS*100%, 100%)
Idle mark workers also do incur a small latency penalty as they must be
descheduled for other work that might pop up. Luckily the runtime is
pretty good about getting idle mark workers off of Ps, so in general
the latency benefit from shorter GC cycles outweighs this cost. But, the
cost is still non-zero and may be more significant in idle applications
that aren't invoking assists and write barriers quite as often.
We can't completely eliminate idle mark workers because they're
currently necessary for GC progress in some circumstances. Namely,
they're critical for progress when all we have is fractional workers. If
a fractional worker meets its quota, and all user goroutines are blocked
directly or indirectly on a GC cycle (via runtime.GOMAXPROCS, or
runtime.GC), the program may deadlock without GC workers, since the
fractional worker will go to sleep with nothing to wake it.
Fixes #37116.
For #44163.
Change-Id: Ib74793bb6b88d1765c52d445831310b0d11ef423
Reviewed-on: https://go-review.googlesource.com/c/go/+/393394
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-03-16 15:47:57 +00:00
|
|
|
func (c *GCController) AddIdleMarkWorker() bool {
|
|
|
|
|
return c.addIdleMarkWorker()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (c *GCController) NeedIdleMarkWorker() bool {
|
|
|
|
|
return c.needIdleMarkWorker()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (c *GCController) RemoveIdleMarkWorker() {
|
|
|
|
|
c.removeIdleMarkWorker()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (c *GCController) SetMaxIdleMarkWorkers(max int32) {
|
|
|
|
|
c.setMaxIdleMarkWorkers(max)
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-25 17:21:58 -04:00
|
|
|
var alwaysFalse bool
|
2021-12-01 12:15:45 -05:00
|
|
|
var escapeSink any
|
2021-10-01 15:11:51 -04:00
|
|
|
|
2022-04-25 17:21:58 -04:00
|
|
|
func Escape[T any](x T) T {
|
|
|
|
|
if alwaysFalse {
|
|
|
|
|
escapeSink = x
|
|
|
|
|
}
|
2021-10-01 15:11:51 -04:00
|
|
|
return x
|
|
|
|
|
}
|
2021-11-22 15:33:01 -05:00
|
|
|
|
|
|
|
|
// Acquirem blocks preemption.
|
|
|
|
|
func Acquirem() {
|
|
|
|
|
acquirem()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func Releasem() {
|
|
|
|
|
releasem(getg().m)
|
|
|
|
|
}
|
2022-01-21 16:52:56 -05:00
|
|
|
|
|
|
|
|
var Timediv = timediv
|
runtime: make piController much more defensive about overflow
If something goes horribly wrong with the assumptions surrounding a
piController, its internal error state might accumulate in an unbounded
manner. In practice this means unexpected Inf and NaN values.
Avoid this by identifying cases where the error overflows and resetting
controller state.
In the scavenger, this case is much more likely. All that has to happen
is the proportional relationship between sleep time and estimated CPU
usage has to break down. Unfortunately because we're just measuring
monotonic time for all this, there are lots of ways it could happen,
especially in an oversubscribed system. In these cases, just fall back
on a conservative pace for scavenging and try to wait out the issue.
In the pacer I'm pretty sure this is impossible. Because we wire the
output of the controller to the input, the response is very directly
correlated, so it's impossible for the controller's core assumption to
break down.
While we're in the pacer, add more detail about why that controller is
even there, as well as its purpose.
Finally, let's be proactive about other sources of overflow, namely
overflow from a very large input value. This change adds a check after
the first few operations to detect overflow issues from the input,
specifically the multiplication.
No tests for the pacer because I was unable to actually break the
pacer's controller under a fuzzer, and no tests for the scavenger because
it is not really in a testable state.
However:
* This change includes a fuzz test for the piController.
* I broke out the scavenger code locally and fuzz tested it, confirming
that the patch eliminates the original failure mode.
* I tested that on a local heap-spike test, the scavenger continues
operating as expected under normal conditions.
Fixes #51061.
Change-Id: I02a01d2dbf0eb9d2a8a8e7274d4165c2b6a3415a
Reviewed-on: https://go-review.googlesource.com/c/go/+/383954
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-02-08 00:52:11 +00:00
|
|
|
|
|
|
|
|
type PIController struct {
|
|
|
|
|
piController
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func NewPIController(kp, ti, tt, min, max float64) *PIController {
|
|
|
|
|
return &PIController{piController{
|
|
|
|
|
kp: kp,
|
|
|
|
|
ti: ti,
|
|
|
|
|
tt: tt,
|
|
|
|
|
min: min,
|
|
|
|
|
max: max,
|
|
|
|
|
}}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (c *PIController) Next(input, setpoint, period float64) (float64, bool) {
|
|
|
|
|
return c.piController.next(input, setpoint, period)
|
|
|
|
|
}
|
2022-02-10 00:49:44 +00:00
|
|
|
|
runtime: add GC CPU utilization limiter
This change adds a GC CPU utilization limiter to the GC. It disables
assists to ensure GC CPU utilization remains under 50%. It uses a leaky
bucket mechanism that will only fill if GC CPU utilization exceeds 50%.
Once the bucket begins to overflow, GC assists are limited until the
bucket empties, at the risk of GC overshoot. The limiter is primarily
updated by assists. The scheduler may also update it, but only if the
GC is on and a few milliseconds have passed since the last update. This
second case exists to ensure that if the limiter is on, and no assists
are happening, we're still updating the limiter regularly.
The purpose of this limiter is to mitigate GC death spirals, opting to
use more memory instead.
This change turns the limiter on always. In practice, 50% overall GC CPU
utilization is very difficult to hit unless you're trying; even the most
allocation-heavy applications with complex heaps still need to do
something with that memory. Note that small GOGC values (i.e.
single-digit, or low teens) are more likely to trigger the limiter,
which means the GOGC tradeoff may no longer be respected. Even so, it
should still be relatively rare.
This change also introduces the feature flag for code to support the
memory limit feature.
For #48409.
Change-Id: Ia30f914e683e491a00900fd27868446c65e5d3c2
Reviewed-on: https://go-review.googlesource.com/c/go/+/353989
Reviewed-by: Michael Pratt <mpratt@google.com>
2021-10-01 22:52:12 -04:00
|
|
|
const (
|
|
|
|
|
CapacityPerProc = capacityPerProc
|
|
|
|
|
GCCPULimiterUpdatePeriod = gcCPULimiterUpdatePeriod
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
type GCCPULimiter struct {
|
|
|
|
|
limiter gcCPULimiterState
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func NewGCCPULimiter(now int64, gomaxprocs int32) *GCCPULimiter {
|
|
|
|
|
// Force the controller to escape. We're going to
|
|
|
|
|
// do 64-bit atomics on it, and if it gets stack-allocated
|
|
|
|
|
// on a 32-bit architecture, it may get allocated unaligned
|
|
|
|
|
// space.
|
2022-05-03 15:40:40 +00:00
|
|
|
l := Escape(new(GCCPULimiter))
|
runtime: only use CPU time from the current window in the GC CPU limiter
Currently the GC CPU limiter consumes CPU time from a few pools, but
because the events that flush to those pools may overlap, rather than be
strictly contained within, the update window for the GC CPU limiter, the
limiter's accounting is ultimately sloppy.
This sloppiness complicates accounting for idle time more completely,
and makes reasoning about the transient behavior of the GC CPU limiter
much more difficult.
To remedy this, this CL adds a field to the P struct that tracks the
start time of any in-flight event the limiter might care about, along
with information about the nature of that event. This timestamp is
managed atomically so that the GC CPU limiter can come in and perform a
read of the partial CPU time consumed by a given event. The limiter also
updates the timestamp so that only what's left over is flushed by the
event itself when it completes.
The end result of this change is that, since the GC CPU limiter is aware
of all past completed events, and all in-flight events, it can much more
accurately collect the CPU time of events since the last update. There's
still the possibility for skew, but any leftover time will be captured
in the following update, and the magnitude of this leftover time is
effectively bounded by the update period of the GC CPU limiter, which is
much easier to consider.
One caveat of managing this timestamp-type combo atomically is that they
need to be packed in 64 bits. So, this CL gives up the top 3 bits of the
timestamp and places the type information there. What this means is we
effectively have only a 61-bit resolution timestamp. This is fine when
the top 3 bits are the same between calls to nanotime, but becomes a
problem on boundaries when those 3 bits change. These cases may cause
hiccups in the GC CPU limiter by not accounting for some source of CPU
time correctly, but with 61 bits of resolution this should be extremely
rare. The rate of update is on the order of milliseconds, so at worst
the runtime will be off of any given measurement by only a few
CPU-milliseconds (and this is directly bounded by the rate of update).
We're probably more inaccurate from the fact that we don't measure real
CPU time but only approximate it.
For #52890.
Change-Id: I347f30ac9e2ba6061806c21dfe0193ef2ab3bbe9
Reviewed-on: https://go-review.googlesource.com/c/go/+/410120
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-06-02 19:06:27 +00:00
|
|
|
l.limiter.test = true
|
runtime: add GC CPU utilization limiter
This change adds a GC CPU utilization limiter to the GC. It disables
assists to ensure GC CPU utilization remains under 50%. It uses a leaky
bucket mechanism that will only fill if GC CPU utilization exceeds 50%.
Once the bucket begins to overflow, GC assists are limited until the
bucket empties, at the risk of GC overshoot. The limiter is primarily
updated by assists. The scheduler may also update it, but only if the
GC is on and a few milliseconds have passed since the last update. This
second case exists to ensure that if the limiter is on, and no assists
are happening, we're still updating the limiter regularly.
The purpose of this limiter is to mitigate GC death spirals, opting to
use more memory instead.
This change turns the limiter on always. In practice, 50% overall GC CPU
utilization is very difficult to hit unless you're trying; even the most
allocation-heavy applications with complex heaps still need to do
something with that memory. Note that small GOGC values (i.e.
single-digit, or low teens) are more likely to trigger the limiter,
which means the GOGC tradeoff may no longer be respected. Even so, it
should still be relatively rare.
This change also introduces the feature flag for code to support the
memory limit feature.
For #48409.
Change-Id: Ia30f914e683e491a00900fd27868446c65e5d3c2
Reviewed-on: https://go-review.googlesource.com/c/go/+/353989
Reviewed-by: Michael Pratt <mpratt@google.com>
2021-10-01 22:52:12 -04:00
|
|
|
l.limiter.resetCapacity(now, gomaxprocs)
|
|
|
|
|
return l
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (l *GCCPULimiter) Fill() uint64 {
|
|
|
|
|
return l.limiter.bucket.fill
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (l *GCCPULimiter) Capacity() uint64 {
|
|
|
|
|
return l.limiter.bucket.capacity
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (l *GCCPULimiter) Overflow() uint64 {
|
|
|
|
|
return l.limiter.overflow
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (l *GCCPULimiter) Limiting() bool {
|
|
|
|
|
return l.limiter.limiting()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (l *GCCPULimiter) NeedUpdate(now int64) bool {
|
|
|
|
|
return l.limiter.needUpdate(now)
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-06 20:17:52 +00:00
|
|
|
func (l *GCCPULimiter) StartGCTransition(enableGC bool, now int64) {
|
|
|
|
|
l.limiter.startGCTransition(enableGC, now)
|
runtime: add GC CPU utilization limiter
This change adds a GC CPU utilization limiter to the GC. It disables
assists to ensure GC CPU utilization remains under 50%. It uses a leaky
bucket mechanism that will only fill if GC CPU utilization exceeds 50%.
Once the bucket begins to overflow, GC assists are limited until the
bucket empties, at the risk of GC overshoot. The limiter is primarily
updated by assists. The scheduler may also update it, but only if the
GC is on and a few milliseconds have passed since the last update. This
second case exists to ensure that if the limiter is on, and no assists
are happening, we're still updating the limiter regularly.
The purpose of this limiter is to mitigate GC death spirals, opting to
use more memory instead.
This change turns the limiter on always. In practice, 50% overall GC CPU
utilization is very difficult to hit unless you're trying; even the most
allocation-heavy applications with complex heaps still need to do
something with that memory. Note that small GOGC values (i.e.
single-digit, or low teens) are more likely to trigger the limiter,
which means the GOGC tradeoff may no longer be respected. Even so, it
should still be relatively rare.
This change also introduces the feature flag for code to support the
memory limit feature.
For #48409.
Change-Id: Ia30f914e683e491a00900fd27868446c65e5d3c2
Reviewed-on: https://go-review.googlesource.com/c/go/+/353989
Reviewed-by: Michael Pratt <mpratt@google.com>
2021-10-01 22:52:12 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (l *GCCPULimiter) FinishGCTransition(now int64) {
|
|
|
|
|
l.limiter.finishGCTransition(now)
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-06 20:17:52 +00:00
|
|
|
func (l *GCCPULimiter) Update(now int64) {
|
|
|
|
|
l.limiter.update(now)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (l *GCCPULimiter) AddAssistTime(t int64) {
|
|
|
|
|
l.limiter.addAssistTime(t)
|
runtime: add GC CPU utilization limiter
This change adds a GC CPU utilization limiter to the GC. It disables
assists to ensure GC CPU utilization remains under 50%. It uses a leaky
bucket mechanism that will only fill if GC CPU utilization exceeds 50%.
Once the bucket begins to overflow, GC assists are limited until the
bucket empties, at the risk of GC overshoot. The limiter is primarily
updated by assists. The scheduler may also update it, but only if the
GC is on and a few milliseconds have passed since the last update. This
second case exists to ensure that if the limiter is on, and no assists
are happening, we're still updating the limiter regularly.
The purpose of this limiter is to mitigate GC death spirals, opting to
use more memory instead.
This change turns the limiter on always. In practice, 50% overall GC CPU
utilization is very difficult to hit unless you're trying; even the most
allocation-heavy applications with complex heaps still need to do
something with that memory. Note that small GOGC values (i.e.
single-digit, or low teens) are more likely to trigger the limiter,
which means the GOGC tradeoff may no longer be respected. Even so, it
should still be relatively rare.
This change also introduces the feature flag for code to support the
memory limit feature.
For #48409.
Change-Id: Ia30f914e683e491a00900fd27868446c65e5d3c2
Reviewed-on: https://go-review.googlesource.com/c/go/+/353989
Reviewed-by: Michael Pratt <mpratt@google.com>
2021-10-01 22:52:12 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (l *GCCPULimiter) ResetCapacity(now int64, nprocs int32) {
|
|
|
|
|
l.limiter.resetCapacity(now, nprocs)
|
|
|
|
|
}
|
|
|
|
|
|
2022-02-10 00:49:44 +00:00
|
|
|
const ScavengePercent = scavengePercent
|
|
|
|
|
|
|
|
|
|
type Scavenger struct {
|
|
|
|
|
Sleep func(int64) int64
|
|
|
|
|
Scavenge func(uintptr) (uintptr, int64)
|
|
|
|
|
ShouldStop func() bool
|
|
|
|
|
GoMaxProcs func() int32
|
|
|
|
|
|
|
|
|
|
released atomic.Uintptr
|
|
|
|
|
scavenger scavengerState
|
|
|
|
|
stop chan<- struct{}
|
|
|
|
|
done <-chan struct{}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *Scavenger) Start() {
|
|
|
|
|
if s.Sleep == nil || s.Scavenge == nil || s.ShouldStop == nil || s.GoMaxProcs == nil {
|
|
|
|
|
panic("must populate all stubs")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Install hooks.
|
|
|
|
|
s.scavenger.sleepStub = s.Sleep
|
|
|
|
|
s.scavenger.scavenge = s.Scavenge
|
|
|
|
|
s.scavenger.shouldStop = s.ShouldStop
|
|
|
|
|
s.scavenger.gomaxprocs = s.GoMaxProcs
|
|
|
|
|
|
|
|
|
|
// Start up scavenger goroutine, and wait for it to be ready.
|
|
|
|
|
stop := make(chan struct{})
|
|
|
|
|
s.stop = stop
|
|
|
|
|
done := make(chan struct{})
|
|
|
|
|
s.done = done
|
|
|
|
|
go func() {
|
|
|
|
|
// This should match bgscavenge, loosely.
|
|
|
|
|
s.scavenger.init()
|
|
|
|
|
s.scavenger.park()
|
|
|
|
|
for {
|
|
|
|
|
select {
|
|
|
|
|
case <-stop:
|
|
|
|
|
close(done)
|
|
|
|
|
return
|
|
|
|
|
default:
|
|
|
|
|
}
|
|
|
|
|
released, workTime := s.scavenger.run()
|
|
|
|
|
if released == 0 {
|
|
|
|
|
s.scavenger.park()
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
s.released.Add(released)
|
|
|
|
|
s.scavenger.sleep(workTime)
|
|
|
|
|
}
|
|
|
|
|
}()
|
|
|
|
|
if !s.BlockUntilParked(1e9 /* 1 second */) {
|
|
|
|
|
panic("timed out waiting for scavenger to get ready")
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// BlockUntilParked blocks until the scavenger parks, or until
|
|
|
|
|
// timeout is exceeded. Returns true if the scavenger parked.
|
|
|
|
|
//
|
|
|
|
|
// Note that in testing, parked means something slightly different.
|
|
|
|
|
// In anger, the scavenger parks to sleep, too, but in testing,
|
|
|
|
|
// it only parks when it actually has no work to do.
|
|
|
|
|
func (s *Scavenger) BlockUntilParked(timeout int64) bool {
|
|
|
|
|
// Just spin, waiting for it to park.
|
|
|
|
|
//
|
|
|
|
|
// The actual parking process is racy with respect to
|
|
|
|
|
// wakeups, which is fine, but for testing we need something
|
|
|
|
|
// a bit more robust.
|
|
|
|
|
start := nanotime()
|
|
|
|
|
for nanotime()-start < timeout {
|
|
|
|
|
lock(&s.scavenger.lock)
|
|
|
|
|
parked := s.scavenger.parked
|
|
|
|
|
unlock(&s.scavenger.lock)
|
|
|
|
|
if parked {
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
Gosched()
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Released returns how many bytes the scavenger released.
|
|
|
|
|
func (s *Scavenger) Released() uintptr {
|
|
|
|
|
return s.released.Load()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Wake wakes up a parked scavenger to keep running.
|
|
|
|
|
func (s *Scavenger) Wake() {
|
|
|
|
|
s.scavenger.wake()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Stop cleans up the scavenger's resources. The scavenger
|
|
|
|
|
// must be parked for this to work.
|
|
|
|
|
func (s *Scavenger) Stop() {
|
|
|
|
|
lock(&s.scavenger.lock)
|
|
|
|
|
parked := s.scavenger.parked
|
|
|
|
|
unlock(&s.scavenger.lock)
|
|
|
|
|
if !parked {
|
|
|
|
|
panic("tried to clean up scavenger that is not parked")
|
|
|
|
|
}
|
|
|
|
|
close(s.stop)
|
|
|
|
|
s.Wake()
|
|
|
|
|
<-s.done
|
|
|
|
|
}
|
runtime: redesign scavenging algorithm
Currently the runtime's scavenging algorithm involves running from the
top of the heap address space to the bottom (or as far as it gets) once
per GC cycle. Once it treads some ground, it doesn't tread it again
until the next GC cycle.
This works just fine for the background scavenger, for heap-growth
scavenging, and for debug.FreeOSMemory. However, it breaks down in the
face of a memory limit for small heaps in the tens of MiB. Basically,
because the scavenger never retreads old ground, it's completely
oblivious to new memory it could scavenge, and that it really *should*
in the face of a memory limit.
Also, every time some thread goes to scavenge in the runtime, it
reserves what could be a considerable amount of address space, hiding it
from other scavengers.
This change modifies and simplifies the implementation overall. It's
less code with complexities that are much better encapsulated. The
current implementation iterates optimistically over the address space
looking for memory to scavenge, keeping track of what it last saw. The
new implementation does the same, but instead of directly iterating over
pages, it iterates over chunks. It maintains an index of chunks (as a
bitmap over the address space) that indicate which chunks may contain
scavenge work. The page allocator populates this index, while scavengers
consume it and iterate over it optimistically.
This has a two key benefits:
1. Scavenging is much simpler: find a candidate chunk, and check it,
essentially just using the scavengeOne fast path. There's no need for
the complexity of iterating beyond one chunk, because the index is
lock-free and already maintains that information.
2. If pages are freed to the page allocator (always guaranteed to be
unscavenged), the page allocator immediately notifies all scavengers
of the new source of work, avoiding the hiding issues of the old
implementation.
One downside of the new implementation, however, is that it's
potentially more expensive to find pages to scavenge. In the past, if
a single page would become free high up in the address space, the
runtime's scavengers would ignore it. Now that scavengers won't, one or
more scavengers may need to iterate potentially across the whole heap to
find the next source of work. For the background scavenger, this just
means a potentially less reactive scavenger -- overall it should still
use the same amount of CPU. It means worse overheads for memory limit
scavenging, but that's not exactly something with a baseline yet.
In practice, this shouldn't be too bad, hopefully since the chunk index
is extremely compact. For a 48-bit address space, the index is only 8
MiB in size at worst, but even just one physical page in the index is
able to support up to 128 GiB heaps, provided they aren't terribly
sparse. On 32-bit platforms, the index is only 128 bytes in size.
For #48409.
Change-Id: I72b7e74365046b18c64a6417224c5d85511194fb
Reviewed-on: https://go-review.googlesource.com/c/go/+/399474
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-04-10 20:34:17 +00:00
|
|
|
|
|
|
|
|
type ScavengeIndex struct {
|
|
|
|
|
i scavengeIndex
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func NewScavengeIndex(min, max ChunkIdx) *ScavengeIndex {
|
|
|
|
|
s := new(ScavengeIndex)
|
runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not
using heuristics based on density.
The motivation behind this change is twofold:
1. In default Linux configurations, khugepaged can recoalesce hugepages
even after the scavenger breaks them up, resulting in significant
overheads for small heaps when their heaps shrink.
2. The Go runtime already has some heuristics about this, but those
heuristics appear to have bit-rotted and result in haphazard
hugepage management. Unlucky (but otherwise fairly dense) regions of
memory end up not backed by huge pages while sparse regions end up
accidentally marked MADV_HUGEPAGE and are not later broken up by the
scavenger, because it already got the memory it needed from more
dense sections (this is more likely to happen with small heaps that
go idle).
In this change, the runtime uses a new policy:
1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.
This policy is intended to try and back memory that is a good candidate
for huge pages (high occupancy) with huge pages, and give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also occupancy in the near future.
It's generally true that by the end of a GC cycle the heap gets quite
dense (from the perspective of the page allocator).
Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging in order to
break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger avoids
releasing memory in dense page chunks. All this together means the
scavenger will now more generally release memory on a ~1 GC cycle delay.
Notably this has implications for scavenging to maintain the memory
limit and the runtime/debug.FreeOSMemory API. This change makes it so
that in these cases all memory is visible to the scavenger regardless of
sparseness and delays the page allocator in re-marking this memory with
MADV_NOHUGEPAGE for around 1 GC cycle to mitigate churn.
The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.
Fixes #55328.
Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-09-23 16:32:34 +00:00
|
|
|
// This is a bit lazy but we easily guarantee we'll be able
|
|
|
|
|
// to reference all the relevant chunks. The worst-case
|
|
|
|
|
// memory usage here is 512 MiB, but tests generally use
|
|
|
|
|
// small offsets from BaseChunkIdx, which results in ~100s
|
|
|
|
|
// of KiB in memory use.
|
|
|
|
|
//
|
|
|
|
|
// This may still be worth making better, at least by sharing
|
|
|
|
|
// this fairly large array across calls with a sync.Pool or
|
|
|
|
|
// something. Currently, when the tests are run serially,
|
|
|
|
|
// it takes around 0.5s. Not all that much, but if we have
|
|
|
|
|
// a lot of tests like this it could add up.
|
|
|
|
|
s.i.chunks = make([]atomicScavChunkData, max)
|
|
|
|
|
s.i.min.Store(uintptr(min))
|
|
|
|
|
s.i.max.Store(uintptr(max))
|
2023-04-20 02:41:08 +00:00
|
|
|
s.i.minHeapIdx.Store(uintptr(min))
|
runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not
using heuristics based on density.
The motivation behind this change is twofold:
1. In default Linux configurations, khugepaged can recoalesce hugepages
even after the scavenger breaks them up, resulting in significant
overheads for small heaps when their heaps shrink.
2. The Go runtime already has some heuristics about this, but those
heuristics appear to have bit-rotted and result in haphazard
hugepage management. Unlucky (but otherwise fairly dense) regions of
memory end up not backed by huge pages while sparse regions end up
accidentally marked MADV_HUGEPAGE and are not later broken up by the
scavenger, because it already got the memory it needed from more
dense sections (this is more likely to happen with small heaps that
go idle).
In this change, the runtime uses a new policy:
1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.
This policy is intended to try and back memory that is a good candidate
for huge pages (high occupancy) with huge pages, and give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also occupancy in the near future.
It's generally true that by the end of a GC cycle the heap gets quite
dense (from the perspective of the page allocator).
Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging in order to
break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger avoids
releasing memory in dense page chunks. All this together means the
scavenger will now more generally release memory on a ~1 GC cycle delay.
Notably this has implications for scavenging to maintain the memory
limit and the runtime/debug.FreeOSMemory API. This change makes it so
that in these cases all memory is visible to the scavenger regardless of
sparseness and delays the page allocator in re-marking this memory with
MADV_NOHUGEPAGE for around 1 GC cycle to mitigate churn.
The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.
Fixes #55328.
Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-09-23 16:32:34 +00:00
|
|
|
s.i.test = true
|
runtime: redesign scavenging algorithm
Currently the runtime's scavenging algorithm involves running from the
top of the heap address space to the bottom (or as far as it gets) once
per GC cycle. Once it treads some ground, it doesn't tread it again
until the next GC cycle.
This works just fine for the background scavenger, for heap-growth
scavenging, and for debug.FreeOSMemory. However, it breaks down in the
face of a memory limit for small heaps in the tens of MiB. Basically,
because the scavenger never retreads old ground, it's completely
oblivious to new memory it could scavenge, and that it really *should*
in the face of a memory limit.
Also, every time some thread goes to scavenge in the runtime, it
reserves what could be a considerable amount of address space, hiding it
from other scavengers.
This change modifies and simplifies the implementation overall. It's
less code with complexities that are much better encapsulated. The
current implementation iterates optimistically over the address space
looking for memory to scavenge, keeping track of what it last saw. The
new implementation does the same, but instead of directly iterating over
pages, it iterates over chunks. It maintains an index of chunks (as a
bitmap over the address space) that indicate which chunks may contain
scavenge work. The page allocator populates this index, while scavengers
consume it and iterate over it optimistically.
This has a two key benefits:
1. Scavenging is much simpler: find a candidate chunk, and check it,
essentially just using the scavengeOne fast path. There's no need for
the complexity of iterating beyond one chunk, because the index is
lock-free and already maintains that information.
2. If pages are freed to the page allocator (always guaranteed to be
unscavenged), the page allocator immediately notifies all scavengers
of the new source of work, avoiding the hiding issues of the old
implementation.
One downside of the new implementation, however, is that it's
potentially more expensive to find pages to scavenge. In the past, if
a single page would become free high up in the address space, the
runtime's scavengers would ignore it. Now that scavengers won't, one or
more scavengers may need to iterate potentially across the whole heap to
find the next source of work. For the background scavenger, this just
means a potentially less reactive scavenger -- overall it should still
use the same amount of CPU. It means worse overheads for memory limit
scavenging, but that's not exactly something with a baseline yet.
In practice, this shouldn't be too bad, hopefully since the chunk index
is extremely compact. For a 48-bit address space, the index is only 8
MiB in size at worst, but even just one physical page in the index is
able to support up to 128 GiB heaps, provided they aren't terribly
sparse. On 32-bit platforms, the index is only 128 bytes in size.
For #48409.
Change-Id: I72b7e74365046b18c64a6417224c5d85511194fb
Reviewed-on: https://go-review.googlesource.com/c/go/+/399474
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-04-10 20:34:17 +00:00
|
|
|
return s
|
|
|
|
|
}
|
|
|
|
|
|
runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not
using heuristics based on density.
The motivation behind this change is twofold:
1. In default Linux configurations, khugepaged can recoalesce hugepages
even after the scavenger breaks them up, resulting in significant
overheads for small heaps when their heaps shrink.
2. The Go runtime already has some heuristics about this, but those
heuristics appear to have bit-rotted and result in haphazard
hugepage management. Unlucky (but otherwise fairly dense) regions of
memory end up not backed by huge pages while sparse regions end up
accidentally marked MADV_HUGEPAGE and are not later broken up by the
scavenger, because it already got the memory it needed from more
dense sections (this is more likely to happen with small heaps that
go idle).
In this change, the runtime uses a new policy:
1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.
This policy is intended to try and back memory that is a good candidate
for huge pages (high occupancy) with huge pages, and give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also occupancy in the near future.
It's generally true that by the end of a GC cycle the heap gets quite
dense (from the perspective of the page allocator).
Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging in order to
break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger avoids
releasing memory in dense page chunks. All this together means the
scavenger will now more generally release memory on a ~1 GC cycle delay.
Notably this has implications for scavenging to maintain the memory
limit and the runtime/debug.FreeOSMemory API. This change makes it so
that in these cases all memory is visible to the scavenger regardless of
sparseness and delays the page allocator in re-marking this memory with
MADV_NOHUGEPAGE for around 1 GC cycle to mitigate churn.
The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.
Fixes #55328.
Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-09-23 16:32:34 +00:00
|
|
|
func (s *ScavengeIndex) Find(force bool) (ChunkIdx, uint) {
|
|
|
|
|
ci, off := s.i.find(force)
|
runtime: redesign scavenging algorithm
Currently the runtime's scavenging algorithm involves running from the
top of the heap address space to the bottom (or as far as it gets) once
per GC cycle. Once it treads some ground, it doesn't tread it again
until the next GC cycle.
This works just fine for the background scavenger, for heap-growth
scavenging, and for debug.FreeOSMemory. However, it breaks down in the
face of a memory limit for small heaps in the tens of MiB. Basically,
because the scavenger never retreads old ground, it's completely
oblivious to new memory it could scavenge, and that it really *should*
in the face of a memory limit.
Also, every time some thread goes to scavenge in the runtime, it
reserves what could be a considerable amount of address space, hiding it
from other scavengers.
This change modifies and simplifies the implementation overall. It's
less code with complexities that are much better encapsulated. The
current implementation iterates optimistically over the address space
looking for memory to scavenge, keeping track of what it last saw. The
new implementation does the same, but instead of directly iterating over
pages, it iterates over chunks. It maintains an index of chunks (as a
bitmap over the address space) that indicate which chunks may contain
scavenge work. The page allocator populates this index, while scavengers
consume it and iterate over it optimistically.
This has a two key benefits:
1. Scavenging is much simpler: find a candidate chunk, and check it,
essentially just using the scavengeOne fast path. There's no need for
the complexity of iterating beyond one chunk, because the index is
lock-free and already maintains that information.
2. If pages are freed to the page allocator (always guaranteed to be
unscavenged), the page allocator immediately notifies all scavengers
of the new source of work, avoiding the hiding issues of the old
implementation.
One downside of the new implementation, however, is that it's
potentially more expensive to find pages to scavenge. In the past, if
a single page would become free high up in the address space, the
runtime's scavengers would ignore it. Now that scavengers won't, one or
more scavengers may need to iterate potentially across the whole heap to
find the next source of work. For the background scavenger, this just
means a potentially less reactive scavenger -- overall it should still
use the same amount of CPU. It means worse overheads for memory limit
scavenging, but that's not exactly something with a baseline yet.
In practice, this shouldn't be too bad, hopefully since the chunk index
is extremely compact. For a 48-bit address space, the index is only 8
MiB in size at worst, but even just one physical page in the index is
able to support up to 128 GiB heaps, provided they aren't terribly
sparse. On 32-bit platforms, the index is only 128 bytes in size.
For #48409.
Change-Id: I72b7e74365046b18c64a6417224c5d85511194fb
Reviewed-on: https://go-review.googlesource.com/c/go/+/399474
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-04-10 20:34:17 +00:00
|
|
|
return ChunkIdx(ci), off
|
|
|
|
|
}
|
|
|
|
|
|
runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not
using heuristics based on density.
The motivation behind this change is twofold:
1. In default Linux configurations, khugepaged can recoalesce hugepages
even after the scavenger breaks them up, resulting in significant
overheads for small heaps when their heaps shrink.
2. The Go runtime already has some heuristics about this, but those
heuristics appear to have bit-rotted and result in haphazard
hugepage management. Unlucky (but otherwise fairly dense) regions of
memory end up not backed by huge pages while sparse regions end up
accidentally marked MADV_HUGEPAGE and are not later broken up by the
scavenger, because it already got the memory it needed from more
dense sections (this is more likely to happen with small heaps that
go idle).
In this change, the runtime uses a new policy:
1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.
This policy is intended to try and back memory that is a good candidate
for huge pages (high occupancy) with huge pages, and give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also occupancy in the near future.
It's generally true that by the end of a GC cycle the heap gets quite
dense (from the perspective of the page allocator).
Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging in order to
break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger avoids
releasing memory in dense page chunks. All this together means the
scavenger will now more generally release memory on a ~1 GC cycle delay.
Notably this has implications for scavenging to maintain the memory
limit and the runtime/debug.FreeOSMemory API. This change makes it so
that in these cases all memory is visible to the scavenger regardless of
sparseness and delays the page allocator in re-marking this memory with
MADV_NOHUGEPAGE for around 1 GC cycle to mitigate churn.
The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.
Fixes #55328.
Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-09-23 16:32:34 +00:00
|
|
|
func (s *ScavengeIndex) AllocRange(base, limit uintptr) {
|
|
|
|
|
sc, ec := chunkIndex(base), chunkIndex(limit-1)
|
|
|
|
|
si, ei := chunkPageIndex(base), chunkPageIndex(limit-1)
|
|
|
|
|
|
|
|
|
|
if sc == ec {
|
|
|
|
|
// The range doesn't cross any chunk boundaries.
|
|
|
|
|
s.i.alloc(sc, ei+1-si)
|
|
|
|
|
} else {
|
|
|
|
|
// The range crosses at least one chunk boundary.
|
|
|
|
|
s.i.alloc(sc, pallocChunkPages-si)
|
|
|
|
|
for c := sc + 1; c < ec; c++ {
|
|
|
|
|
s.i.alloc(c, pallocChunkPages)
|
|
|
|
|
}
|
|
|
|
|
s.i.alloc(ec, ei+1)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *ScavengeIndex) FreeRange(base, limit uintptr) {
|
|
|
|
|
sc, ec := chunkIndex(base), chunkIndex(limit-1)
|
|
|
|
|
si, ei := chunkPageIndex(base), chunkPageIndex(limit-1)
|
|
|
|
|
|
|
|
|
|
if sc == ec {
|
|
|
|
|
// The range doesn't cross any chunk boundaries.
|
|
|
|
|
s.i.free(sc, si, ei+1-si)
|
|
|
|
|
} else {
|
|
|
|
|
// The range crosses at least one chunk boundary.
|
|
|
|
|
s.i.free(sc, si, pallocChunkPages-si)
|
|
|
|
|
for c := sc + 1; c < ec; c++ {
|
|
|
|
|
s.i.free(c, 0, pallocChunkPages)
|
|
|
|
|
}
|
|
|
|
|
s.i.free(ec, 0, ei+1)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *ScavengeIndex) ResetSearchAddrs() {
|
|
|
|
|
for _, a := range []*atomicOffAddr{&s.i.searchAddrBg, &s.i.searchAddrForce} {
|
|
|
|
|
addr, marked := a.Load()
|
|
|
|
|
if marked {
|
|
|
|
|
a.StoreUnmark(addr, addr)
|
|
|
|
|
}
|
|
|
|
|
a.Clear()
|
|
|
|
|
}
|
|
|
|
|
s.i.freeHWM = minOffAddr
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *ScavengeIndex) NextGen() {
|
|
|
|
|
s.i.nextGen()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *ScavengeIndex) SetEmpty(ci ChunkIdx) {
|
|
|
|
|
s.i.setEmpty(chunkIdx(ci))
|
runtime: redesign scavenging algorithm
Currently the runtime's scavenging algorithm involves running from the
top of the heap address space to the bottom (or as far as it gets) once
per GC cycle. Once it treads some ground, it doesn't tread it again
until the next GC cycle.
This works just fine for the background scavenger, for heap-growth
scavenging, and for debug.FreeOSMemory. However, it breaks down in the
face of a memory limit for small heaps in the tens of MiB. Basically,
because the scavenger never retreads old ground, it's completely
oblivious to new memory it could scavenge, and that it really *should*
in the face of a memory limit.
Also, every time some thread goes to scavenge in the runtime, it
reserves what could be a considerable amount of address space, hiding it
from other scavengers.
This change modifies and simplifies the implementation overall. It's
less code with complexities that are much better encapsulated. The
current implementation iterates optimistically over the address space
looking for memory to scavenge, keeping track of what it last saw. The
new implementation does the same, but instead of directly iterating over
pages, it iterates over chunks. It maintains an index of chunks (as a
bitmap over the address space) that indicate which chunks may contain
scavenge work. The page allocator populates this index, while scavengers
consume it and iterate over it optimistically.
This has a two key benefits:
1. Scavenging is much simpler: find a candidate chunk, and check it,
essentially just using the scavengeOne fast path. There's no need for
the complexity of iterating beyond one chunk, because the index is
lock-free and already maintains that information.
2. If pages are freed to the page allocator (always guaranteed to be
unscavenged), the page allocator immediately notifies all scavengers
of the new source of work, avoiding the hiding issues of the old
implementation.
One downside of the new implementation, however, is that it's
potentially more expensive to find pages to scavenge. In the past, if
a single page would become free high up in the address space, the
runtime's scavengers would ignore it. Now that scavengers won't, one or
more scavengers may need to iterate potentially across the whole heap to
find the next source of work. For the background scavenger, this just
means a potentially less reactive scavenger -- overall it should still
use the same amount of CPU. It means worse overheads for memory limit
scavenging, but that's not exactly something with a baseline yet.
In practice, this shouldn't be too bad, hopefully since the chunk index
is extremely compact. For a 48-bit address space, the index is only 8
MiB in size at worst, but even just one physical page in the index is
able to support up to 128 GiB heaps, provided they aren't terribly
sparse. On 32-bit platforms, the index is only 128 bytes in size.
For #48409.
Change-Id: I72b7e74365046b18c64a6417224c5d85511194fb
Reviewed-on: https://go-review.googlesource.com/c/go/+/399474
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-04-10 20:34:17 +00:00
|
|
|
}
|
|
|
|
|
|
runtime: manage huge pages explicitly
This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not
using heuristics based on density.
The motivation behind this change is twofold:
1. In default Linux configurations, khugepaged can recoalesce hugepages
even after the scavenger breaks them up, resulting in significant
overheads for small heaps when their heaps shrink.
2. The Go runtime already has some heuristics about this, but those
heuristics appear to have bit-rotted and result in haphazard
hugepage management. Unlucky (but otherwise fairly dense) regions of
memory end up not backed by huge pages while sparse regions end up
accidentally marked MADV_HUGEPAGE and are not later broken up by the
scavenger, because it already got the memory it needed from more
dense sections (this is more likely to happen with small heaps that
go idle).
In this change, the runtime uses a new policy:
1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
cycle. Mark those MADV_HUGEPAGE, and hide them from the scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.
This policy is intended to try and back memory that is a good candidate
for huge pages (high occupancy) with huge pages, and give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also occupancy in the near future.
It's generally true that by the end of a GC cycle the heap gets quite
dense (from the perspective of the page allocator).
Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging in order to
break up huge pages and keep them that way) and the cost of applying
MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the scavenger avoids
releasing memory in dense page chunks. All this together means the
scavenger will now more generally release memory on a ~1 GC cycle delay.
Notably this has implications for scavenging to maintain the memory
limit and the runtime/debug.FreeOSMemory API. This change makes it so
that in these cases all memory is visible to the scavenger regardless of
sparseness and delays the page allocator in re-marking this memory with
MADV_NOHUGEPAGE for around 1 GC cycle to mitigate churn.
The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state) but should allow the scavenger to more effectively take
back fragments of huge pages. The main risk here is churn, because
MADV_HUGEPAGE usually forces the kernel to immediately back memory with
a huge page. That's the reason for the large amount of hysteresis (1
full GC cycle) and why the definition of high density is 96% occupancy.
Fixes #55328.
Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630
Reviewed-on: https://go-review.googlesource.com/c/go/+/436395
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-09-23 16:32:34 +00:00
|
|
|
func (s *ScavengeIndex) SetNoHugePage(ci ChunkIdx) bool {
|
|
|
|
|
return s.i.setNoHugePage(chunkIdx(ci))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func CheckPackScavChunkData(gen uint32, inUse, lastInUse uint16, flags uint8) bool {
|
|
|
|
|
sc0 := scavChunkData{
|
|
|
|
|
gen: gen,
|
|
|
|
|
inUse: inUse,
|
|
|
|
|
lastInUse: lastInUse,
|
|
|
|
|
scavChunkFlags: scavChunkFlags(flags),
|
|
|
|
|
}
|
|
|
|
|
scp := sc0.pack()
|
|
|
|
|
sc1 := unpackScavChunkData(scp)
|
|
|
|
|
return sc0 == sc1
|
runtime: redesign scavenging algorithm
Currently the runtime's scavenging algorithm involves running from the
top of the heap address space to the bottom (or as far as it gets) once
per GC cycle. Once it treads some ground, it doesn't tread it again
until the next GC cycle.
This works just fine for the background scavenger, for heap-growth
scavenging, and for debug.FreeOSMemory. However, it breaks down in the
face of a memory limit for small heaps in the tens of MiB. Basically,
because the scavenger never retreads old ground, it's completely
oblivious to new memory it could scavenge, and that it really *should*
in the face of a memory limit.
Also, every time some thread goes to scavenge in the runtime, it
reserves what could be a considerable amount of address space, hiding it
from other scavengers.
This change modifies and simplifies the implementation overall. It's
less code with complexities that are much better encapsulated. The
current implementation iterates optimistically over the address space
looking for memory to scavenge, keeping track of what it last saw. The
new implementation does the same, but instead of directly iterating over
pages, it iterates over chunks. It maintains an index of chunks (as a
bitmap over the address space) that indicate which chunks may contain
scavenge work. The page allocator populates this index, while scavengers
consume it and iterate over it optimistically.
This has a two key benefits:
1. Scavenging is much simpler: find a candidate chunk, and check it,
essentially just using the scavengeOne fast path. There's no need for
the complexity of iterating beyond one chunk, because the index is
lock-free and already maintains that information.
2. If pages are freed to the page allocator (always guaranteed to be
unscavenged), the page allocator immediately notifies all scavengers
of the new source of work, avoiding the hiding issues of the old
implementation.
One downside of the new implementation, however, is that it's
potentially more expensive to find pages to scavenge. In the past, if
a single page would become free high up in the address space, the
runtime's scavengers would ignore it. Now that scavengers won't, one or
more scavengers may need to iterate potentially across the whole heap to
find the next source of work. For the background scavenger, this just
means a potentially less reactive scavenger -- overall it should still
use the same amount of CPU. It means worse overheads for memory limit
scavenging, but that's not exactly something with a baseline yet.
In practice, this shouldn't be too bad, hopefully since the chunk index
is extremely compact. For a 48-bit address space, the index is only 8
MiB in size at worst, but even just one physical page in the index is
able to support up to 128 GiB heaps, provided they aren't terribly
sparse. On 32-bit platforms, the index is only 128 bytes in size.
For #48409.
Change-Id: I72b7e74365046b18c64a6417224c5d85511194fb
Reviewed-on: https://go-review.googlesource.com/c/go/+/399474
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
2022-04-10 20:34:17 +00:00
|
|
|
}
|
2022-08-31 21:34:23 +00:00
|
|
|
|
|
|
|
|
const GTrackingPeriod = gTrackingPeriod
|
runtime: add safe arena support to the runtime
This change adds an API to the runtime for arenas. A later CL can
potentially export it as an experimental API, but for now, just the
runtime implementation will suffice.
The purpose of arenas is to improve efficiency, primarily by allowing
for an application to manually free memory, thereby delaying garbage
collection. It comes with other potential performance benefits, such as
better locality, a better allocation strategy, and better handling of
interior pointers by the GC.
This implementation is based on one by danscales@google.com with a few
significant differences:
* The implementation lives entirely in the runtime (all layers).
* Arena chunks are the minimum of 8 MiB or the heap arena size. This
choice is made because in practice 64 MiB appears to be way too large
of an area for most real-world use-cases.
* Arena chunks are not unmapped, instead they're placed on an evacuation
list and when there are no pointers left pointing into them, they're
allowed to be reused.
* Reusing partially-used arena chunks no longer tries to find one used
by the same P first; it just takes the first one available.
* In order to ensure worst-case fragmentation is never worse than 25%,
only types and slice backing stores whose sizes are 1/4th the size of
a chunk or less may be used. Previously larger sizes, up to the size
of the chunk, were allowed.
* ASAN, MSAN, and the race detector are fully supported.
* Sets arena chunks to fault that were deferred at the end of mark
termination (a non-public patch once did this; I don't see a reason
not to continue that).
For #51317.
Change-Id: I83b1693a17302554cb36b6daa4e9249a81b1644f
Reviewed-on: https://go-review.googlesource.com/c/go/+/423359
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
2022-08-12 21:40:46 +00:00
|
|
|
|
|
|
|
|
var ZeroBase = unsafe.Pointer(&zerobase)
|
|
|
|
|
|
|
|
|
|
const UserArenaChunkBytes = userArenaChunkBytes
|
|
|
|
|
|
|
|
|
|
type UserArena struct {
|
|
|
|
|
arena *userArena
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func NewUserArena() *UserArena {
|
|
|
|
|
return &UserArena{newUserArena()}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (a *UserArena) New(out *any) {
|
|
|
|
|
i := efaceOf(out)
|
|
|
|
|
typ := i._type
|
|
|
|
|
if typ.kind&kindMask != kindPtr {
|
|
|
|
|
panic("new result of non-ptr type")
|
|
|
|
|
}
|
|
|
|
|
typ = (*ptrtype)(unsafe.Pointer(typ)).elem
|
|
|
|
|
i.data = a.arena.new(typ)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (a *UserArena) Slice(sl any, cap int) {
|
|
|
|
|
a.arena.slice(sl, cap)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (a *UserArena) Free() {
|
|
|
|
|
a.arena.free()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func GlobalWaitingArenaChunks() int {
|
|
|
|
|
n := 0
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
lock(&mheap_.lock)
|
|
|
|
|
for s := mheap_.userArena.quarantineList.first; s != nil; s = s.next {
|
|
|
|
|
n++
|
|
|
|
|
}
|
|
|
|
|
unlock(&mheap_.lock)
|
|
|
|
|
})
|
|
|
|
|
return n
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-12 23:25:56 +00:00
|
|
|
func UserArenaClone[T any](s T) T {
|
|
|
|
|
return arena_heapify(s).(T)
|
|
|
|
|
}
|
|
|
|
|
|
runtime: add safe arena support to the runtime
This change adds an API to the runtime for arenas. A later CL can
potentially export it as an experimental API, but for now, just the
runtime implementation will suffice.
The purpose of arenas is to improve efficiency, primarily by allowing
for an application to manually free memory, thereby delaying garbage
collection. It comes with other potential performance benefits, such as
better locality, a better allocation strategy, and better handling of
interior pointers by the GC.
This implementation is based on one by danscales@google.com with a few
significant differences:
* The implementation lives entirely in the runtime (all layers).
* Arena chunks are the minimum of 8 MiB or the heap arena size. This
choice is made because in practice 64 MiB appears to be way too large
of an area for most real-world use-cases.
* Arena chunks are not unmapped, instead they're placed on an evacuation
list and when there are no pointers left pointing into them, they're
allowed to be reused.
* Reusing partially-used arena chunks no longer tries to find one used
by the same P first; it just takes the first one available.
* In order to ensure worst-case fragmentation is never worse than 25%,
only types and slice backing stores whose sizes are 1/4th the size of
a chunk or less may be used. Previously larger sizes, up to the size
of the chunk, were allowed.
* ASAN, MSAN, and the race detector are fully supported.
* Sets arena chunks to fault that were deferred at the end of mark
termination (a non-public patch once did this; I don't see a reason
not to continue that).
For #51317.
Change-Id: I83b1693a17302554cb36b6daa4e9249a81b1644f
Reviewed-on: https://go-review.googlesource.com/c/go/+/423359
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
2022-08-12 21:40:46 +00:00
|
|
|
var AlignUp = alignUp
|
|
|
|
|
|
|
|
|
|
// BlockUntilEmptyFinalizerQueue blocks until either the finalizer
|
|
|
|
|
// queue is emptied (and the finalizers have executed) or the timeout
|
|
|
|
|
// is reached. Returns true if the finalizer queue was emptied.
|
|
|
|
|
func BlockUntilEmptyFinalizerQueue(timeout int64) bool {
|
|
|
|
|
start := nanotime()
|
|
|
|
|
for nanotime()-start < timeout {
|
|
|
|
|
lock(&finlock)
|
|
|
|
|
// We know the queue has been drained when both finq is nil
|
|
|
|
|
// and the finalizer g has stopped executing.
|
|
|
|
|
empty := finq == nil
|
|
|
|
|
empty = empty && readgstatus(fing) == _Gwaiting && fing.waitreason == waitReasonFinalizerWait
|
|
|
|
|
unlock(&finlock)
|
|
|
|
|
if empty {
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
Gosched()
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
}
|
2022-09-07 13:23:19 -04:00
|
|
|
|
|
|
|
|
func FrameStartLine(f *Frame) int {
|
|
|
|
|
return f.startLine
|
|
|
|
|
}
|
2022-11-08 17:48:48 -08:00
|
|
|
|
|
|
|
|
// PersistentAlloc allocates some memory that lives outside the Go heap.
|
|
|
|
|
// This memory will never be freed; use sparingly.
|
|
|
|
|
func PersistentAlloc(n uintptr) unsafe.Pointer {
|
|
|
|
|
return persistentalloc(n, 0, &memstats.other_sys)
|
|
|
|
|
}
|
2023-03-13 09:56:45 +01:00
|
|
|
|
|
|
|
|
// FPCallers works like Callers and uses frame pointer unwinding to populate
|
|
|
|
|
// pcBuf with the return addresses of the physical frames on the stack.
|
|
|
|
|
func FPCallers(skip int, pcBuf []uintptr) int {
|
|
|
|
|
return fpTracebackPCs(unsafe.Pointer(getcallerfp()), skip, pcBuf)
|
|
|
|
|
}
|