go/src/runtime/export_test.go

1223 lines
30 KiB
Go
Raw Normal View History

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Export guts for testing.
package runtime
import (
"runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
)
// Exported aliases for unexported runtime functions and variables,
// so that tests can reach them. Declaration order is unchanged.
var (
	Fadd64   = fadd64
	Fsub64   = fsub64
	Fmul64   = fmul64
	Fdiv64   = fdiv64
	F64to32  = f64to32
	F32to64  = f32to64
	Fcmp64   = fcmp64
	Fintto64 = fintto64
	F64toint = f64toint

	Entersyscall         = entersyscall
	Exitsyscall          = exitsyscall
	LockedOSThread       = lockedOSThread
	Xadduintptr          = atomic.Xadduintptr
	FuncPC               = funcPC
	Fastlog2             = fastlog2
	Atoi                 = atoi
	Atoi32               = atoi32
	Nanotime             = nanotime
	NetpollBreak         = netpollBreak
	Usleep               = usleep
	PhysPageSize         = physPageSize
	PhysHugePageSize     = physHugePageSize
	NetpollGenericInit   = netpollGenericInit
	Memmove              = memmove
	MemclrNoHeapPointers = memclrNoHeapPointers
	LockPartialOrder     = lockPartialOrder
)
// LockRank is the exported test view of the runtime's lockRank type.
type LockRank lockRank

// String returns whatever the underlying lockRank's String method
// reports for this rank.
func (l LockRank) String() string {
	rank := lockRank(l)
	return rank.String()
}
// PreemptMSupported re-exports the internal preemptMSupported constant
// so tests can read it.
const PreemptMSupported = preemptMSupported
// LFNode is the exported node type used with LFStackPush and
// LFStackPop. Those functions reinterpret *LFNode as *lfnode (and
// back) through unsafe.Pointer, so the field layout here presumably
// has to mirror lfnode exactly — confirm against lfnode's definition
// before changing it.
type LFNode struct {
Next uint64
Pushcnt uintptr
}
// LFStackPush pushes node onto the lfstack whose head word is *head.
// node is reinterpreted as an *lfnode for the push.
func LFStackPush(head *uint64, node *LFNode) {
	stack := (*lfstack)(head)
	stack.push((*lfnode)(unsafe.Pointer(node)))
}
// LFStackPop pops from the lfstack whose head word is *head and
// returns the popped value reinterpreted as an *LFNode.
func LFStackPop(head *uint64) *LFNode {
	popped := (*lfstack)(head).pop()
	return (*LFNode)(unsafe.Pointer(popped))
}
func Netpoll(delta int64) {
systemstack(func() {
netpoll(delta)
})
}
func GCMask(x interface{}) (ret []byte) {
[dev.cc] runtime: delete scalararg, ptrarg; rename onM to systemstack Scalararg and ptrarg are not "signal safe". Go code filling them out can be interrupted by a signal, and then the signal handler runs, and if it also ends up in Go code that uses scalararg or ptrarg, now the old values have been smashed. For the pieces of code that do need to run in a signal handler, we introduced onM_signalok, which is really just onM except that the _signalok is meant to convey that the caller asserts that scalarg and ptrarg will be restored to their old values after the call (instead of the usual behavior, zeroing them). Scalararg and ptrarg are also untyped and therefore error-prone. Go code can always pass a closure instead of using scalararg and ptrarg; they were only really necessary for C code. And there's no more C code. For all these reasons, delete scalararg and ptrarg, converting the few remaining references to use closures. Once those are gone, there is no need for a distinction between onM and onM_signalok, so replace both with a single function equivalent to the current onM_signalok (that is, it can be called on any of the curg, g0, and gsignal stacks). The name onM and the phrase 'm stack' are misnomers, because on most system an M has two system stacks: the main thread stack and the signal handling stack. Correct the misnomer by naming the replacement function systemstack. Fix a few references to "M stack" in code. The main motivation for this change is to eliminate scalararg/ptrarg. Rick and I have already seen them cause problems because the calling sequence m.ptrarg[0] = p is a heap pointer assignment, so it gets a write barrier. The write barrier also uses onM, so it has all the same problems as if it were being invoked by a signal handler. We worked around this by saving and restoring the old values and by calling onM_signalok, but there's no point in keeping this nice home for bugs around any longer. 
This CL also changes funcline to return the file name as a result instead of filling in a passed-in *string. (The *string signature is left over from when the code was written in and called from C.) That's arguably an unrelated change, except that once I had done the ptrarg/scalararg/onM cleanup I started getting false positives about the *string argument escaping (not allowed in package runtime). The compiler is wrong, but the easiest fix is to write the code like Go code instead of like C code. I am a bit worried that the compiler is wrong because of some use of uninitialized memory in the escape analysis. If that's the reason, it will go away when we convert the compiler to Go. (And if not, we'll debug it the next time.) LGTM=khr R=r, khr CC=austin, golang-codereviews, iant, rlh https://golang.org/cl/174950043
2014-11-12 14:54:31 -05:00
systemstack(func() {
ret = getgcmask(x)
})
return
}
// RunSchedLocalQueueTest exercises runqput/runqget on a fresh p.
// For each i up to the run queue's capacity it checks that the queue
// starts empty, puts &gs[i] i times, gets it back i times, and checks
// that the queue is empty again. Any mismatch throws.
func RunSchedLocalQueueTest() {
	pp := new(p)
	gs := make([]g, len(pp.runq))
	for i := range gs {
		if gp, _ := runqget(pp); gp != nil {
			throw("runq is not empty initially")
		}

		want := &gs[i]
		for j := 0; j < i; j++ {
			runqput(pp, want, false)
		}
		for j := 0; j < i; j++ {
			got, _ := runqget(pp)
			if got != want {
				print("bad element at iter ", i, "/", j, "\n")
				throw("bad element")
			}
		}

		if gp, _ := runqget(pp); gp != nil {
			throw("runq is not empty afterwards")
		}
	}
}
// RunSchedLocalQueueStealTest exercises runqsteal between two fresh
// p's. For each i it queues i goroutines on the source, steals into
// the destination, drains both queues, and then checks that every
// queued g was seen exactly once and that the number stolen is i/2 or
// i/2+1. Any violation throws.
func RunSchedLocalQueueStealTest() {
	src := new(p)
	dst := new(p)
	gs := make([]g, len(src.runq))
	for i := 0; i < len(src.runq); i++ {
		// Queue the first i goroutines on src with a cleared counter.
		for j := 0; j < i; j++ {
			gs[j].sig = 0
			runqput(src, &gs[j], false)
		}

		// Count what lands on dst: the g returned by runqsteal plus
		// everything left in dst's queue.
		stolen := 0
		if gp := runqsteal(dst, src, true); gp != nil {
			stolen++
			gp.sig++
		}
		for gp, _ := runqget(dst); gp != nil; gp, _ = runqget(dst) {
			stolen++
			gp.sig++
		}

		// Whatever remains on src is marked as seen as well.
		for gp, _ := runqget(src); gp != nil; gp, _ = runqget(src) {
			gp.sig++
		}

		for j := 0; j < i; j++ {
			if gs[j].sig != 1 {
				print("bad element ", j, "(", gs[j].sig, ") at iter ", i, "\n")
				throw("bad element")
			}
		}
		if stolen != i/2 && stolen != i/2+1 {
			print("bad steal ", stolen, ", want ", i/2, " or ", i/2+1, ", iter ", i, "\n")
			throw("bad steal")
		}
	}
}
// RunSchedLocalQueueEmptyTest runs iters rounds in which one goroutine
// checks runqempty while this goroutine concurrently puts to and gets
// from the same run queue; it throws if the queue is ever reported
// empty by the checker.
func RunSchedLocalQueueEmptyTest(iters int) {
	// Test that runq is not spuriously reported as empty.
	// Runq emptiness affects scheduling decisions and spurious emptiness
	// can lead to underutilization (both runnable Gs and idle Ps coexist
	// for arbitrary long time).
	finished := make(chan bool, 1)
	pp := new(p)
	pair := make([]g, 2)
	arrived := new(uint32)
	for iter := 0; iter < iters; iter++ {
		*arrived = 0
		next0 := (iter & 1) == 0
		next1 := (iter & 2) == 0
		runqput(pp, &pair[0], next0)
		go func() {
			// Spin until both goroutines have checked in.
			for atomic.Xadd(arrived, 1); atomic.Load(arrived) != 2; {
			}
			if runqempty(pp) {
				println("next:", next0, next1)
				throw("queue is empty")
			}
			finished <- true
		}()
		// Spin until both goroutines have checked in.
		for atomic.Xadd(arrived, 1); atomic.Load(arrived) != 2; {
		}
		runqput(pp, &pair[1], next1)
		runqget(pp)
		<-finished
		runqget(pp)
	}
}
// Exported aliases for the runtime's hash functions, plus a pointer to
// the useAeshash variable so tests can read or modify it.
var (
	StringHash = stringHash
	BytesHash  = bytesHash
	Int32Hash  = int32Hash
	Int64Hash  = int64Hash
	MemHash    = memhash
	MemHash32  = memhash32
	MemHash64  = memhash64
	EfaceHash  = efaceHash
	IfaceHash  = ifaceHash

	UseAeshash = &useAeshash
)
// MemclrBytes clears the len(b) bytes of b by passing the slice's
// backing array pointer and length to memclrNoHeapPointers.
func MemclrBytes(b []byte) {
	hdr := (*slice)(unsafe.Pointer(&b))
	memclrNoHeapPointers(hdr.array, uintptr(hdr.len))
}
// HashLoad points at the runtime's hashLoad variable so tests can
// read or modify it.
var HashLoad = &hashLoad
// entry point for testing
func GostringW(w []uint16) (s string) {
[dev.cc] runtime: delete scalararg, ptrarg; rename onM to systemstack Scalararg and ptrarg are not "signal safe". Go code filling them out can be interrupted by a signal, and then the signal handler runs, and if it also ends up in Go code that uses scalararg or ptrarg, now the old values have been smashed. For the pieces of code that do need to run in a signal handler, we introduced onM_signalok, which is really just onM except that the _signalok is meant to convey that the caller asserts that scalarg and ptrarg will be restored to their old values after the call (instead of the usual behavior, zeroing them). Scalararg and ptrarg are also untyped and therefore error-prone. Go code can always pass a closure instead of using scalararg and ptrarg; they were only really necessary for C code. And there's no more C code. For all these reasons, delete scalararg and ptrarg, converting the few remaining references to use closures. Once those are gone, there is no need for a distinction between onM and onM_signalok, so replace both with a single function equivalent to the current onM_signalok (that is, it can be called on any of the curg, g0, and gsignal stacks). The name onM and the phrase 'm stack' are misnomers, because on most system an M has two system stacks: the main thread stack and the signal handling stack. Correct the misnomer by naming the replacement function systemstack. Fix a few references to "M stack" in code. The main motivation for this change is to eliminate scalararg/ptrarg. Rick and I have already seen them cause problems because the calling sequence m.ptrarg[0] = p is a heap pointer assignment, so it gets a write barrier. The write barrier also uses onM, so it has all the same problems as if it were being invoked by a signal handler. We worked around this by saving and restoring the old values and by calling onM_signalok, but there's no point in keeping this nice home for bugs around any longer. 
This CL also changes funcline to return the file name as a result instead of filling in a passed-in *string. (The *string signature is left over from when the code was written in and called from C.) That's arguably an unrelated change, except that once I had done the ptrarg/scalararg/onM cleanup I started getting false positives about the *string argument escaping (not allowed in package runtime). The compiler is wrong, but the easiest fix is to write the code like Go code instead of like C code. I am a bit worried that the compiler is wrong because of some use of uninitialized memory in the escape analysis. If that's the reason, it will go away when we convert the compiler to Go. (And if not, we'll debug it the next time.) LGTM=khr R=r, khr CC=austin, golang-codereviews, iant, rlh https://golang.org/cl/174950043
2014-11-12 14:54:31 -05:00
systemstack(func() {
s = gostringw(&w[0])
})
return
}
// Exported aliases for the runtime's low-level file helpers.
var (
	Open  = open
	Close = closefd
	Read  = read
	Write = write
)
// Envs returns the runtime's envs slice itself, not a copy.
func Envs() []string {
	return envs
}

// SetEnvs replaces the runtime's envs slice with e.
func SetEnvs(e []string) {
	envs = e
}
// BigEndian is initialized from sys.BigEndian.
var BigEndian = sys.BigEndian
// For benchmarking.
//
// BenchSetType calls heapBitsSetType n times on the system stack for
// the memory that x refers to. x is expected to hold a pointer or a
// slice: for a pointer the element type and its size are used, for a
// slice the element type and size*len. For any other kind the size
// stays 0 and the data pointer nil.
func BenchSetType(n int, x interface{}) {
	eface := *efaceOf(&x)
	typ := eface._type

	var (
		size uintptr
		ptr  unsafe.Pointer
	)
	switch typ.kind & kindMask {
	case kindPtr:
		typ = (*ptrtype)(unsafe.Pointer(typ)).elem
		size = typ.size
		ptr = eface.data
	case kindSlice:
		// Read the slice header that the interface's data word points at.
		hdr := *(*struct {
			ptr      unsafe.Pointer
			len, cap uintptr
		})(eface.data)
		typ = (*slicetype)(unsafe.Pointer(typ)).elem
		size = typ.size * hdr.len
		ptr = hdr.ptr
	}

	allocSize := roundupsize(size)
	systemstack(func() {
		for iter := 0; iter < n; iter++ {
			heapBitsSetType(uintptr(ptr), allocSize, size, typ)
		}
	})
}
// PtrSize re-exports sys.PtrSize.
const PtrSize = sys.PtrSize
// ForceGCPeriod points at the runtime's forcegcperiod variable so
// tests can read or modify it.
var ForceGCPeriod = &forcegcperiod
// SetTracebackEnv is like runtime/debug.SetTraceback, but it raises
// the "environment" traceback level, so later calls to
// debug.SetTraceback (e.g., from testing timeouts) can't lower it.
func SetTracebackEnv(level string) {
setTraceback(level)
// Order matters: traceback_cache is copied only after setTraceback
// has run.
// NOTE(review): presumably setTraceback stores the new level in
// traceback_cache — confirm in its definition.
traceback_env = traceback_cache
}
// Exported aliases for the runtime's unaligned-read helpers.
var (
	ReadUnaligned32 = readUnaligned32
	ReadUnaligned64 = readUnaligned64
)
2016-03-29 12:28:24 -04:00
func CountPagesInUse() (pagesInUse, counted uintptr) {
stopTheWorld("CountPagesInUse")
pagesInUse = uintptr(mheap_.pagesInUse)
for _, s := range mheap_.allspans {
runtime: atomically set span state and use as publication barrier When everything is working correctly, any pointer the garbage collector encounters can only point into a fully initialized heap span, since the span must have been initialized before that pointer could escape the heap allocator and become visible to the GC. However, in various cases, we try to be defensive against bad pointers. In findObject, this is just a sanity check: we never expect to find a bad pointer, but programming errors can lead to them. In spanOfHeap, we don't necessarily trust the pointer and we're trying to check if it really does point to the heap, though it should always point to something. Conservative scanning takes this to a new level, since it can only guess that a word may be a pointer and verify this. In all of these cases, we have a problem that the span lookup and check can race with span initialization, since the span becomes visible to lookups before it's fully initialized. Furthermore, we're about to start initializing the span without the heap lock held, which is going to introduce races where accesses were previously protected by the heap lock. To address this, this CL makes accesses to mspan.state atomic, and ensures that the span is fully initialized before setting the state to mSpanInUse. All loads are now atomic, and in any case where we don't trust the pointer, it first atomically loads the span state and checks that it's mSpanInUse, after which it will have synchronized with span initialization and can safely check the other span fields. For #10958, #24543, but a good fix in general. Change-Id: I518b7c63555b02064b98aa5f802c92b758fef853 Reviewed-on: https://go-review.googlesource.com/c/go/+/203286 Run-TryBot: Austin Clements <austin@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
2019-10-23 11:25:38 -04:00
if s.state.get() == mSpanInUse {
2016-03-29 12:28:24 -04:00
counted += s.npages
}
}
startTheWorld()
return
}
// Fastrand returns the result of the runtime's fastrand.
func Fastrand() uint32 {
	return fastrand()
}

// Fastrandn returns the result of the runtime's fastrandn(n).
func Fastrandn(n uint32) uint32 {
	return fastrandn(n)
}
// ProfBuf is the exported test view of the runtime's profBuf.
type ProfBuf profBuf

// NewProfBuf creates a buffer with newProfBuf(hdrsize, bufwords, tags)
// and returns it as a *ProfBuf.
func NewProfBuf(hdrsize, bufwords, tags int) *ProfBuf {
	buf := newProfBuf(hdrsize, bufwords, tags)
	return (*ProfBuf)(buf)
}

// Write forwards its arguments unchanged to the underlying profBuf's
// write method.
func (p *ProfBuf) Write(tag *unsafe.Pointer, now int64, hdr []uint64, stk []uintptr) {
	inner := (*profBuf)(p)
	inner.write(tag, now, hdr, stk)
}

// Read modes accepted by (*ProfBuf).Read.
const (
	ProfBufBlocking    = profBufBlocking
	ProfBufNonBlocking = profBufNonBlocking
)

// Read forwards mode to the underlying profBuf's read method and
// returns its three results unchanged.
func (p *ProfBuf) Read(mode profBufReadMode) ([]uint64, []unsafe.Pointer, bool) {
	inner := (*profBuf)(p)
	return inner.read(mode)
}

// Close calls the underlying profBuf's close method.
func (p *ProfBuf) Close() {
	inner := (*profBuf)(p)
	inner.close()
}
// ReadMetricsSlow stops the world, reads memory statistics into
// memStats via readmemstats_m on the system stack, then reads metrics
// via readMetrics(samplesp, len, cap), and finally restarts the world.
// The metrics are initialized up front, under metricsSema, before
// either read happens.
func ReadMetricsSlow(memStats *MemStats, samplesp unsafe.Pointer, len, cap int) {
stopTheWorld("ReadMetricsSlow")
// Initialize the metrics beforehand because this could
// allocate and skew the stats.
semacquire(&metricsSema)
initMetrics()
semrelease(&metricsSema)
systemstack(func() {
// Read memstats first. It's going to flush
// the mcaches which readMetrics does not do, so
// going the other way around may result in
// inconsistent statistics.
readmemstats_m(memStats)
})
// Read metrics off the system stack.
//
// The only part of readMetrics that could allocate
// and skew the stats is initMetrics.
readMetrics(samplesp, len, cap)
startTheWorld()
}
// ReadMemStatsSlow returns both the runtime-computed MemStats and
// MemStats accumulated by scanning the heap.
func ReadMemStatsSlow() (base, slow MemStats) {
stopTheWorld("ReadMemStatsSlow")
// Run on the system stack to avoid stack growth allocation.
systemstack(func() {
// Make sure stats don't change.
getg().m.mallocing++
readmemstats_m(&base)
// Initialize slow from base and zero the fields we're
// recomputing.
slow = base
slow.Alloc = 0
slow.TotalAlloc = 0
slow.Mallocs = 0
slow.Frees = 0
slow.HeapReleased = 0
var bySize [_NumSizeClasses]struct {
Mallocs, Frees uint64
}
// Add up current allocations in spans.
for _, s := range mheap_.allspans {
runtime: atomically set span state and use as publication barrier When everything is working correctly, any pointer the garbage collector encounters can only point into a fully initialized heap span, since the span must have been initialized before that pointer could escape the heap allocator and become visible to the GC. However, in various cases, we try to be defensive against bad pointers. In findObject, this is just a sanity check: we never expect to find a bad pointer, but programming errors can lead to them. In spanOfHeap, we don't necessarily trust the pointer and we're trying to check if it really does point to the heap, though it should always point to something. Conservative scanning takes this to a new level, since it can only guess that a word may be a pointer and verify this. In all of these cases, we have a problem that the span lookup and check can race with span initialization, since the span becomes visible to lookups before it's fully initialized. Furthermore, we're about to start initializing the span without the heap lock held, which is going to introduce races where accesses were previously protected by the heap lock. To address this, this CL makes accesses to mspan.state atomic, and ensures that the span is fully initialized before setting the state to mSpanInUse. All loads are now atomic, and in any case where we don't trust the pointer, it first atomically loads the span state and checks that it's mSpanInUse, after which it will have synchronized with span initialization and can safely check the other span fields. For #10958, #24543, but a good fix in general. Change-Id: I518b7c63555b02064b98aa5f802c92b758fef853 Reviewed-on: https://go-review.googlesource.com/c/go/+/203286 Run-TryBot: Austin Clements <austin@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
2019-10-23 11:25:38 -04:00
if s.state.get() != mSpanInUse {
continue
}
if sizeclass := s.spanclass.sizeclass(); sizeclass == 0 {
slow.Mallocs++
slow.Alloc += uint64(s.elemsize)
} else {
slow.Mallocs += uint64(s.allocCount)
slow.Alloc += uint64(s.allocCount) * uint64(s.elemsize)
bySize[sizeclass].Mallocs += uint64(s.allocCount)
}
}
// Add in frees by just reading the stats for those directly.
var m heapStatsDelta
memstats.heapStats.unsafeRead(&m)
// Collect per-sizeclass free stats.
var smallFree uint64
for i := 0; i < _NumSizeClasses; i++ {
slow.Frees += uint64(m.smallFreeCount[i])
bySize[i].Frees += uint64(m.smallFreeCount[i])
bySize[i].Mallocs += uint64(m.smallFreeCount[i])
smallFree += uint64(m.smallFreeCount[i]) * uint64(class_to_size[i])
}
slow.Frees += memstats.tinyallocs + uint64(m.largeFreeCount)
slow.Mallocs += slow.Frees
slow.TotalAlloc = slow.Alloc + uint64(m.largeFree) + smallFree
for i := range slow.BySize {
slow.BySize[i].Mallocs = bySize[i].Mallocs
slow.BySize[i].Frees = bySize[i].Frees
}
for i := mheap_.pages.start; i < mheap_.pages.end; i++ {
chunk := mheap_.pages.tryChunkOf(i)
if chunk == nil {
continue
}
pg := chunk.scavenged.popcntRange(0, pallocChunkPages)
slow.HeapReleased += uint64(pg) * pageSize
}
for _, p := range allp {
pg := sys.OnesCount64(p.pcache.scav)
slow.HeapReleased += uint64(pg) * pageSize
}
runtime: grow the heap incrementally Currently, we map and grow the heap a whole arena (64MB) at a time. Unfortunately, in order to fix #32828, we need to switch from scavenging inline with allocation back to scavenging on heap growth, but heap-growth scavenging happens in large jumps because we grow the heap in large jumps. In order to prepare for better heap-growth scavenging, this CL separates mapping more space for the heap from actually "growing" it (tracking the new space with spans). Instead, growing the heap keeps track of the "current arena" it's growing into. It track that with new spans as needed, and only maps more arena space when the current arena is inadequate. The effect to the user is the same, but this will let us scavenge on much smaller increments of heap growth. There are two slightly subtleties to this change: 1. If an allocation requires mapping a new arena and that new arena isn't contiguous with the current arena, we don't want to lose the unused space in the current arena, so we have to immediately track that with a span. 2. The mapped space must be accounted as released and idle, even though it isn't actually tracked in a span. For #32828, since this makes heap-growth scavenging far more effective, especially at small heap sizes. For example, this change is necessary for TestPhysicalMemoryUtilization to pass once we remove inline scavenging. Change-Id: I300e74a0534062467e4ce91cdc3508e5ef9aa73a Reviewed-on: https://go-review.googlesource.com/c/go/+/189957 Run-TryBot: Austin Clements <austin@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
2019-08-12 14:54:28 -04:00
// Unused space in the current arena also counts as released space.
slow.HeapReleased += uint64(mheap_.curArena.end - mheap_.curArena.base)
getg().m.mallocing--
})
startTheWorld()
return
}
// BlockOnSystemStack switches to the system stack, prints "x\n" to
// stderr, and blocks in a stack containing
// "runtime.blockOnSystemStackInternal".
func BlockOnSystemStack() {
// The function value is passed directly (not wrapped in a closure) so
// that blockOnSystemStackInternal itself is what runs on the system
// stack.
systemstack(blockOnSystemStackInternal)
}
// blockOnSystemStackInternal prints "x\n" and then calls lock on
// deadlock twice in a row.
func blockOnSystemStackInternal() {
print("x\n")
lock(&deadlock)
// NOTE(review): presumably this second acquisition of the already-held
// lock is what blocks forever — confirm against lock's semantics.
lock(&deadlock)
}
// RWMutex wraps the runtime's rwmutex so tests can drive it through
// exported methods.
type RWMutex struct {
	rw rwmutex
}

// RLock calls rlock on the wrapped rwmutex.
func (m *RWMutex) RLock() {
	m.rw.rlock()
}

// RUnlock calls runlock on the wrapped rwmutex.
func (m *RWMutex) RUnlock() {
	m.rw.runlock()
}

// Lock calls lock on the wrapped rwmutex.
func (m *RWMutex) Lock() {
	m.rw.lock()
}

// Unlock calls unlock on the wrapped rwmutex.
func (m *RWMutex) Unlock() {
	m.rw.unlock()
}
// RuntimeHmapSize is unsafe.Sizeof of the runtime's hmap struct.
const RuntimeHmapSize = unsafe.Sizeof(hmap{})
// MapBucketsCount returns 1<<B for the hmap behind m, where B is read
// from the map header. m must be non-nil; a nil map has no header to
// dereference.
func MapBucketsCount(m map[int]int) int {
	hdr := *(**hmap)(unsafe.Pointer(&m))
	return int(1) << hdr.B
}
// MapBucketsPointerIsNil reports whether the buckets pointer in the
// hmap behind m is nil. m must be non-nil; a nil map has no header to
// dereference.
func MapBucketsPointerIsNil(m map[int]int) bool {
	hdr := *(**hmap)(unsafe.Pointer(&m))
	if hdr.buckets == nil {
		return true
	}
	return false
}
// LockOSCounts returns the current M's lockedExt and lockedInt
// counters. Before returning it checks that they agree with the
// current goroutine's lockedm field: lockedm must be zero exactly
// when both counters sum to zero. It panics on a mismatch.
func LockOSCounts() (external, internal uint32) {
	gp := getg()
	unlocked := gp.m.lockedExt+gp.m.lockedInt == 0
	hasLockedM := gp.lockedm != 0
	switch {
	case unlocked && hasLockedM:
		panic("lockedm on non-locked goroutine")
	case !unlocked && !hasLockedM:
		panic("nil lockedm on locked goroutine")
	}
	return gp.m.lockedExt, gp.m.lockedInt
}
// TracebackSystemstack recurses i times, each time through a
// systemstack call, and at the innermost level (i == 0) calls
// gentraceback with its caller's PC and SP and the _TraceJumpStack
// flag, writing into stk. It returns gentraceback's result.
//
// stk must be non-empty, since &stk[0] is taken.
//
//go:noinline
func TracebackSystemstack(stk []uintptr, i int) int {
if i == 0 {
// getcallerpc/getcallersp are read here, directly in this frame.
pc, sp := getcallerpc(), getcallersp()
return gentraceback(pc, sp, 0, getg(), 0, &stk[0], len(stk), nil, nil, _TraceJumpStack)
}
n := 0
systemstack(func() {
n = TracebackSystemstack(stk, i-1)
})
return n
}
runtime: use sparse mappings for the heap This replaces the contiguous heap arena mapping with a potentially sparse mapping that can support heap mappings anywhere in the address space. This has several advantages over the current approach: * There is no longer any limit on the size of the Go heap. (Currently it's limited to 512GB.) Hence, this fixes #10460. * It eliminates many failures modes of heap initialization and growing. In particular it eliminates any possibility of panicking with an address space conflict. This can happen for many reasons and even causes a low but steady rate of TSAN test failures because of conflicts with the TSAN runtime. See #16936 and #11993. * It eliminates the notion of "non-reserved" heap, which was added because creating huge address space reservations (particularly on 64-bit) led to huge process VSIZE. This was at best confusing and at worst conflicted badly with ulimit -v. However, the non-reserved heap logic is complicated, can race with other mappings in non-pure Go binaries (e.g., #18976), and requires that the entire heap be either reserved or non-reserved. We currently maintain the latter property, but it's quite difficult to convince yourself of that, and hence difficult to keep correct. This logic is still present, but will be removed in the next CL. * It fixes problems on 32-bit where skipping over parts of the address space leads to mapping huge (and never-to-be-used) metadata structures. See #19831. This also completely rewrites and significantly simplifies mheap.sysAlloc, which has been a source of many bugs. E.g., #21044, #20259, #18651, and #13143 (and maybe #23222). This change also makes it possible to allocate individual objects larger than 512GB. As a result, a few tests that expected huge allocations to fail needed to be changed to make even larger allocations. 
However, at the moment attempting to allocate a humongous object may cause the program to freeze for several minutes on Linux as we fall back to probing every page with addrspace_free. That logic (and this failure mode) will be removed in the next CL. Fixes #10460. Fixes #22204 (since it rewrites the code involved). This slightly slows down compilebench and the x/benchmarks garbage benchmark. name old time/op new time/op delta Template 184ms ± 1% 185ms ± 1% ~ (p=0.065 n=10+9) Unicode 86.9ms ± 3% 86.3ms ± 1% ~ (p=0.631 n=10+10) GoTypes 599ms ± 0% 602ms ± 0% +0.56% (p=0.000 n=10+9) Compiler 2.87s ± 1% 2.89s ± 1% +0.51% (p=0.002 n=9+10) SSA 7.29s ± 1% 7.25s ± 1% ~ (p=0.182 n=10+9) Flate 118ms ± 2% 118ms ± 1% ~ (p=0.113 n=9+9) GoParser 147ms ± 1% 148ms ± 1% +1.07% (p=0.003 n=9+10) Reflect 401ms ± 1% 404ms ± 1% +0.71% (p=0.003 n=10+9) Tar 175ms ± 1% 175ms ± 1% ~ (p=0.604 n=9+10) XML 209ms ± 1% 210ms ± 1% ~ (p=0.052 n=10+10) (https://perf.golang.org/search?q=upload:20171231.4) name old time/op new time/op delta Garbage/benchmem-MB=64-12 2.23ms ± 1% 2.25ms ± 1% +0.84% (p=0.000 n=19+19) (https://perf.golang.org/search?q=upload:20171231.3) Relative to the start of the sparse heap changes (starting at and including "runtime: fix various contiguous bitmap assumptions"), overall slowdown is roughly 1% on GC-intensive benchmarks: name old time/op new time/op delta Template 183ms ± 1% 185ms ± 1% +1.32% (p=0.000 n=9+9) Unicode 84.9ms ± 2% 86.3ms ± 1% +1.65% (p=0.000 n=9+10) GoTypes 595ms ± 1% 602ms ± 0% +1.19% (p=0.000 n=9+9) Compiler 2.86s ± 0% 2.89s ± 1% +0.91% (p=0.000 n=9+10) SSA 7.19s ± 0% 7.25s ± 1% +0.75% (p=0.000 n=8+9) Flate 117ms ± 1% 118ms ± 1% +1.10% (p=0.000 n=10+9) GoParser 146ms ± 2% 148ms ± 1% +1.48% (p=0.002 n=10+10) Reflect 398ms ± 1% 404ms ± 1% +1.51% (p=0.000 n=10+9) Tar 173ms ± 1% 175ms ± 1% +1.17% (p=0.000 n=10+10) XML 208ms ± 1% 210ms ± 1% +0.62% (p=0.011 n=10+10) [Geo mean] 369ms 373ms +1.17% (https://perf.golang.org/search?q=upload:20180101.2) name old 
time/op new time/op delta Garbage/benchmem-MB=64-12 2.22ms ± 1% 2.25ms ± 1% +1.51% (p=0.000 n=20+19) (https://perf.golang.org/search?q=upload:20180101.3) Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0 Reviewed-on: https://go-review.googlesource.com/85887 Run-TryBot: Austin Clements <austin@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Rick Hudson <rlh@golang.org>
2017-12-19 22:05:23 -08:00
func KeepNArenaHints(n int) {
hint := mheap_.arenaHints
for i := 1; i < n; i++ {
hint = hint.next
if hint == nil {
return
}
}
hint.next = nil
}
// MapNextArenaHint reserves a page at the next arena growth hint,
// preventing the arena from growing there, and returns the range of
// addresses that are no longer viable.
func MapNextArenaHint() (start, end uintptr) {
hint := mheap_.arenaHints
addr := hint.addr
if hint.down {
start, end = addr-heapArenaBytes, addr
addr -= physPageSize
} else {
start, end = addr, addr+heapArenaBytes
}
sysReserve(unsafe.Pointer(addr), physPageSize)
runtime: use sparse mappings for the heap This replaces the contiguous heap arena mapping with a potentially sparse mapping that can support heap mappings anywhere in the address space. This has several advantages over the current approach: * There is no longer any limit on the size of the Go heap. (Currently it's limited to 512GB.) Hence, this fixes #10460. * It eliminates many failures modes of heap initialization and growing. In particular it eliminates any possibility of panicking with an address space conflict. This can happen for many reasons and even causes a low but steady rate of TSAN test failures because of conflicts with the TSAN runtime. See #16936 and #11993. * It eliminates the notion of "non-reserved" heap, which was added because creating huge address space reservations (particularly on 64-bit) led to huge process VSIZE. This was at best confusing and at worst conflicted badly with ulimit -v. However, the non-reserved heap logic is complicated, can race with other mappings in non-pure Go binaries (e.g., #18976), and requires that the entire heap be either reserved or non-reserved. We currently maintain the latter property, but it's quite difficult to convince yourself of that, and hence difficult to keep correct. This logic is still present, but will be removed in the next CL. * It fixes problems on 32-bit where skipping over parts of the address space leads to mapping huge (and never-to-be-used) metadata structures. See #19831. This also completely rewrites and significantly simplifies mheap.sysAlloc, which has been a source of many bugs. E.g., #21044, #20259, #18651, and #13143 (and maybe #23222). This change also makes it possible to allocate individual objects larger than 512GB. As a result, a few tests that expected huge allocations to fail needed to be changed to make even larger allocations. 
However, at the moment attempting to allocate a humongous object may cause the program to freeze for several minutes on Linux as we fall back to probing every page with addrspace_free. That logic (and this failure mode) will be removed in the next CL. Fixes #10460. Fixes #22204 (since it rewrites the code involved). This slightly slows down compilebench and the x/benchmarks garbage benchmark. name old time/op new time/op delta Template 184ms ± 1% 185ms ± 1% ~ (p=0.065 n=10+9) Unicode 86.9ms ± 3% 86.3ms ± 1% ~ (p=0.631 n=10+10) GoTypes 599ms ± 0% 602ms ± 0% +0.56% (p=0.000 n=10+9) Compiler 2.87s ± 1% 2.89s ± 1% +0.51% (p=0.002 n=9+10) SSA 7.29s ± 1% 7.25s ± 1% ~ (p=0.182 n=10+9) Flate 118ms ± 2% 118ms ± 1% ~ (p=0.113 n=9+9) GoParser 147ms ± 1% 148ms ± 1% +1.07% (p=0.003 n=9+10) Reflect 401ms ± 1% 404ms ± 1% +0.71% (p=0.003 n=10+9) Tar 175ms ± 1% 175ms ± 1% ~ (p=0.604 n=9+10) XML 209ms ± 1% 210ms ± 1% ~ (p=0.052 n=10+10) (https://perf.golang.org/search?q=upload:20171231.4) name old time/op new time/op delta Garbage/benchmem-MB=64-12 2.23ms ± 1% 2.25ms ± 1% +0.84% (p=0.000 n=19+19) (https://perf.golang.org/search?q=upload:20171231.3) Relative to the start of the sparse heap changes (starting at and including "runtime: fix various contiguous bitmap assumptions"), overall slowdown is roughly 1% on GC-intensive benchmarks: name old time/op new time/op delta Template 183ms ± 1% 185ms ± 1% +1.32% (p=0.000 n=9+9) Unicode 84.9ms ± 2% 86.3ms ± 1% +1.65% (p=0.000 n=9+10) GoTypes 595ms ± 1% 602ms ± 0% +1.19% (p=0.000 n=9+9) Compiler 2.86s ± 0% 2.89s ± 1% +0.91% (p=0.000 n=9+10) SSA 7.19s ± 0% 7.25s ± 1% +0.75% (p=0.000 n=8+9) Flate 117ms ± 1% 118ms ± 1% +1.10% (p=0.000 n=10+9) GoParser 146ms ± 2% 148ms ± 1% +1.48% (p=0.002 n=10+10) Reflect 398ms ± 1% 404ms ± 1% +1.51% (p=0.000 n=10+9) Tar 173ms ± 1% 175ms ± 1% +1.17% (p=0.000 n=10+10) XML 208ms ± 1% 210ms ± 1% +0.62% (p=0.011 n=10+10) [Geo mean] 369ms 373ms +1.17% (https://perf.golang.org/search?q=upload:20180101.2) name old 
time/op new time/op delta Garbage/benchmem-MB=64-12 2.22ms ± 1% 2.25ms ± 1% +1.51% (p=0.000 n=20+19) (https://perf.golang.org/search?q=upload:20180101.3) Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0 Reviewed-on: https://go-review.googlesource.com/85887 Run-TryBot: Austin Clements <austin@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Rick Hudson <rlh@golang.org>
2017-12-19 22:05:23 -08:00
return
}
// GetNextArenaHint returns the addr field of the first hint in
// mheap_.arenaHints.
func GetNextArenaHint() uintptr {
	first := mheap_.arenaHints
	return first.addr
}
// G is an exported alias for the runtime's g type.
type G = g
// Sudog is an exported alias for the runtime's sudog type.
type Sudog = sudog
// Getg returns the result of getg as a *G.
func Getg() *G {
	gp := getg()
	return gp
}
// PanicForTesting returns b[i] by way of unexportedPanicForTesting.
// An out-of-range i makes the index expression in that helper panic.
//
//go:noinline
func PanicForTesting(b []byte, i int) byte {
return unexportedPanicForTesting(b, i)
}
// unexportedPanicForTesting returns b[i]; the index expression panics
// when i is out of range for b.
//
//go:noinline
func unexportedPanicForTesting(b []byte, i int) byte {
return b[i]
}
// G0StackOverflow runs stackOverflow on the system stack.
func G0StackOverflow() {
systemstack(func() {
stackOverflow(nil)
})
}
// stackOverflow calls itself unconditionally, declaring a 256-byte
// buffer in every frame and passing its address to the next call.
// The parameter x is never read.
func stackOverflow(x *byte) {
var buf [256]byte
stackOverflow(&buf[0])
}
// MapTombstoneCheck walks every bucket chain of m and panics if the
// tophash markers are laid out inconsistently.
func MapTombstoneCheck(m map[int]int) {
	// Make sure emptyOne and emptyRest are distributed correctly.
	// We should have a series of filled and emptyOne cells, followed by
	// a series of emptyRest cells.
	hm := *(**hmap)(unsafe.Pointer(&m))
	iface := interface{}(m)
	mt := *(**maptype)(unsafe.Pointer(&iface))

	for bucket := 0; bucket < 1<<hm.B; bucket++ {
		head := (*bmap)(add(hm.buckets, uintptr(bucket)*uintptr(mt.bucketsize)))

		// First pass: count the cells in the chain that are not emptyRest.
		used := 0
		for b := head; b != nil; b = b.overflow(mt) {
			for cell := 0; cell < bucketCnt; cell++ {
				if b.tophash[cell] != emptyRest {
					used++
				}
			}
		}

		// Second pass: the first `used` cells must all be non-emptyRest,
		// every later cell must be emptyRest, and the last of the
		// non-emptyRest cells must not be emptyOne.
		pos := 0
		for b := head; b != nil; b = b.overflow(mt) {
			for cell := 0; cell < bucketCnt; cell++ {
				top := b.tophash[cell]
				switch {
				case pos < used && top == emptyRest:
					panic("early emptyRest")
				case pos >= used && top != emptyRest:
					panic("late non-emptyRest")
				case pos == used-1 && top == emptyOne:
					panic("last non-emptyRest entry is emptyOne")
				}
				pos++
			}
		}
	}
}
// RunGetgThreadSwitchTest records getg before a blocking channel
// receive and compares it with getg called twice afterwards. It
// panics if any of the values differ.
func RunGetgThreadSwitchTest() {
// Test that getg works correctly with thread switch.
// With gccgo, if we generate getg inlined, the backend
// may cache the address of the TLS variable, which
// will become invalid after a thread switch. This test
// checks that the bad caching doesn't happen.
ch := make(chan int)
go func(ch chan int) {
ch <- 5
LockOSThread()
}(ch)
g1 := getg()
// Block on a receive. This is likely to get us a thread
// switch. If we yield to the sender goroutine, it will
// lock the thread, forcing us to resume on a different
// thread.
<-ch
g2 := getg()
if g1 != g2 {
panic("g1 != g2")
}
// Also test getg after some control flow, as the
// backend is sensitive to control flow.
g3 := getg()
if g1 != g3 {
panic("g1 != g3")
}
}
// Exported copies of unexported page allocator constants, for tests.
const (
PageSize = pageSize
PallocChunkPages = pallocChunkPages
PageAlloc64Bit = pageAlloc64Bit
PallocSumBytes = pallocSumBytes
)
// PallocSum exposes pallocSum for testing.
type PallocSum pallocSum

// PackPallocSum wraps packPallocSum and converts its result to a PallocSum.
func PackPallocSum(start, max, end uint) PallocSum {
	packed := packPallocSum(start, max, end)
	return PallocSum(packed)
}

// Start returns the result of pallocSum.start for m.
func (m PallocSum) Start() uint {
	return pallocSum(m).start()
}

// Max returns the result of pallocSum.max for m.
func (m PallocSum) Max() uint {
	return pallocSum(m).max()
}

// End returns the result of pallocSum.end for m.
func (m PallocSum) End() uint {
	return pallocSum(m).end()
}
// PallocBits exposes pallocBits for testing.
type PallocBits pallocBits

// Find forwards to (*pallocBits).find.
func (b *PallocBits) Find(npages uintptr, searchIdx uint) (uint, uint) {
	bits := (*pallocBits)(b)
	return bits.find(npages, searchIdx)
}

// AllocRange forwards to (*pallocBits).allocRange.
func (b *PallocBits) AllocRange(i, n uint) {
	bits := (*pallocBits)(b)
	bits.allocRange(i, n)
}

// Free forwards to (*pallocBits).free.
func (b *PallocBits) Free(i, n uint) {
	bits := (*pallocBits)(b)
	bits.free(i, n)
}

// Summarize forwards to (*pallocBits).summarize and converts the result
// to a PallocSum.
func (b *PallocBits) Summarize() PallocSum {
	sum := (*pallocBits)(b).summarize()
	return PallocSum(sum)
}

// PopcntRange forwards to (*pageBits).popcntRange.
func (b *PallocBits) PopcntRange(i, n uint) uint {
	bits := (*pageBits)(b)
	return bits.popcntRange(i, n)
}
// SummarizeSlow is a deliberately simple reference implementation of
// (*pallocBits).summarize, used to cross-check it in tests.
//
// It packs three counts of zero bits in b: the run at the start of the
// bitmap, the longest run anywhere, and the run at the end.
func SummarizeSlow(b *PallocBits) PallocSum {
	bits := (*pageBits)(b)
	const nbits = uint(len(b)) * 64

	// Length of the run of zero bits at the start of the bitmap.
	var start uint
	for start < nbits && bits.get(start) == 0 {
		start++
	}

	// Length of the run of zero bits at the end of the bitmap.
	var end uint
	for end < nbits && bits.get(nbits-1-end) == 0 {
		end++
	}

	// Longest run of zero bits anywhere in the bitmap.
	var longest, cur uint
	for i := uint(0); i < nbits; i++ {
		if bits.get(i) != 0 {
			cur = 0
			continue
		}
		cur++
		if cur > longest {
			longest = cur
		}
	}
	return PackPallocSum(start, longest, end)
}
// FindBitRange64 exposes the findBitRange64 helper for testing.
func FindBitRange64(c uint64, n uint) uint {
	idx := findBitRange64(c, n)
	return idx
}
// DiffPallocBits compares a and b bit by bit and returns the maximal
// runs of consecutive bit indices at which they differ, in increasing
// order. The result is nil when the bitmaps are identical.
func DiffPallocBits(a, b *PallocBits) []BitRange {
	lhs, rhs := (*pageBits)(a), (*pageBits)(b)
	nbits := uint(len(lhs)) * 64

	var diffs []BitRange
	var cur BitRange // cur.N == 0 means no run is currently open
	for i := uint(0); i < nbits; i++ {
		if lhs.get(i) == rhs.get(i) {
			// The bits agree: close any open run.
			if cur.N != 0 {
				diffs = append(diffs, cur)
				cur.N = 0
			}
			continue
		}
		// The bits differ: open a run if needed, then extend it.
		if cur.N == 0 {
			cur.I = i
		}
		cur.N++
	}
	// Flush a run that extends to the end of the bitmap.
	if cur.N != 0 {
		diffs = append(diffs, cur)
	}
	return diffs
}
// StringifyPallocBits gets the bits in the bit range r from b,
// and returns a string containing the bits as ASCII 0 and 1
// characters, lowest bit index first. An empty range yields "".
func StringifyPallocBits(b *PallocBits, r BitRange) string {
	bits := (*pageBits)(b)
	// Accumulate into a pre-sized byte buffer and convert once at the
	// end, rather than growing a string one character at a time, which
	// copies the whole string on every iteration.
	buf := make([]byte, 0, r.N)
	for j := r.I; j < r.I+r.N; j++ {
		if bits.get(j) != 0 {
			buf = append(buf, '1')
		} else {
			buf = append(buf, '0')
		}
	}
	return string(buf)
}
// PallocData exposes pallocData for testing.
type PallocData pallocData

// FindScavengeCandidate forwards to (*pallocData).findScavengeCandidate.
func (d *PallocData) FindScavengeCandidate(searchIdx uint, min, max uintptr) (uint, uint) {
	pd := (*pallocData)(d)
	return pd.findScavengeCandidate(searchIdx, min, max)
}

// AllocRange forwards to (*pallocData).allocRange.
func (d *PallocData) AllocRange(i, n uint) {
	pd := (*pallocData)(d)
	pd.allocRange(i, n)
}

// ScavengedSetRange calls setRange(i, n) on the scavenged bitmap of d.
func (d *PallocData) ScavengedSetRange(i, n uint) {
	pd := (*pallocData)(d)
	pd.scavenged.setRange(i, n)
}

// PallocBits returns a pointer to the pallocBits field of d.
func (d *PallocData) PallocBits() *PallocBits {
	pd := (*pallocData)(d)
	return (*PallocBits)(&pd.pallocBits)
}

// Scavenged returns a pointer to the scavenged field of d, viewed as a
// PallocBits.
func (d *PallocData) Scavenged() *PallocBits {
	pd := (*pallocData)(d)
	return (*PallocBits)(&pd.scavenged)
}
// FillAligned exposes fillAligned for testing.
func FillAligned(x uint64, m uint) uint64 {
	filled := fillAligned(x, m)
	return filled
}
// PageCache exposes pageCache for testing.
type PageCache pageCache

// PageCachePages is the exported copy of pageCachePages.
const PageCachePages = pageCachePages

// NewPageCache builds a PageCache from the given base, cache and scav
// values.
func NewPageCache(base uintptr, cache, scav uint64) PageCache {
	pc := pageCache{
		base:  base,
		cache: cache,
		scav:  scav,
	}
	return PageCache(pc)
}

// Empty forwards to (*pageCache).empty.
func (c *PageCache) Empty() bool {
	return (*pageCache)(c).empty()
}

// Base returns the base field of the cache.
func (c *PageCache) Base() uintptr {
	return (*pageCache)(c).base
}

// Cache returns the cache field of the cache.
func (c *PageCache) Cache() uint64 {
	return (*pageCache)(c).cache
}

// Scav returns the scav field of the cache.
func (c *PageCache) Scav() uint64 {
	return (*pageCache)(c).scav
}

// Alloc forwards to (*pageCache).alloc.
func (c *PageCache) Alloc(npages uintptr) (uintptr, uintptr) {
	pc := (*pageCache)(c)
	return pc.alloc(npages)
}
// Flush calls (*pageCache).flush on c with s as the page allocator. The
// call is made on the system stack while holding s's mheapLock.
func (c *PageCache) Flush(s *PageAlloc) {
	cache, alloc := (*pageCache)(c), (*pageAlloc)(s)
	systemstack(func() {
		// Tests never need a coarser lock around this call, so the
		// lock is acquired and released right here.
		lock(alloc.mheapLock)
		cache.flush(alloc)
		unlock(alloc.mheapLock)
	})
}
// ChunkIdx exposes the chunk index type (chunkIdx) for testing.
type ChunkIdx chunkIdx
// PageAlloc exposes pageAlloc for testing. Note that because pageAlloc
// is not in the heap, neither is PageAlloc.
type PageAlloc pageAlloc
// Alloc calls (*pageAlloc).alloc for npages and returns both of its
// results. The call is made on the system stack while holding mheapLock.
func (p *PageAlloc) Alloc(npages uintptr) (uintptr, uintptr) {
	alloc := (*pageAlloc)(p)

	var base, scavenged uintptr
	systemstack(func() {
		// Tests never need a coarser lock around this call, so the
		// lock is acquired and released right here.
		lock(alloc.mheapLock)
		base, scavenged = alloc.alloc(npages)
		unlock(alloc.mheapLock)
	})
	return base, scavenged
}
// AllocToCache calls (*pageAlloc).allocToCache and returns the result
// as a PageCache. The call is made on the system stack while holding
// mheapLock.
func (p *PageAlloc) AllocToCache() PageCache {
	alloc := (*pageAlloc)(p)

	var cache PageCache
	systemstack(func() {
		// Tests never need a coarser lock around this call, so the
		// lock is acquired and released right here.
		lock(alloc.mheapLock)
		cache = PageCache(alloc.allocToCache())
		unlock(alloc.mheapLock)
	})
	return cache
}
// Free calls (*pageAlloc).free for npages starting at base. The call is
// made on the system stack while holding mheapLock.
func (p *PageAlloc) Free(base, npages uintptr) {
	alloc := (*pageAlloc)(p)
	systemstack(func() {
		// Tests never need a coarser lock around this call, so the
		// lock is acquired and released right here.
		lock(alloc.mheapLock)
		alloc.free(base, npages)
		unlock(alloc.mheapLock)
	})
}
// Bounds returns the start and end fields of the page allocator as
// ChunkIdx values.
func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) {
	alloc := (*pageAlloc)(p)
	lo, hi := ChunkIdx(alloc.start), ChunkIdx(alloc.end)
	return lo, hi
}
// Scavenge calls (*pageAlloc).scavenge with nbytes and mayUnlock and
// returns its result. The call is made on the system stack while
// holding mheapLock.
func (p *PageAlloc) Scavenge(nbytes uintptr, mayUnlock bool) (r uintptr) {
	alloc := (*pageAlloc)(p)
	systemstack(func() {
		// Tests never need a coarser lock around this call, so the
		// lock is acquired and released right here.
		lock(alloc.mheapLock)
		r = alloc.scavenge(nbytes, mayUnlock)
		unlock(alloc.mheapLock)
	})
	return r
}
runtime: track ranges of address space which are owned by the heap This change adds a new inUse field to the allocator which tracks ranges of addresses that are owned by the heap. It is updated on each heap growth. These ranges are tracked in an array which is kept sorted. In practice this array shouldn't exceed its initial allocation except in rare cases and thus should be small (ideally exactly 1 element in size). In a hypothetical worst-case scenario wherein we have a 1 TiB heap and 4 MiB arenas (note that the address ranges will never be at a smaller granularity than an arena, since arenas are always allocated contiguously), inUse would use at most 4 MiB of memory if the heap mappings were completely discontiguous (highly unlikely) with an additional 2 MiB leaked from previous allocations. Furthermore, the copies that are done to keep the inUse array sorted will copy at most 4 MiB of memory in such a scenario, which, assuming a conservative copying rate of 5 GiB/s, amounts to about 800µs. However, note that in practice: 1) Most 64-bit platforms have 64 MiB arenas. 2) The copies should incur little-to-no page faults, meaning a copy rate closer to 25-50 GiB/s is expected. 3) Go heaps are almost always mostly contiguous. Updates #35514. Change-Id: I3ad07f1c2b5b9340acf59ecc3b9ae09e884814fe Reviewed-on: https://go-review.googlesource.com/c/go/+/207757 Run-TryBot: Michael Knyszek <mknyszek@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com> Reviewed-by: Austin Clements <austin@google.com>
2019-11-15 23:30:30 +00:00
func (p *PageAlloc) InUse() []AddrRange {
ranges := make([]AddrRange, 0, len(p.inUse.ranges))
for _, r := range p.inUse.ranges {
ranges = append(ranges, AddrRange{r})
runtime: track ranges of address space which are owned by the heap This change adds a new inUse field to the allocator which tracks ranges of addresses that are owned by the heap. It is updated on each heap growth. These ranges are tracked in an array which is kept sorted. In practice this array shouldn't exceed its initial allocation except in rare cases and thus should be small (ideally exactly 1 element in size). In a hypothetical worst-case scenario wherein we have a 1 TiB heap and 4 MiB arenas (note that the address ranges will never be at a smaller granularity than an arena, since arenas are always allocated contiguously), inUse would use at most 4 MiB of memory if the heap mappings were completely discontiguous (highly unlikely) with an additional 2 MiB leaked from previous allocations. Furthermore, the copies that are done to keep the inUse array sorted will copy at most 4 MiB of memory in such a scenario, which, assuming a conservative copying rate of 5 GiB/s, amounts to about 800µs. However, note that in practice: 1) Most 64-bit platforms have 64 MiB arenas. 2) The copies should incur little-to-no page faults, meaning a copy rate closer to 25-50 GiB/s is expected. 3) Go heaps are almost always mostly contiguous. Updates #35514. Change-Id: I3ad07f1c2b5b9340acf59ecc3b9ae09e884814fe Reviewed-on: https://go-review.googlesource.com/c/go/+/207757 Run-TryBot: Michael Knyszek <mknyszek@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com> Reviewed-by: Austin Clements <austin@google.com>
2019-11-15 23:30:30 +00:00
}
return ranges
}
runtime: convert page allocator bitmap to sparse array Currently the page allocator bitmap is implemented as a single giant memory mapping which is reserved at init time and committed as needed. This causes problems on systems that don't handle large uncommitted mappings well, or institute low virtual address space defaults as a memory limiting mechanism. This change modifies the implementation of the page allocator bitmap away from a directly-mapped set of bytes to a sparse array in same vein as mheap.arenas. This will hurt performance a little but the biggest gains are from the lockless allocation possible with the page allocator, so the impact of this extra layer of indirection should be minimal. In fact, this is exactly what we see: https://perf.golang.org/search?q=upload:20191125.5 This reduces the amount of mapped (PROT_NONE) memory needed on systems with 48-bit address spaces to ~600 MiB down from almost 9 GiB. The bulk of this remaining memory is used by the summaries. Go processes with 32-bit address spaces now always commit to 128 KiB of memory for the bitmap. Previously it would only commit the pages in the bitmap which represented the range of addresses (lowest address to highest address, even if there are unused regions in that range) used by the heap. Updates #35568. Updates #35451. Change-Id: I0ff10380156568642b80c366001eefd0a4e6c762 Reviewed-on: https://go-review.googlesource.com/c/go/+/207497 Run-TryBot: Michael Knyszek <mknyszek@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Austin Clements <austin@google.com> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-11-14 23:58:50 +00:00
// Returns nil if the PallocData's L2 is missing.
func (p *PageAlloc) PallocData(i ChunkIdx) *PallocData {
ci := chunkIdx(i)
return (*PallocData)((*pageAlloc)(p).tryChunkOf(ci))
runtime: convert page allocator bitmap to sparse array Currently the page allocator bitmap is implemented as a single giant memory mapping which is reserved at init time and committed as needed. This causes problems on systems that don't handle large uncommitted mappings well, or institute low virtual address space defaults as a memory limiting mechanism. This change modifies the implementation of the page allocator bitmap away from a directly-mapped set of bytes to a sparse array in same vein as mheap.arenas. This will hurt performance a little but the biggest gains are from the lockless allocation possible with the page allocator, so the impact of this extra layer of indirection should be minimal. In fact, this is exactly what we see: https://perf.golang.org/search?q=upload:20191125.5 This reduces the amount of mapped (PROT_NONE) memory needed on systems with 48-bit address spaces to ~600 MiB down from almost 9 GiB. The bulk of this remaining memory is used by the summaries. Go processes with 32-bit address spaces now always commit to 128 KiB of memory for the bitmap. Previously it would only commit the pages in the bitmap which represented the range of addresses (lowest address to highest address, even if there are unused regions in that range) used by the heap. Updates #35568. Updates #35451. Change-Id: I0ff10380156568642b80c366001eefd0a4e6c762 Reviewed-on: https://go-review.googlesource.com/c/go/+/207497 Run-TryBot: Michael Knyszek <mknyszek@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Austin Clements <austin@google.com> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-11-14 23:58:50 +00:00
}
// AddrRange is a wrapper around addrRange for testing.
runtime: track ranges of address space which are owned by the heap This change adds a new inUse field to the allocator which tracks ranges of addresses that are owned by the heap. It is updated on each heap growth. These ranges are tracked in an array which is kept sorted. In practice this array shouldn't exceed its initial allocation except in rare cases and thus should be small (ideally exactly 1 element in size). In a hypothetical worst-case scenario wherein we have a 1 TiB heap and 4 MiB arenas (note that the address ranges will never be at a smaller granularity than an arena, since arenas are always allocated contiguously), inUse would use at most 4 MiB of memory if the heap mappings were completely discontiguous (highly unlikely) with an additional 2 MiB leaked from previous allocations. Furthermore, the copies that are done to keep the inUse array sorted will copy at most 4 MiB of memory in such a scenario, which, assuming a conservative copying rate of 5 GiB/s, amounts to about 800µs. However, note that in practice: 1) Most 64-bit platforms have 64 MiB arenas. 2) The copies should incur little-to-no page faults, meaning a copy rate closer to 25-50 GiB/s is expected. 3) Go heaps are almost always mostly contiguous. Updates #35514. Change-Id: I3ad07f1c2b5b9340acf59ecc3b9ae09e884814fe Reviewed-on: https://go-review.googlesource.com/c/go/+/207757 Run-TryBot: Michael Knyszek <mknyszek@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com> Reviewed-by: Austin Clements <austin@google.com>
2019-11-15 23:30:30 +00:00
type AddrRange struct {
addrRange
}
// MakeAddrRange wraps the result of makeAddrRange(base, limit) in an
// AddrRange.
func MakeAddrRange(base, limit uintptr) AddrRange {
	inner := makeAddrRange(base, limit)
	return AddrRange{addrRange: inner}
}
// Base reports the virtual address at which the address range begins.
func (a AddrRange) Base() uintptr {
	base := a.addrRange.base
	return base.addr()
}
// Limit returns the virtual address of the limit of the address range.
func (a AddrRange) Limit() uintptr {
return a.addrRange.limit.addr()
}
// Equals returns true if the two address ranges are exactly equal.
func (a AddrRange) Equals(b AddrRange) bool {
return a == b
runtime: track ranges of address space which are owned by the heap This change adds a new inUse field to the allocator which tracks ranges of addresses that are owned by the heap. It is updated on each heap growth. These ranges are tracked in an array which is kept sorted. In practice this array shouldn't exceed its initial allocation except in rare cases and thus should be small (ideally exactly 1 element in size). In a hypothetical worst-case scenario wherein we have a 1 TiB heap and 4 MiB arenas (note that the address ranges will never be at a smaller granularity than an arena, since arenas are always allocated contiguously), inUse would use at most 4 MiB of memory if the heap mappings were completely discontiguous (highly unlikely) with an additional 2 MiB leaked from previous allocations. Furthermore, the copies that are done to keep the inUse array sorted will copy at most 4 MiB of memory in such a scenario, which, assuming a conservative copying rate of 5 GiB/s, amounts to about 800µs. However, note that in practice: 1) Most 64-bit platforms have 64 MiB arenas. 2) The copies should incur little-to-no page faults, meaning a copy rate closer to 25-50 GiB/s is expected. 3) Go heaps are almost always mostly contiguous. Updates #35514. Change-Id: I3ad07f1c2b5b9340acf59ecc3b9ae09e884814fe Reviewed-on: https://go-review.googlesource.com/c/go/+/207757 Run-TryBot: Michael Knyszek <mknyszek@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com> Reviewed-by: Austin Clements <austin@google.com>
2019-11-15 23:30:30 +00:00
}
// Size reports the number of bytes spanned by the address range, as
// computed by addrRange.size.
func (a AddrRange) Size() uintptr {
	n := a.addrRange.size()
	return n
}
// AddrRanges is a wrapper around addrRanges for testing.
type AddrRanges struct {
addrRanges
// mutable gates Add: Add throws when it is false. NewAddrRanges sets
// it to true and MakeAddrRanges sets it to false.
mutable bool
}
// NewAddrRanges creates a new empty addrRanges.
//
// Note that this initializes addrRanges just like in the
// runtime, so its memory is persistentalloc'd. Call this
// function sparingly since the memory it allocates is
// leaked.
//
// This AddrRanges is mutable, so we can test methods like
// Add.
func NewAddrRanges() AddrRanges {
r := addrRanges{}
runtime: delineate which memstats are system stats with a type This change modifies the type of several mstats fields to be a new type: sysMemStat. This type has the same structure as the fields used to have. The purpose of this change is to make it very clear which stats may be used in various functions for accounting (usually the platform-specific sys* functions, but there are others). Currently there's an implicit understanding that the *uint64 value passed to these functions is some kind of statistic whose value is atomically managed. This understanding isn't inherently problematic, but we're about to change how some stats (which currently use mSysStatInc and mSysStatDec) work, so we want to make it very clear what the various requirements are around "sysStat". This change also removes mSysStatInc and mSysStatDec in favor of a method on sysMemStat. Note that those two functions were originally written the way they were because atomic 64-bit adds required a valid G on ARM, but this hasn't been the case for a very long time (since golang.org/cl/14204, but even before then it wasn't clear if mutexes required a valid G anymore). Today we implement 64-bit adds on ARM with a spinlock table. Change-Id: I4e9b37cf14afc2ae20cf736e874eb0064af086d7 Reviewed-on: https://go-review.googlesource.com/c/go/+/246971 Run-TryBot: Michael Knyszek <mknyszek@google.com> TryBot-Result: Go Bot <gobot@golang.org> Trust: Michael Knyszek <mknyszek@google.com> Reviewed-by: Michael Pratt <mpratt@google.com>
2020-07-29 20:25:05 +00:00
r.init(new(sysMemStat))
return AddrRanges{r, true}
}
// MakeAddrRanges creates a new addrRanges populated with
// the ranges in a.
//
// The returned AddrRanges is immutable, so methods like
// Add will fail.
func MakeAddrRanges(a ...AddrRange) AddrRanges {
// Methods that manipulate the backing store of addrRanges.ranges should
// not be used on the result from this function (e.g. add) since they may
// trigger reallocation. That would normally be fine, except the new
// backing store won't come from the heap, but from persistentalloc, so
// we'll leak some memory implicitly.
ranges := make([]addrRange, 0, len(a))
total := uintptr(0)
for _, r := range a {
ranges = append(ranges, r.addrRange)
total += r.Size()
}
return AddrRanges{addrRanges{
ranges: ranges,
totalBytes: total,
runtime: delineate which memstats are system stats with a type This change modifies the type of several mstats fields to be a new type: sysMemStat. This type has the same structure as the fields used to have. The purpose of this change is to make it very clear which stats may be used in various functions for accounting (usually the platform-specific sys* functions, but there are others). Currently there's an implicit understanding that the *uint64 value passed to these functions is some kind of statistic whose value is atomically managed. This understanding isn't inherently problematic, but we're about to change how some stats (which currently use mSysStatInc and mSysStatDec) work, so we want to make it very clear what the various requirements are around "sysStat". This change also removes mSysStatInc and mSysStatDec in favor of a method on sysMemStat. Note that those two functions were originally written the way they were because atomic 64-bit adds required a valid G on ARM, but this hasn't been the case for a very long time (since golang.org/cl/14204, but even before then it wasn't clear if mutexes required a valid G anymore). Today we implement 64-bit adds on ARM with a spinlock table. Change-Id: I4e9b37cf14afc2ae20cf736e874eb0064af086d7 Reviewed-on: https://go-review.googlesource.com/c/go/+/246971 Run-TryBot: Michael Knyszek <mknyszek@google.com> TryBot-Result: Go Bot <gobot@golang.org> Trust: Michael Knyszek <mknyszek@google.com> Reviewed-by: Michael Pratt <mpratt@google.com>
2020-07-29 20:25:05 +00:00
sysStat: new(sysMemStat),
}, false}
}
// Ranges copies every range held by the addrRanges into a freshly
// allocated slice of AddrRange and returns it.
func (a *AddrRanges) Ranges() []AddrRange {
	src := a.addrRanges.ranges
	out := make([]AddrRange, len(src))
	for i := range src {
		out[i] = AddrRange{src[i]}
	}
	return out
}
// FindSucc reports the successor to base, as computed by
// addrRanges.findSucc; see that method for the details.
func (a *AddrRanges) FindSucc(base uintptr) int {
	succ := a.addrRanges.findSucc(base)
	return succ
}
// Add adds a new AddrRange to the AddrRanges.
//
// The AddrRanges must be mutable (i.e. created by NewAddrRanges),
// otherwise this method will throw.
func (a *AddrRanges) Add(r AddrRange) {
if !a.mutable {
throw("attempt to mutate immutable AddrRanges")
}
a.add(r.addrRange)
}
// TotalBytes reports the value of the totalBytes field of the wrapped
// addrRanges.
func (a *AddrRanges) TotalBytes() uintptr {
	inner := &a.addrRanges
	return inner.totalBytes
}
// BitRange represents a range over a bitmap: N bits starting at bit
// index I.
type BitRange struct {
I, N uint // bit index and length in bits
}
// NewPageAlloc creates a new page allocator for testing and
// initializes it with the scav and chunks maps. Each key in these maps
// represents a chunk index and each value is a series of bit ranges to
// set within each bitmap's chunk.
//
// The initialization of the pageAlloc preserves the invariant that if a
// scavenged bit is set the alloc bit is necessarily unset, so some
// of the bits described by scav may be cleared in the final bitmap if
// ranges in chunks overlap with them.
//
// scav is optional, and if nil, the scavenged bitmap will be cleared
// (as opposed to all 1s, which it usually is). Furthermore, every
// chunk index in scav must appear in chunks; ones that do not are
// ignored.
func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc {
p := new(pageAlloc)
// We've got an entry, so initialize the pageAlloc.
p.init(new(mutex), nil)
runtime: static lock ranking for the runtime (enabled by GOEXPERIMENT) I took some of the infrastructure from Austin's lock logging CR https://go-review.googlesource.com/c/go/+/192704 (with deadlock detection from the logs), and developed a setup to give static lock ranking for runtime locks. Static lock ranking establishes a documented total ordering among locks, and then reports an error if the total order is violated. This can happen if a deadlock happens (by acquiring a sequence of locks in different orders), or if just one side of a possible deadlock happens. Lock ordering deadlocks cannot happen as long as the lock ordering is followed. Along the way, I found a deadlock involving the new timer code, which Ian fixed via https://go-review.googlesource.com/c/go/+/207348, as well as two other potential deadlocks. See the constants at the top of runtime/lockrank.go to show the static lock ranking that I ended up with, along with some comments. This is great documentation of the current intended lock ordering when acquiring multiple locks in the runtime. I also added an array lockPartialOrder[] which shows and enforces the current partial ordering among locks (which is embedded within the total ordering). This is more specific about the dependencies among locks. I don't try to check the ranking within a lock class with multiple locks that can be acquired at the same time (i.e. check the ranking when multiple hchan locks are acquired). Currently, I am doing a lockInit() call to set the lock rank of most locks. Any lock that is not otherwise initialized is assumed to be a leaf lock (a very high rank lock), so that eliminates the need to do anything for a bunch of locks (including all architecture-dependent locks). For two locks, root.lock and notifyList.lock (only in the runtime/sema.go file), it is not as easy to do lock initialization, so instead, I am passing the lock rank with the lock calls. 
For Windows compilation, I needed to increase the StackGuard size from 896 to 928 because of the new lock-rank checking functions. Checking of the static lock ranking is enabled by setting GOEXPERIMENT=staticlockranking before doing a run. To make sure that the static lock ranking code has no overhead in memory or CPU when not enabled by GOEXPERIMENT, I changed 'go build/install' so that it defines a build tag (with the same name) whenever any experiment has been baked into the toolchain (by checking Expstring()). This allows me to avoid increasing the size of the 'mutex' type when static lock ranking is not enabled. Fixes #38029 Change-Id: I154217ff307c47051f8dae9c2a03b53081acd83a Reviewed-on: https://go-review.googlesource.com/c/go/+/207619 Reviewed-by: Dan Scales <danscales@google.com> Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: Dan Scales <danscales@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-11-13 17:34:47 -08:00
lockInit(p.mheapLock, lockRankMheap)
p.test = true
for i, init := range chunks {
addr := chunkBase(chunkIdx(i))
// Mark the chunk's existence in the pageAlloc.
systemstack(func() {
lock(p.mheapLock)
p.grow(addr, pallocChunkBytes)
unlock(p.mheapLock)
})
// Initialize the bitmap and update pageAlloc metadata.
runtime: convert page allocator bitmap to sparse array Currently the page allocator bitmap is implemented as a single giant memory mapping which is reserved at init time and committed as needed. This causes problems on systems that don't handle large uncommitted mappings well, or institute low virtual address space defaults as a memory limiting mechanism. This change modifies the implementation of the page allocator bitmap away from a directly-mapped set of bytes to a sparse array in same vein as mheap.arenas. This will hurt performance a little but the biggest gains are from the lockless allocation possible with the page allocator, so the impact of this extra layer of indirection should be minimal. In fact, this is exactly what we see: https://perf.golang.org/search?q=upload:20191125.5 This reduces the amount of mapped (PROT_NONE) memory needed on systems with 48-bit address spaces to ~600 MiB down from almost 9 GiB. The bulk of this remaining memory is used by the summaries. Go processes with 32-bit address spaces now always commit to 128 KiB of memory for the bitmap. Previously it would only commit the pages in the bitmap which represented the range of addresses (lowest address to highest address, even if there are unused regions in that range) used by the heap. Updates #35568. Updates #35451. Change-Id: I0ff10380156568642b80c366001eefd0a4e6c762 Reviewed-on: https://go-review.googlesource.com/c/go/+/207497 Run-TryBot: Michael Knyszek <mknyszek@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Austin Clements <austin@google.com> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-11-14 23:58:50 +00:00
chunk := p.chunkOf(chunkIndex(addr))
// Clear all the scavenged bits which grow set.
chunk.scavenged.clearRange(0, pallocChunkPages)
// Apply scavenge state if applicable.
if scav != nil {
if scvg, ok := scav[i]; ok {
for _, s := range scvg {
// Ignore the case of s.N == 0. setRange doesn't handle
// it and it's a no-op anyway.
if s.N != 0 {
chunk.scavenged.setRange(s.I, s.N)
}
}
}
}
// Apply alloc state.
for _, s := range init {
// Ignore the case of s.N == 0. allocRange doesn't handle
// it and it's a no-op anyway.
if s.N != 0 {
chunk.allocRange(s.I, s.N)
}
}
// Update heap metadata for the allocRange calls above.
systemstack(func() {
lock(p.mheapLock)
p.update(addr, pallocChunkPages, false, false)
unlock(p.mheapLock)
})
}
systemstack(func() {
lock(p.mheapLock)
p.scavengeStartGen()
unlock(p.mheapLock)
})
return (*PageAlloc)(p)
}
// FreePageAlloc releases hard OS resources owned by the pageAlloc. Once this
// is called the pageAlloc may no longer be used. The object itself will be
// collected by the garbage collector once it is no longer live.
func FreePageAlloc(pp *PageAlloc) {
p := (*pageAlloc)(pp)
// Free all the mapped space for the summary levels.
if pageAlloc64Bit != 0 {
for l := 0; l < summaryLevels; l++ {
sysFree(unsafe.Pointer(&p.summary[l][0]), uintptr(cap(p.summary[l]))*pallocSumBytes, nil)
}
} else {
resSize := uintptr(0)
for _, s := range p.summary {
resSize += uintptr(cap(s)) * pallocSumBytes
}
sysFree(unsafe.Pointer(&p.summary[0][0]), alignUp(resSize, physPageSize), nil)
}
// Free the mapped space for chunks.
runtime: convert page allocator bitmap to sparse array Currently the page allocator bitmap is implemented as a single giant memory mapping which is reserved at init time and committed as needed. This causes problems on systems that don't handle large uncommitted mappings well, or institute low virtual address space defaults as a memory limiting mechanism. This change modifies the implementation of the page allocator bitmap away from a directly-mapped set of bytes to a sparse array in same vein as mheap.arenas. This will hurt performance a little but the biggest gains are from the lockless allocation possible with the page allocator, so the impact of this extra layer of indirection should be minimal. In fact, this is exactly what we see: https://perf.golang.org/search?q=upload:20191125.5 This reduces the amount of mapped (PROT_NONE) memory needed on systems with 48-bit address spaces to ~600 MiB down from almost 9 GiB. The bulk of this remaining memory is used by the summaries. Go processes with 32-bit address spaces now always commit to 128 KiB of memory for the bitmap. Previously it would only commit the pages in the bitmap which represented the range of addresses (lowest address to highest address, even if there are unused regions in that range) used by the heap. Updates #35568. Updates #35451. Change-Id: I0ff10380156568642b80c366001eefd0a4e6c762 Reviewed-on: https://go-review.googlesource.com/c/go/+/207497 Run-TryBot: Michael Knyszek <mknyszek@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Austin Clements <austin@google.com> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-11-14 23:58:50 +00:00
for i := range p.chunks {
if x := p.chunks[i]; x != nil {
p.chunks[i] = nil
// This memory comes from sysAlloc and will always be page-aligned.
sysFree(unsafe.Pointer(x), unsafe.Sizeof(*p.chunks[0]), nil)
}
}
}
// BaseChunkIdx is a convenient chunkIdx value which works on both
// 64 bit and 32 bit platforms, allowing the tests to share code
// between the two.
//
// This should not be higher than 0x100*pallocChunkBytes to support
// mips and mipsle, which only have 31-bit address spaces.
runtime: make maxOffAddr reflect the actual address space upper bound Currently maxOffAddr is defined in terms of the whole 64-bit address space, assuming that it's all supported, by using ^uintptr(0) as the maximal address in the offset space. In reality, the maximal address in the offset space is (1<<heapAddrBits)-1 because we don't have more than that actually available to us on a given platform. On most platforms this is fine, because arenaBaseOffset is just connecting two segments of address space, but on AIX we use it as an actual offset for the starting address of the available address space, which is limited. This means using ^uintptr(0) as the maximal address in the offset address space causes wrap-around, especially when we just want to represent a range approximately like [addr, infinity), which today we do by using maxOffAddr. To fix this, we define maxOffAddr more appropriately, in terms of (1<<heapAddrBits)-1. This change also redefines arenaBaseOffset to not be the negation of the virtual address corresponding to address zero in the virtual address space, but instead directly as the virtual address corresponding to zero. This matches the existing documentation more closely and makes the logic around arenaBaseOffset decidedly simpler, especially when trying to reason about its use on AIX. Fixes #38966. Change-Id: I1336e5036a39de846f64cc2d253e8536dee57611 Reviewed-on: https://go-review.googlesource.com/c/go/+/233497 Run-TryBot: Michael Knyszek <mknyszek@google.com> Reviewed-by: Austin Clements <austin@google.com> Reviewed-by: Michael Pratt <mpratt@google.com>
2020-05-12 16:08:50 +00:00
var BaseChunkIdx = ChunkIdx(chunkIndex(
	// Number of chunks, converted to bytes: the 0xc000 term is scaled by
	// pageAlloc64Bit and the 0x100 term by pageAlloc32Bit (presumably
	// exactly one of them is 1 on a given platform -- confirm at their
	// definitions).
	(0xc000*pageAlloc64Bit+0x100*pageAlloc32Bit)*pallocChunkBytes +
		// arenaBaseOffset only contributes where sys.GoosAix is nonzero.
		arenaBaseOffset*sys.GoosAix,
))
// PageBase converts a (chunk index, page index within that chunk) pair
// into an address: the chunk's base address plus pageIdx pages.
func PageBase(c ChunkIdx, pageIdx uint) uintptr {
	base := chunkBase(chunkIdx(c))
	offset := uintptr(pageIdx) * pageSize
	return base + offset
}
// BitsMismatch describes one 64-bit bitmap word for which the observed
// value differed from the expected one, as reported by
// CheckScavengedBitsCleared.
type BitsMismatch struct {
	// Base is the address of the first page covered by the bitmap word.
	Base uintptr
	// Got is the bitmap word as found.
	Got uint64
	// Want is the bitmap word that was expected.
	Want uint64
}
// CheckScavengedBitsCleared walks the chunk indices in
// [mheap_.pages.start, mheap_.pages.end), skipping indices for which
// tryChunkOf returns nil, and checks that no 64-bit bitmap word has a bit
// set in both scavenged and pallocBits.
//
// Each offending word is written to mismatches until that slice is full.
// n is the number of entries written. ok is false if any mismatch was
// seen, including one that could not be recorded because mismatches had
// no room left; the walk stops at that point.
func CheckScavengedBitsCleared(mismatches []BitsMismatch) (n int, ok bool) {
	ok = true

	// Run on the system stack to avoid stack growth allocation.
	systemstack(func() {
		getg().m.mallocing++

		// The bitmaps may only be read safely with the heap lock held.
		lock(&mheap_.lock)
	scan:
		for ci := mheap_.pages.start; ci < mheap_.pages.end; ci++ {
			c := mheap_.pages.tryChunkOf(ci)
			if c == nil {
				continue
			}
			for w := 0; w < pallocChunkPages/64; w++ {
				// A bit set in both the allocation bitmap and the
				// scavenged bitmap means scavenged was not cleared on
				// allocation. The expected word is therefore the
				// scavenged word with all allocated bits masked off.
				got := c.scavenged[w]
				want := got &^ c.pallocBits[w]
				if got == want {
					continue
				}
				ok = false
				if n >= len(mismatches) {
					// Out of room to report; give up on the whole walk.
					break scan
				}
				mismatches[n] = BitsMismatch{
					Base: chunkBase(ci) + uintptr(w)*64*pageSize,
					Got:  got,
					Want: want,
				}
				n++
			}
		}
		unlock(&mheap_.lock)

		getg().m.mallocing--
	})
	return
}
// PageCachePagesLeaked stops the world and sums the number of set bits in
// the page cache bitmap (pcache.cache) of every non-nil P stored between
// len(allp) and cap(allp), i.e. Ps that have been destroyed. The total is
// returned after the world is restarted.
func PageCachePagesLeaked() (leaked uintptr) {
	stopTheWorld("PageCachePagesLeaked")

	// Destroyed Ps live in the slack capacity of allp. Entries there may
	// be nil, so those are skipped.
	for _, pp := range allp[len(allp):cap(allp)] {
		if pp == nil {
			continue
		}
		leaked += uintptr(sys.OnesCount64(pp.pcache.cache))
	}

	startTheWorld()
	return leaked
}
// Exported handles on the runtime's semaphore primitives, for testing.
var (
	// Semacquire is the unexported semacquire function.
	Semacquire = semacquire
	// Semrelease1 is the unexported semrelease1 function.
	Semrelease1 = semrelease1
)
func SemNwait(addr *uint32) uint32 {
root := semroot(addr)
return atomic.Load(&root.nwait)
}
// MapHashCheck computes the hash of the key k for the map m, twice.
// Method 1 uses the built-in hasher for the map.
// Method 2 uses the typehash function (the one used by reflect).
// Returns the two hash values, which should always be equal.
func MapHashCheck(m interface{}, k interface{}) (uintptr, uintptr) {
// Unpack m.
mt := (*maptype)(unsafe.Pointer(efaceOf(&m)._type))
mh := (*hmap)(efaceOf(&m).data)
// Unpack k.
kt := efaceOf(&k)._type
var p unsafe.Pointer
if isDirectIface(kt) {
q := efaceOf(&k).data
p = unsafe.Pointer(&q)
} else {
p = efaceOf(&k).data
}
// Compute the hash functions.
x := mt.hasher(noescape(p), uintptr(mh.hash0))
y := typehash(kt, noescape(p), uintptr(mh.hash0))
return x, y
}
// MSpan is a defined type whose underlying type is mspan, exported so that
// tests can hold and pass around span pointers. It carries the
// go:notinheap directive below.
//go:notinheap
type MSpan mspan
// AllocMSpan obtains an mspan from mheap_.spanalloc for use in tests.
// The allocation happens on the system stack with the heap lock held.
// Release the result with FreeMSpan.
func AllocMSpan() *MSpan {
	var span *MSpan
	systemstack(func() {
		lock(&mheap_.lock)
		span = (*MSpan)((*mspan)(mheap_.spanalloc.alloc()))
		unlock(&mheap_.lock)
	})
	return span
}
// FreeMSpan hands s back to mheap_.spanalloc. The free happens on the
// system stack with the heap lock held.
func FreeMSpan(s *MSpan) {
	raw := unsafe.Pointer(s)
	systemstack(func() {
		lock(&mheap_.lock)
		mheap_.spanalloc.free(raw)
		unlock(&mheap_.lock)
	})
}
// MSpanCountAlloc points the span ms at bits as its mark bitmap, sets its
// element count to len(bits)*8, and returns the result of the span's
// countAlloc method. The span's gcmarkBits is reset to nil before
// returning so that ms does not keep referring to the caller's slice.
//
// An empty bits slice contains no bits at all, so 0 is returned directly
// instead of panicking on &bits[0].
func MSpanCountAlloc(ms *MSpan, bits []byte) int {
	s := (*mspan)(ms)
	s.nelems = uintptr(len(bits) * 8)
	if len(bits) == 0 {
		// Nothing to count. Leave the span in the same state the normal
		// path does (no mark bitmap attached).
		s.gcmarkBits = nil
		return 0
	}
	s.gcmarkBits = (*gcBits)(unsafe.Pointer(&bits[0]))
	result := s.countAlloc()
	s.gcmarkBits = nil
	return result
}
// Exported copies of the unexported timeHistogram sizing constants, so
// that tests can refer to them.
const (
TimeHistSubBucketBits = timeHistSubBucketBits
TimeHistNumSubBuckets = timeHistNumSubBuckets
TimeHistNumSuperBuckets = timeHistNumSuperBuckets
)
// TimeHistogram is a defined type whose underlying type is timeHistogram,
// exported for testing. Its methods convert back to *timeHistogram.
type TimeHistogram timeHistogram
// Count returns the count stored for the given (bucket, subBucket) pair
// and true. The pair is flattened to the index
// bucket*TimeHistNumSubBuckets + subBucket; if that index is outside the
// counts array, Count instead returns the underflow count and false.
func (th *TimeHistogram) Count(bucket, subBucket uint) (uint64, bool) {
	hist := (*timeHistogram)(th)
	if idx := bucket*TimeHistNumSubBuckets + subBucket; idx < uint(len(hist.counts)) {
		return hist.counts[idx], true
	}
	return hist.underflow, false
}
// Record forwards duration to the underlying timeHistogram's record
// method.
func (th *TimeHistogram) Record(duration int64) {
	hist := (*timeHistogram)(th)
	hist.record(duration)
}