// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Page heap.
//
// See malloc.go for overview.

package runtime

import (
	"internal/cpu"
	"internal/goarch"
	"runtime/internal/atomic"
	"unsafe"
)

const (
	// minPhysPageSize is a lower-bound on the physical page size. The
	// true physical page size may be larger than this. In contrast,
	// sys.PhysPageSize is an upper-bound on the physical page size.
	minPhysPageSize = 4096

	// maxPhysPageSize is the maximum page size the runtime supports.
	maxPhysPageSize = 512 << 10

	// maxPhysHugePageSize sets an upper-bound on the maximum huge page size
	// that the runtime supports.
	maxPhysHugePageSize = pallocChunkBytes

	// pagesPerReclaimerChunk indicates how many pages to scan from the
	// pageInUse bitmap at a time. Used by the page reclaimer.
	//
	// Higher values reduce contention on scanning indexes (such as
	// h.reclaimIndex), but increase the minimum latency of the
	// operation.
	//
	// The time required to scan this many pages can vary a lot depending
	// on how many spans are actually freed. Experimentally, it can
	// scan for pages at ~300 GB/ms on a 2.6GHz Core i7, but can only
	// free spans at ~32 MB/ms. Using 512 pages bounds this at
	// roughly 100µs.
	//
	// Must be a multiple of the pageInUse bitmap element size and
	// must also evenly divide pagesPerArena.
	pagesPerReclaimerChunk = 512
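	//
	// As a rough sanity check on the figure above (assuming the runtime's
	// usual 8 KiB page size), a 512-page chunk covers 512 * 8 KiB = 4 MiB,
	// and freeing 4 MiB at ~32 MB/ms takes
	//
	//	4 MiB / (32 MB/ms) ≈ 0.125 ms
	//
	// which is the "roughly 100µs" bound mentioned above.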

	// physPageAlignedStacks indicates whether stack allocations must be
	// physical page aligned. This is a requirement for MAP_STACK on
	// OpenBSD.
	physPageAlignedStacks = GOOS == "openbsd"
)

// Main malloc heap.
// The heap itself is the "free" and "scav" treaps,
// but all the other global data is here too.
//
// mheap must not be heap-allocated because it contains mSpanLists,
// which must not be heap-allocated.
//
//go:notinheap
type mheap struct {
	// lock must only be acquired on the system stack, otherwise a g
	// could self-deadlock if its stack grows with the lock held.
	lock  mutex
	pages pageAlloc // page allocation data structure

	sweepgen     uint32 // sweep generation, see comment in mspan; written during STW
	sweepDrained uint32 // all spans are swept or are being swept
	sweepers     uint32 // number of active sweepone calls

	// allspans is a slice of all mspans ever created. Each mspan
	// appears exactly once.
	//
	// The memory for allspans is manually managed and can be
	// reallocated and moved as the heap grows.
	//
	// In general, allspans is protected by mheap_.lock, which
	// prevents concurrent access as well as freeing the backing
	// store. Accesses during STW might not hold the lock, but
	// must ensure that allocation cannot happen around the
	// access (since that may free the backing store).
	allspans []*mspan // all spans out there

	_ uint32 // align uint64 fields on 32-bit for atomics

	// Proportional sweep
	//
	// These parameters represent a linear function from gcController.heapLive
	// to page sweep count. The proportional sweep system works to
	// stay in the black by keeping the current page sweep count
	// above this line at the current gcController.heapLive.
	//
	// The line has slope sweepPagesPerByte and passes through a
	// basis point at (sweepHeapLiveBasis, pagesSweptBasis). At
	// any given time, the system is at (gcController.heapLive,
	// pagesSwept) in this space.
	//
	// It's important that the line pass through a point we
	// control rather than simply starting at a (0,0) origin
	// because that lets us adjust sweep pacing at any time while
	// accounting for current progress. If we could only adjust
	// the slope, it would create a discontinuity in debt if any
	// progress has already been made.
	pagesInUse         uint64  // pages of spans in stats mSpanInUse; updated atomically
	pagesSwept         uint64  // pages swept this cycle; updated atomically
	pagesSweptBasis    uint64  // pagesSwept to use as the origin of the sweep ratio; updated atomically
	sweepHeapLiveBasis uint64  // value of gcController.heapLive to use as the origin of sweep ratio; written with lock, read without
	sweepPagesPerByte  float64 // proportional sweep ratio; written with lock, read without
	// TODO(austin): pagesInUse should be a uintptr, but the 386
	// compiler can't 8-byte align fields.
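	//
	// As a sketch derived from the description above (not a verbatim copy
	// of the sweep pacing code): the sweeper is "in the black" as long as
	//
	//	pagesSwept - pagesSweptBasis >=
	//		sweepPagesPerByte * (gcController.heapLive - sweepHeapLiveBasis)
	//
	// and allocating goroutines sweep until this inequality holds again.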

	// scavengeGoal is the amount of total retained heap memory (measured by
	// heapRetained) that the runtime will try to maintain by returning memory
	// to the OS.
	scavengeGoal uint64

	// Page reclaimer state

	// reclaimIndex is the page index in allArenas of next page to
	// reclaim. Specifically, it refers to page (i %
	// pagesPerArena) of arena allArenas[i / pagesPerArena].
	//
	// If this is >= 1<<63, the page reclaimer is done scanning
	// the page marks.
	//
	// This is accessed atomically.
	reclaimIndex uint64
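	//
	// For illustration only (a sketch of how a reclaimer worker could claim
	// a chunk of work from this index for an *mheap h; not a copy of the
	// real reclaim loop):
	//
	//	idx := uintptr(atomic.Xadd64(&h.reclaimIndex, pagesPerReclaimerChunk) - pagesPerReclaimerChunk)
	//	arena := h.allArenas[idx/pagesPerArena]
	//	pageInArena := idx % pagesPerArena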

	// reclaimCredit is spare credit for extra pages swept. Since
	// the page reclaimer works in large chunks, it may reclaim
	// more than requested. Any spare pages released go to this
	// credit pool.
	//
	// This is accessed atomically.
	reclaimCredit uintptr

	// arenas is the heap arena map. It points to the metadata for
	// the heap for every arena frame of the entire usable virtual
	// address space.
	//
	// Use arenaIndex to compute indexes into this array.
	//
	// For regions of the address space that are not backed by the
	// Go heap, the arena map contains nil.
	//
	// Modifications are protected by mheap_.lock. Reads can be
	// performed without locking; however, a given entry can
	// transition from nil to non-nil at any time when the lock
	// isn't held. (Entries never transition back to nil.)
	//
	// In general, this is a two-level mapping consisting of an L1
	// map and possibly many L2 maps. This saves space when there
	// are a huge number of arena frames. However, on many
	// platforms (even 64-bit), arenaL1Bits is 0, making this
	// effectively a single-level map. In this case, arenas[0]
	// will never be nil.
	arenas [1 << arenaL1Bits]*[1 << arenaL2Bits]*heapArena
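	//
	// A sketch of the lookup this describes for a heap pointer p (assuming
	// the usual arenaIndex/arenaIdx helpers; simplified, not a verbatim
	// copy of any one call site):
	//
	//	ai := arenaIndex(p)
	//	ha := mheap_.arenas[ai.l1()][ai.l2()] // nil if p is not backed by the Go heap
	//
	// On platforms where arenaL1Bits > 0, the L1 entry must also be
	// nil-checked before indexing into it.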

	// heapArenaAlloc is pre-reserved space for allocating heapArena
	// objects. This is only used on 32-bit, where we pre-reserve
	// this space to avoid interleaving it with the heap itself.
	heapArenaAlloc linearAlloc

	// arenaHints is a list of addresses at which to attempt to
	// add more heap arenas. This is initially populated with a
	// set of general hint addresses, and grown with the bounds of
	// actual heap arena ranges.
	arenaHints *arenaHint

	// arena is a pre-reserved space for allocating heap arenas
	// (the actual arenas). This is only used on 32-bit.
	arena linearAlloc

	// allArenas is the arenaIndex of every mapped arena. This can
	// be used to iterate through the address space.
	//
	// Access is protected by mheap_.lock. However, since this is
	// append-only and old backing arrays are never freed, it is
	// safe to acquire mheap_.lock, copy the slice header, and
	// then release mheap_.lock.
	allArenas []arenaIdx
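	//
	// A sketch of the snapshot pattern described above (illustrative only;
	// note that mheap_.lock must be acquired on the system stack):
	//
	//	lock(&mheap_.lock)
	//	arenas := mheap_.allArenas
	//	unlock(&mheap_.lock)
	//	// arenas can now be iterated without holding the lock.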

	// sweepArenas is a snapshot of allArenas taken at the
	// beginning of the sweep cycle. This can be read safely by
	// simply blocking GC (by disabling preemption).
	sweepArenas []arenaIdx

	// markArenas is a snapshot of allArenas taken at the beginning
	// of the mark cycle. Because allArenas is append-only, neither
	// this slice nor its contents will change during the mark, so
	// it can be read safely.
	markArenas []arenaIdx

	// curArena is the arena that the heap is currently growing
	// into. This should always be physPageSize-aligned.
	curArena struct {
		base, end uintptr
	}

	_ uint32 // ensure 64-bit alignment of central

	// central free lists for small size classes.
	// the padding makes sure that the mcentrals are
	// spaced CacheLinePadSize bytes apart, so that each mcentral.lock
	// gets its own cache line.
	// central is indexed by spanClass.
	central [numSpanClasses]struct {
		mcentral mcentral
		pad      [cpu.CacheLinePadSize - unsafe.Sizeof(mcentral{})%cpu.CacheLinePadSize]byte
	}
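	//
	// Illustrative only: the central free list for a span class spc is
	// reached through the embedded mcentral, e.g.
	//
	//	c := &mheap_.central[spc].mcentral
	//
	// The pad field exists purely to keep each element on its own cache
	// line, as described above.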

	spanalloc             fixalloc // allocator for span*
	cachealloc            fixalloc // allocator for mcache*
	specialfinalizeralloc fixalloc // allocator for specialfinalizer*
	specialprofilealloc   fixalloc // allocator for specialprofile*
	specialReachableAlloc fixalloc // allocator for specialReachable
	speciallock           mutex    // lock for special record allocators.
	arenaHintAlloc        fixalloc // allocator for arenaHints

	unused *specialfinalizer // never set, just here to force the specialfinalizer type into DWARF
}

var mheap_ mheap

// A heapArena stores metadata for a heap arena. heapArenas are stored
// outside of the Go heap and accessed via the mheap_.arenas index.
//
//go:notinheap
type heapArena struct {
	// bitmap stores the pointer/scalar bitmap for the words in
	// this arena. See mbitmap.go for a description. Use the
	// heapBits type to access this.
	bitmap [heapArenaBitmapBytes]byte

	// spans maps from virtual address page ID within this arena to *mspan.
	// For allocated spans, their pages map to the span itself.
	// For free spans, only the lowest and highest pages map to the span itself.
	// Internal pages map to an arbitrary span.
	// For pages that have never been allocated, spans entries are nil.
	//
	// Modifications are protected by mheap.lock. Reads can be
	// performed without locking, but ONLY from indexes that are
	// known to contain in-use or stack spans. This means there
	// must not be a safe-point between establishing that an
	// address is live and looking it up in the spans array.
	spans [pagesPerArena]*mspan

	// pageInUse is a bitmap that indicates which spans are in
	// state mSpanInUse. This bitmap is indexed by page number,
	// but only the bit corresponding to the first page in each
	// span is used.
	//
	// Reads and writes are atomic.
	pageInUse [pagesPerArena / 8]uint8
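	//
	// For illustration only (a sketch, not a copy of the runtime's helper):
	// given the heapArena ha containing a span s, the bit for s's first
	// page could be tested as
	//
	//	pageIdx := (s.base() / pageSize) % pagesPerArena
	//	inUse := ha.pageInUse[pageIdx/8]&(1<<(pageIdx%8)) != 0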

	// pageMarks is a bitmap that indicates which spans have any
	// marked objects on them. Like pageInUse, only the bit
	// corresponding to the first page in each span is used.
	//
	// Writes are done atomically during marking. Reads are
	// non-atomic and lock-free since they only occur during
	// sweeping (and hence never race with writes).
	//
	// This is used to quickly find whole spans that can be freed.
	//
	// TODO(austin): It would be nice if this was uint64 for
	// faster scanning, but we don't have 64-bit atomic bit
	// operations.
	pageMarks [pagesPerArena / 8]uint8

	// pageSpecials is a bitmap that indicates which spans have
	// specials (finalizers or other). Like pageInUse, only the bit
	// corresponding to the first page in each span is used.
	//
	// Writes are done atomically whenever a special is added to
	// a span and whenever the last special is removed from a span.
	// Reads are done atomically to find spans containing specials
	// during marking.
	pageSpecials [pagesPerArena / 8]uint8

	// checkmarks stores the debug.gccheckmark state. It is only
	// used if debug.gccheckmark > 0.
	checkmarks *checkmarksMap

	// zeroedBase marks the first byte of the first page in this
	// arena which hasn't been used yet and is therefore already
	// zero. zeroedBase is relative to the arena base.
	// Increases monotonically until it hits heapArenaBytes.
	//
	// This field is sufficient to determine if an allocation
	// needs to be zeroed because the page allocator follows an
	// address-ordered first-fit policy.
	//
	// Read atomically and written with an atomic CAS.
	zeroedBase uintptr
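	//
	// Roughly (a sketch of the check this implies, not a copy of the
	// allocator code): for an allocation starting at base inside the
	// heapArena ha,
	//
	//	arenaOff := base % heapArenaBytes
	//	needZero := arenaOff < ha.zeroedBase // below the high-water mark, so the memory was used before
	//
	// and otherwise zeroedBase is advanced past the allocation with a CAS.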
}

// arenaHint is a hint for where to grow the heap arenas. See
// mheap_.arenaHints.
//
//go:notinheap
type arenaHint struct {
	addr uintptr
	down bool
	next *arenaHint
}

// An mspan is a run of pages.
//
// When an mspan is in the heap free treap, state == mSpanFree
// and heapmap(s->start) == span, heapmap(s->start+s->npages-1) == span.
// If the mspan is in the heap scav treap, then in addition to the
// above scavenged == true. scavenged == false in all other cases.
//
// When an mspan is allocated, state == mSpanInUse or mSpanManual
|
2014-11-11 17:05:02 -05:00
|
|
|
// and heapmap(i) == span for all s->start <= i < s->start+s->npages.
|
|
|
|
|
|
2018-11-05 19:26:25 +00:00
|
|
|
// Every mspan is in one doubly-linked list, either in the mheap's
|
|
|
|
|
// busy list or one of the mcentral's span lists.
|
2015-02-19 13:38:46 -05:00
|
|
|
|
2018-11-05 19:26:25 +00:00
|
|
|
// An mspan representing actual memory has state mSpanInUse,
|
2018-09-26 16:39:02 -04:00
|
|
|
// mSpanManual, or mSpanFree. Transitions between these states are
|
runtime: don't free stack spans during GC
Memory for stacks is manually managed by the runtime and, currently
(with one exception) we free stack spans immediately when the last
stack on a span is freed. However, the garbage collector assumes that
spans can never transition from non-free to free during scan or mark.
This disagreement makes it possible for the garbage collector to mark
uninitialized objects and is blocking us from re-enabling the bad
pointer test in the garbage collector (issue #9880).
For example, the following sequence will result in marking an
uninitialized object:
1. scanobject loads a pointer slot out of the object it's scanning.
This happens to be one of the special pointers from the heap into a
stack. Call the pointer p and suppose it points into X's stack.
2. X, running on another thread, grows its stack and frees its old
stack.
3. The old stack happens to be large or was the last stack in its
span, so X frees this span, setting it to state _MSpanFree.
4. The span gets reused as a heap span.
5. scanobject calls heapBitsForObject, which loads the span containing
p, which is now in state _MSpanInUse, but doesn't necessarily have
an object at p. The not-object at p gets marked, and at this point
all sorts of things can go wrong.
We already have a partial solution to this. When shrinking a stack, we
put the old stack on a queue to be freed at the end of garbage
collection. This was done to address exactly this problem, but wasn't
a complete solution.
This commit generalizes this solution to both shrinking and growing
stacks. For stacks that fit in the stack pool, we simply don't free
the span, even if its reference count reaches zero. It's fine to reuse
the span for other stacks, and this enables that. At the end of GC, we
sweep for cached stack spans with a zero reference count and free
them. For larger stacks, we simply queue the stack span to be freed at
the end of GC. Ideally, we would reuse these large stack spans the way
we can small stack spans, but that's a more invasive change that will
have to wait until after the freeze.
Fixes #11267.
Change-Id: Ib7f2c5da4845cc0268e8dc098b08465116972a71
Reviewed-on: https://go-review.googlesource.com/11502
Reviewed-by: Russ Cox <rsc@golang.org>
2015-06-22 10:24:50 -04:00
|
|
|
// constrained as follows:
|
|
|
|
|
//
|
2017-03-16 14:16:31 -04:00
|
|
|
// * A span may transition from free to in-use or manual during any GC
|
2015-06-22 10:24:50 -04:00
|
|
|
// phase.
|
|
|
|
|
//
|
|
|
|
|
// * During sweeping (gcphase == _GCoff), a span may transition from
|
2017-03-16 14:16:31 -04:00
|
|
|
// in-use to free (as a result of sweeping) or manual to free (as a
|
2015-06-22 10:24:50 -04:00
|
|
|
// result of stacks being freed).
|
|
|
|
|
//
|
|
|
|
|
// * During GC (gcphase != _GCoff), a span *must not* transition from
|
2017-03-16 14:16:31 -04:00
|
|
|
// manual or in-use to free. Because concurrent GC may read a pointer
|
2015-06-22 10:24:50 -04:00
|
|
|
// and then look up its span, the span state must be monotonic.
|
runtime: atomically set span state and use as publication barrier
When everything is working correctly, any pointer the garbage
collector encounters can only point into a fully initialized heap
span, since the span must have been initialized before that pointer
could escape the heap allocator and become visible to the GC.
However, in various cases, we try to be defensive against bad
pointers. In findObject, this is just a sanity check: we never expect
to find a bad pointer, but programming errors can lead to them. In
spanOfHeap, we don't necessarily trust the pointer and we're trying to
check if it really does point to the heap, though it should always
point to something. Conservative scanning takes this to a new level,
since it can only guess that a word may be a pointer and verify this.
In all of these cases, we have a problem that the span lookup and
check can race with span initialization, since the span becomes
visible to lookups before it's fully initialized.
Furthermore, we're about to start initializing the span without the
heap lock held, which is going to introduce races where accesses were
previously protected by the heap lock.
To address this, this CL makes accesses to mspan.state atomic, and
ensures that the span is fully initialized before setting the state to
mSpanInUse. All loads are now atomic, and in any case where we don't
trust the pointer, it first atomically loads the span state and checks
that it's mSpanInUse, after which it will have synchronized with span
initialization and can safely check the other span fields.
For #10958, #24543, but a good fix in general.
Change-Id: I518b7c63555b02064b98aa5f802c92b758fef853
Reviewed-on: https://go-review.googlesource.com/c/go/+/203286
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2019-10-23 11:25:38 -04:00
|
|
|
//
|
|
|
|
|
// Setting mspan.state to mSpanInUse or mSpanManual must be done
|
|
|
|
|
// atomically and only after all other span fields are valid.
|
|
|
|
|
// Likewise, if inspecting a span is contingent on it being
|
|
|
|
|
// mSpanInUse, the state should be loaded atomically and checked
|
|
|
|
|
// before depending on other fields. This allows the garbage collector
|
|
|
|
|
// to safely deal with potentially invalid pointers, since resolving
|
|
|
|
|
// such pointers may race with a span being allocated.
|
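The three transition rules above can be restated as a small, hypothetical checker; nothing like this exists in the runtime, spanState, allowed, and gcRunning are names invented for this sketch only, and transitions the rules do not mention are simply treated as disallowed here.

package main

import "fmt"

type spanState int

const (
	spanFree spanState = iota
	spanInUse
	spanManual
)

// allowed reports whether a span may move between two states, given
// whether the GC mark phase is running (i.e. gcphase != _GCoff).
func allowed(from, to spanState, gcRunning bool) bool {
	switch {
	case from == spanFree && (to == spanInUse || to == spanManual):
		return true // permitted during any GC phase
	case (from == spanInUse || from == spanManual) && to == spanFree:
		return !gcRunning // permitted only while sweeping, never during mark
	default:
		return false
	}
}

func main() {
	fmt.Println(allowed(spanFree, spanInUse, true))   // true
	fmt.Println(allowed(spanInUse, spanFree, true))   // false: forbidden during GC
	fmt.Println(allowed(spanManual, spanFree, false)) // true: a freed stack swept outside GC
}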
2016-09-09 10:31:27 -04:00
|
|
|
type mSpanState uint8
|
|
|
|
|
|
2015-02-19 13:38:46 -05:00
|
|
|
const (
|
2018-09-26 16:39:02 -04:00
|
|
|
mSpanDead mSpanState = iota
|
|
|
|
|
mSpanInUse // allocated for garbage collected heap
|
|
|
|
|
mSpanManual // allocated for manual management (e.g., stack allocator)
|
2015-02-19 13:38:46 -05:00
|
|
|
)
|
|
|
|
|
|
2016-09-09 10:22:10 -04:00
|
|
|
// mSpanStateNames are the names of the span states, indexed by
|
|
|
|
|
// mSpanState.
|
|
|
|
|
var mSpanStateNames = []string{
|
2018-09-26 16:39:02 -04:00
|
|
|
"mSpanDead",
|
|
|
|
|
"mSpanInUse",
|
|
|
|
|
"mSpanManual",
|
|
|
|
|
"mSpanFree",
|
2016-09-09 10:22:10 -04:00
|
|
|
}
|
|
|
|
|
|
2019-10-23 11:25:38 -04:00
|
|
|
// mSpanStateBox holds an mSpanState and provides atomic operations on
|
|
|
|
|
// it. This is a separate type to disallow accidental comparison or
|
|
|
|
|
// assignment with mSpanState.
|
|
|
|
|
type mSpanStateBox struct {
|
|
|
|
|
s mSpanState
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (b *mSpanStateBox) set(s mSpanState) {
|
|
|
|
|
atomic.Store8((*uint8)(&b.s), uint8(s))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (b *mSpanStateBox) get() mSpanState {
|
|
|
|
|
return mSpanState(atomic.Load8((*uint8)(&b.s)))
|
|
|
|
|
}
|
|
|
|
|
|
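Taken together, set and get are the publication protocol described above: initialize every other span field first, store the state last, and never trust the other fields until an atomic load has observed mSpanInUse. Below is a minimal stand-alone sketch of the same pattern using sync/atomic rather than the runtime-internal atomic package; spanInfo, publish, and tryRead are invented names, not runtime APIs.

package main

import (
	"fmt"
	"sync/atomic"
)

const (
	stateDead  uint32 = iota // fields not yet valid
	stateInUse               // fields fully initialized
)

type spanInfo struct {
	base, npages uintptr
	state        atomic.Uint32 // stored last; acts as the publication barrier
}

// publish fills in the fields first and stores the state afterwards, so any
// reader that observes stateInUse also observes initialized fields.
func (s *spanInfo) publish(base, npages uintptr) {
	s.base = base
	s.npages = npages
	s.state.Store(stateInUse)
}

// tryRead checks the state before depending on the other fields, mirroring
// the "load the state atomically, then check mSpanInUse" rule above.
func (s *spanInfo) tryRead() (base, npages uintptr, ok bool) {
	if s.state.Load() != stateInUse {
		return 0, 0, false
	}
	return s.base, s.npages, true
}

func main() {
	var s spanInfo
	_, _, ok := s.tryRead()
	fmt.Println(ok) // false: not published yet
	s.publish(0x1000, 4)
	base, npages, _ := s.tryRead()
	fmt.Printf("%#x %d\n", base, npages) // 0x1000 4
}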
2015-10-15 15:59:49 -07:00
|
|
|
// mSpanList heads a linked list of spans.
|
|
|
|
|
//
|
2016-10-11 22:58:21 -04:00
|
|
|
//go:notinheap
|
2015-10-15 15:59:49 -07:00
|
|
|
type mSpanList struct {
|
2016-10-11 11:47:14 -04:00
|
|
|
first *mspan // first span in list, or nil if none
|
|
|
|
|
last *mspan // last span in list, or nil if none
|
2015-10-15 15:59:49 -07:00
|
|
|
}
|
|
|
|
|
|
2016-10-11 22:58:21 -04:00
|
|
|
//go:notinheap
|
2015-02-19 13:38:46 -05:00
|
|
|
type mspan struct {
|
2015-10-15 15:59:49 -07:00
|
|
|
next *mspan // next span in list, or nil if none
|
2016-10-11 11:47:14 -04:00
|
|
|
prev *mspan // previous span in list, or nil if none
|
2015-10-15 15:59:49 -07:00
|
|
|
list *mSpanList // For debugging. TODO: Remove.
|
2016-04-28 11:21:01 -04:00
|
|
|
|
2017-03-16 15:02:02 -04:00
|
|
|
startAddr uintptr // address of first byte of span aka s.base()
|
|
|
|
|
npages uintptr // number of pages in span
|
|
|
|
|
|
2018-09-26 16:39:02 -04:00
|
|
|
manualFreeList gclinkptr // list of free objects in mSpanManual spans
|
2016-02-04 11:41:48 -05:00
|
|
|
|
|
|
|
|
// freeindex is the slot index between 0 and nelems at which to begin scanning
|
|
|
|
|
// for the next free object in this span.
|
|
|
|
|
// Each allocation scans allocBits starting at freeindex until it encounters a 0
|
|
|
|
|
// indicating a free object. freeindex is then adjusted so that subsequent scans begin
|
2017-03-05 09:14:38 -08:00
|
|
|
// just past the newly discovered free object.
|
2016-02-04 11:41:48 -05:00
|
|
|
//
|
|
|
|
|
// If freeindex == nelems, this span has no free objects.
|
|
|
|
|
//
|
|
|
|
|
// allocBits is a bitmap of objects in this span.
|
|
|
|
|
// If n >= freeindex and allocBits[n/8] & (1<<(n%8)) is 0
|
|
|
|
|
// then object n is free;
|
|
|
|
|
// otherwise, object n is allocated. Bits starting at nelem are
|
|
|
|
|
// undefined and should never be referenced.
|
|
|
|
|
//
|
|
|
|
|
// Object n starts at address n*elemsize + (start << pageShift).
|
2016-02-24 14:36:30 -05:00
|
|
|
freeindex uintptr
|
2016-03-02 12:15:02 -05:00
|
|
|
// TODO: Look up nelems from sizeclass and remove this field if it
|
|
|
|
|
// helps performance.
|
|
|
|
|
nelems uintptr // number of objects in the span.
|
2016-02-24 14:36:30 -05:00
|
|
|
|
|
|
|
|
// Cache of the allocBits at freeindex. allocCache is shifted
|
|
|
|
|
// such that the lowest bit corresponds to the bit freeindex.
|
|
|
|
|
// allocCache holds the complement of allocBits, thus allowing
|
2016-03-31 10:45:36 -04:00
|
|
|
// ctz (count trailing zero) to use it directly.
|
2016-02-24 14:36:30 -05:00
|
|
|
// allocCache may contain bits beyond s.nelems; the caller must ignore
|
|
|
|
|
// these.
|
|
|
|
|
allocCache uint64
|
2016-02-04 11:41:48 -05:00
|
|
|
|
2016-03-14 12:17:48 -04:00
|
|
|
// allocBits and gcmarkBits hold pointers to a span's mark and
|
|
|
|
|
// allocation bits. The pointers are 8 byte aligned.
|
|
|
|
|
// There are four arenas where this data is held.
|
|
|
|
|
// free: Dirty arenas that are no longer accessed
|
|
|
|
|
// and can be reused.
|
|
|
|
|
// next: Holds information to be used in the next GC cycle.
|
|
|
|
|
// current: Information being used during this GC cycle.
|
|
|
|
|
// previous: Information being used during the last GC cycle.
|
|
|
|
|
// A new GC cycle starts with the call to finishsweep_m.
|
|
|
|
|
// finishsweep_m moves the previous arena to the free arena,
|
|
|
|
|
// the current arena to the previous arena, and
|
|
|
|
|
// the next arena to the current arena.
|
|
|
|
|
// The next arena is populated as the spans request
|
|
|
|
|
// memory to hold gcmarkBits for the next GC cycle as well
|
|
|
|
|
// as allocBits for newly allocated spans.
|
|
|
|
|
//
|
|
|
|
|
// The pointer arithmetic is done "by hand" instead of using
|
|
|
|
|
// arrays to avoid bounds checks along critical performance
|
|
|
|
|
// paths.
|
|
|
|
|
// The sweep will free the old allocBits and set allocBits to the
|
|
|
|
|
// gcmarkBits. The gcmarkBits are replaced with a fresh zeroed
|
|
|
|
|
// out memory.
|
2017-03-24 12:02:12 -04:00
|
|
|
allocBits *gcBits
|
|
|
|
|
gcmarkBits *gcBits
|
2016-02-04 11:41:48 -05:00
|
|
|
|
2015-02-19 13:38:46 -05:00
|
|
|
// sweep generation:
|
|
|
|
|
// if sweepgen == h->sweepgen - 2, the span needs sweeping
|
|
|
|
|
// if sweepgen == h->sweepgen - 1, the span is currently being swept
|
|
|
|
|
// if sweepgen == h->sweepgen, the span is swept and ready to use
|
2018-08-23 13:14:19 -04:00
|
|
|
// if sweepgen == h->sweepgen + 1, the span was cached before sweep began and is still cached, and needs sweeping
|
|
|
|
|
// if sweepgen == h->sweepgen + 3, the span was swept and then cached and is still cached
|
2015-02-19 13:38:46 -05:00
|
|
|
// h->sweepgen is incremented by 2 after every GC
|
2015-04-15 17:08:58 -04:00
|
|
|
|
2015-02-19 13:38:46 -05:00
|
|
|
sweepgen uint32
|
2021-03-11 15:45:52 -08:00
|
|
|
divMul uint32 // for divide by elemsize
|
2019-10-23 11:25:38 -04:00
|
|
|
allocCount uint16 // number of allocated objects
|
|
|
|
|
spanclass spanClass // size class and noscan (uint8)
|
|
|
|
|
state mSpanStateBox // mSpanInUse etc; accessed atomically (get/set methods)
|
|
|
|
|
needzero uint8 // needs to be zeroed before allocation
|
|
|
|
|
elemsize uintptr // computed from sizeclass or from npages
|
|
|
|
|
limit uintptr // end of data in span
|
|
|
|
|
speciallock mutex // guards specials list
|
|
|
|
|
specials *special // linked list of special records sorted by offset.
|
2015-02-19 13:38:46 -05:00
|
|
|
}
|
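The allocCache comments above describe caching the complement of allocBits so that a count-trailing-zeros instruction lands directly on the next free slot. A tiny stand-alone illustration of that trick follows; the variable names are local to the sketch, not the runtime's fields.

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// allocBits for the first 64 objects of a span: 1 = allocated, 0 = free.
	var allocBits uint64 = 0b0000_0111 // objects 0, 1 and 2 are allocated
	allocCache := ^allocBits           // complement: a set bit now marks a free object
	next := bits.TrailingZeros64(allocCache)
	fmt.Println(next) // 3: index of the first free object at or after freeindex
}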
2014-11-11 17:05:02 -05:00
|
|
|
|
2015-02-19 13:38:46 -05:00
|
|
|
func (s *mspan) base() uintptr {
|
2016-03-14 12:02:02 -04:00
|
|
|
return s.startAddr
|
2015-02-19 13:38:46 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (s *mspan) layout() (size, n, total uintptr) {
|
|
|
|
|
total = s.npages << _PageShift
|
|
|
|
|
size = s.elemsize
|
|
|
|
|
if size > 0 {
|
|
|
|
|
n = total / size
|
|
|
|
|
}
|
|
|
|
|
return
|
|
|
|
|
}
|
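layout() derives the span's byte size from its page count and divides by the element size. The following self-contained sketch repeats that arithmetic with an assumed 8 KiB runtime page size (pageShift == 13); the constant is an assumption for illustration, not read from this file.

package main

import "fmt"

const pageShift = 13 // assume 8 KiB pages

func layout(npages, elemsize uintptr) (size, n, total uintptr) {
	total = npages << pageShift
	size = elemsize
	if size > 0 {
		n = total / size
	}
	return
}

func main() {
	size, n, total := layout(1, 1024)
	fmt.Println(size, n, total) // 1024 8 8192: one 8 KiB page holds eight 1 KiB objects
}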
2014-11-11 17:05:02 -05:00
|
|
|
|
2017-10-25 13:46:54 -04:00
|
|
|
// recordspan adds a newly allocated span to h.allspans.
|
|
|
|
|
//
|
|
|
|
|
// This only happens the first time a span is allocated from
|
|
|
|
|
// mheap.spanalloc (it is not called when a span is reused).
|
|
|
|
|
//
|
|
|
|
|
// Write barriers are disallowed here because it can be called from
|
|
|
|
|
// gcWork when allocating new workbufs. However, because it's an
|
|
|
|
|
// indirect call from the fixalloc initializer, the compiler can't see
|
|
|
|
|
// this.
|
|
|
|
|
//
|
2020-08-21 11:59:55 -04:00
|
|
|
// The heap lock must be held.
|
|
|
|
|
//
|
2017-10-25 13:46:54 -04:00
|
|
|
//go:nowritebarrierrec
|
2014-11-11 17:05:02 -05:00
|
|
|
func recordspan(vh unsafe.Pointer, p unsafe.Pointer) {
|
|
|
|
|
h := (*mheap)(vh)
|
|
|
|
|
s := (*mspan)(p)
|
2020-08-21 11:59:55 -04:00
|
|
|
|
|
|
|
|
assertLockHeld(&h.lock)
|
|
|
|
|
|
2016-10-04 15:51:31 -04:00
|
|
|
if len(h.allspans) >= cap(h.allspans) {
|
2021-06-16 23:05:44 +00:00
|
|
|
n := 64 * 1024 / goarch.PtrSize
|
2016-10-04 15:51:31 -04:00
|
|
|
if n < cap(h.allspans)*3/2 {
|
|
|
|
|
n = cap(h.allspans) * 3 / 2
|
2014-11-11 17:05:02 -05:00
|
|
|
}
|
|
|
|
|
var new []*mspan
|
|
|
|
|
sp := (*slice)(unsafe.Pointer(&new))
|
2021-06-16 23:05:44 +00:00
|
|
|
sp.array = sysAlloc(uintptr(n)*goarch.PtrSize, &memstats.other_sys)
|
2014-11-11 17:05:02 -05:00
|
|
|
if sp.array == nil {
|
2014-12-27 20:58:00 -08:00
|
|
|
throw("runtime: cannot allocate memory")
|
2014-11-11 17:05:02 -05:00
|
|
|
}
|
2016-10-04 15:51:31 -04:00
|
|
|
sp.len = len(h.allspans)
|
2015-04-11 10:01:54 +12:00
|
|
|
sp.cap = n
|
2016-10-04 15:51:31 -04:00
|
|
|
if len(h.allspans) > 0 {
|
|
|
|
|
copy(new, h.allspans)
|
|
|
|
|
}
|
|
|
|
|
oldAllspans := h.allspans
|
2017-10-25 13:46:54 -04:00
|
|
|
*(*notInHeapSlice)(unsafe.Pointer(&h.allspans)) = *(*notInHeapSlice)(unsafe.Pointer(&new))
|
2016-10-05 21:22:33 -04:00
|
|
|
if len(oldAllspans) != 0 {
|
2016-10-04 15:51:31 -04:00
|
|
|
sysFree(unsafe.Pointer(&oldAllspans[0]), uintptr(cap(oldAllspans))*unsafe.Sizeof(oldAllspans[0]), &memstats.other_sys)
|
2014-11-11 17:05:02 -05:00
|
|
|
}
|
|
|
|
|
}
|
2017-10-25 13:46:54 -04:00
|
|
|
h.allspans = h.allspans[:len(h.allspans)+1]
|
|
|
|
|
h.allspans[len(h.allspans)-1] = s
|
2014-11-11 17:05:02 -05:00
|
|
|
}
|
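recordspan grows h.allspans by hand: the new capacity is at least 64 KiB worth of pointers and otherwise 1.5x the old capacity, after which the old backing array is copied and freed. The sketch below shows only that growth policy, using an ordinary garbage-collected slice for illustration; the real code must not do this, which is why it allocates with sysAlloc and releases with sysFree.

package main

import (
	"fmt"
	"unsafe"
)

// grow applies the same capacity rule as recordspan, but with a normal slice.
func grow(old []*int) []*int {
	n := 64 * 1024 / int(unsafe.Sizeof((*int)(nil))) // at least 64 KiB of pointers
	if n < cap(old)*3/2 {
		n = cap(old) * 3 / 2 // otherwise grow by 1.5x
	}
	grown := make([]*int, len(old), n)
	copy(grown, old)
	return grown
}

func main() {
	s := make([]*int, 0, 4)
	s = grow(s)
	fmt.Println(cap(s)) // 8192 on a 64-bit platform: 64 KiB / 8-byte pointers
}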
|
|
|
|
|
2016-02-09 17:53:07 -05:00
|
|
|
// A spanClass represents the size class and noscan-ness of a span.
|
|
|
|
|
//
|
|
|
|
|
// Each size class has a noscan spanClass and a scan spanClass. The
|
|
|
|
|
// noscan spanClass contains only noscan objects, which do not contain
|
|
|
|
|
// pointers and thus do not need to be scanned by the garbage
|
|
|
|
|
// collector.
|
|
|
|
|
type spanClass uint8
|
|
|
|
|
|
|
|
|
|
const (
|
|
|
|
|
numSpanClasses = _NumSizeClasses << 1
|
2016-06-17 09:33:33 -04:00
|
|
|
tinySpanClass = spanClass(tinySizeClass<<1 | 1)
|
2016-02-09 17:53:07 -05:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
func makeSpanClass(sizeclass uint8, noscan bool) spanClass {
|
|
|
|
|
return spanClass(sizeclass<<1) | spanClass(bool2int(noscan))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (sc spanClass) sizeclass() int8 {
|
|
|
|
|
return int8(sc >> 1)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (sc spanClass) noscan() bool {
|
|
|
|
|
return sc&1 != 0
|
|
|
|
|
}
|
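makeSpanClass, sizeclass and noscan pack the size class into the upper seven bits and the noscan flag into bit 0. A self-contained round-trip sketch of that encoding follows; bool2int is redefined locally because it is a runtime-internal helper.

package main

import "fmt"

type spanClass uint8

func bool2int(x bool) int {
	if x {
		return 1
	}
	return 0
}

func makeSpanClass(sizeclass uint8, noscan bool) spanClass {
	return spanClass(sizeclass<<1) | spanClass(bool2int(noscan))
}

func (sc spanClass) sizeclass() int8 { return int8(sc >> 1) }
func (sc spanClass) noscan() bool    { return sc&1 != 0 }

func main() {
	sc := makeSpanClass(5, true)
	fmt.Println(uint8(sc), sc.sizeclass(), sc.noscan()) // 11 5 true
}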
|
|
|
|
|
runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
on Linux only s390x has kernel support for 64-bit address spaces).
On these platforms, it would be good to support these larger
address spaces.
2. On Windows, processes are charged for untouched memory, so for
processes with small heaps, the mostly-untouched 32 MB arena map
plus a 64 MB arena are significant overhead. Hence, it would be
good to reduce both the arena map size and the arena size, but with
a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
entry in the L2 array, the representation is identical to the
current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
itself to keep some important functions inlined despite their
now-larger ASTs. In particular, this is necessary for
heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name old time/op new time/op delta
Garbage/benchmem-MB=64-12 2.28ms ± 1% 2.26ms ± 1% -1.07% (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
2018-02-22 20:38:09 -05:00
|
|
|
// arenaIndex returns the index into mheap_.arenas of the arena
|
|
|
|
|
// containing metadata for p. This index combines an index into the
|
|
|
|
|
// L1 map and an index into the L2 map and should be used as
|
|
|
|
|
// mheap_.arenas[ai.l1()][ai.l2()].
|
|
|
|
|
//
|
|
|
|
|
// If p is outside the range of valid heap addresses, either l1() or
|
|
|
|
|
// l2() will be out of bounds.
|
2018-02-16 17:53:16 -05:00
|
|
|
//
|
|
|
|
|
// It is nosplit because it's called by spanOf and several other
|
|
|
|
|
// nosplit functions.
|
|
|
|
|
//
|
|
|
|
|
//go:nosplit
|
2018-02-22 20:38:09 -05:00
|
|
|
func arenaIndex(p uintptr) arenaIdx {
|
runtime: make maxOffAddr reflect the actual address space upper bound
Currently maxOffAddr is defined in terms of the whole 64-bit address
space, assuming that it's all supported, by using ^uintptr(0) as the
maximal address in the offset space. In reality, the maximal address in
the offset space is (1<<heapAddrBits)-1 because we don't have more than
that actually available to us on a given platform.
On most platforms this is fine, because arenaBaseOffset is just
connecting two segments of address space, but on AIX we use it as an
actual offset for the starting address of the available address space,
which is limited. This means using ^uintptr(0) as the maximal address in
the offset address space causes wrap-around, especially when we just
want to represent a range approximately like [addr, infinity), which
today we do by using maxOffAddr.
To fix this, we define maxOffAddr more appropriately, in terms of
(1<<heapAddrBits)-1.
This change also redefines arenaBaseOffset to not be the negation of the
virtual address corresponding to address zero in the virtual address
space, but instead directly as the virtual address corresponding to
zero. This matches the existing documentation more closely and makes the
logic around arenaBaseOffset decidedly simpler, especially when trying
to reason about its use on AIX.
Fixes #38966.
Change-Id: I1336e5036a39de846f64cc2d253e8536dee57611
Reviewed-on: https://go-review.googlesource.com/c/go/+/233497
Run-TryBot: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Austin Clements <austin@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
2020-05-12 16:08:50 +00:00
|
|
|
return arenaIdx((p - arenaBaseOffset) / heapArenaBytes)
|
2018-02-16 17:53:16 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// arenaBase returns the low address of the region covered by heap
|
|
|
|
|
// arena i.
|
2018-02-22 20:38:09 -05:00
|
|
|
func arenaBase(i arenaIdx) uintptr {
|
2020-05-12 16:08:50 +00:00
|
|
|
return uintptr(i)*heapArenaBytes + arenaBaseOffset
|
2018-02-16 17:53:16 -05:00
|
|
|
}
|
|
|
|
|
|
2018-02-22 20:38:09 -05:00
|
|
|
type arenaIdx uint
|
|
|
|
|
|
|
|
|
|
func (i arenaIdx) l1() uint {
|
|
|
|
|
if arenaL1Bits == 0 {
|
|
|
|
|
// Let the compiler optimize this away if there's no
|
|
|
|
|
// L1 map.
|
|
|
|
|
return 0
|
|
|
|
|
} else {
|
|
|
|
|
return uint(i) >> arenaL1Shift
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (i arenaIdx) l2() uint {
|
|
|
|
|
if arenaL1Bits == 0 {
|
|
|
|
|
return uint(i)
|
|
|
|
|
} else {
|
|
|
|
|
return uint(i) & (1<<arenaL2Bits - 1)
|
|
|
|
|
}
|
|
|
|
|
}
|
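arenaIndex and arenaBase are inverses up to arena granularity: arenaBase(arenaIndex(p)) <= p < arenaBase(arenaIndex(p)) + heapArenaBytes. The round-trip sketch below uses simplified constants, 64 MiB arenas and an arenaBaseOffset of zero; the real values are platform-dependent, so the numbers are illustrative only (64-bit assumed).

package main

import "fmt"

const (
	heapArenaBytes  = 64 << 20 // assume 64 MiB arenas
	arenaBaseOffset = 0        // simplification; nonzero on some platforms
)

type arenaIdx uint

func arenaIndex(p uintptr) arenaIdx { return arenaIdx((p - arenaBaseOffset) / heapArenaBytes) }
func arenaBase(i arenaIdx) uintptr  { return uintptr(i)*heapArenaBytes + arenaBaseOffset }

func main() {
	p := uintptr(0xc000010000) // an example heap address (64-bit only)
	i := arenaIndex(p)
	fmt.Printf("arena %d covers [%#x, %#x)\n", i, arenaBase(i), arenaBase(i)+heapArenaBytes)
	fmt.Println(arenaBase(i) <= p && p < arenaBase(i)+heapArenaBytes) // true
}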
|
|
|
|
|
2015-02-19 13:38:46 -05:00
|
|
|
// inheap reports whether b is a pointer into a (potentially dead) heap object.
|
2018-09-26 16:39:02 -04:00
|
|
|
// It returns false for pointers into mSpanManual spans.
|
runtime: fix callwritebarrier
Given a call frame F of size N where the return values start at offset R,
callwritebarrier was instructing heapBitsBulkBarrier to scan the block
of memory [F+R, F+R+N). It should only scan [F+R, F+N). The extra N-R
bytes scanned might lead into the next allocated block in memory.
Because the scan was consulting the heap bitmap for type information,
scanning into the next block normally "just worked" in the sense of
not crashing.
Scanning the extra N-R bytes of memory is a problem mainly because
it causes the GC to consider pointers that might otherwise not be
considered, leading it to retain objects that should actually be freed.
This is very difficult to detect.
Luckily, juju turned up a case where the heap bitmap and the memory
were out of sync for the block immediately after the call frame, so that
heapBitsBulkBarrier saw an obvious non-pointer where it expected a
pointer, causing a loud crash.
Why is there a non-pointer in memory that the heap bitmap records as
a pointer? That is more difficult to answer. At least one way that it
could happen is that allocations containing no pointers at all do not
update the heap bitmap. So if heapBitsBulkBarrier walked out of the
current object and into a no-pointer object and consulted those bitmap
bits, it would be misled. This doesn't happen in general because all
the paths to heapBitsBulkBarrier first check for the no-pointer case.
This may or may not be what happened, but it's the only scenario
I've been able to construct.
I tried for quite a while to write a simple test for this and could not.
It does fix the juju crash, and it is clearly an improvement over the
old code.
Fixes #10844.
Change-Id: I53982c93ef23ef93155c4086bbd95a4c4fdaac9a
Reviewed-on: https://go-review.googlesource.com/10317
Reviewed-by: Austin Clements <austin@google.com>
2015-05-19 22:58:10 -04:00
|
|
|
// Non-preemptible because it is used by write barriers.
|
2015-02-19 13:38:46 -05:00
|
|
|
//go:nowritebarrier
|
2015-05-19 22:58:10 -04:00
|
|
|
//go:nosplit
|
2015-02-19 13:38:46 -05:00
|
|
|
func inheap(b uintptr) bool {
|
2017-12-04 11:02:59 -05:00
|
|
|
return spanOfHeap(b) != nil
|
2015-02-19 13:38:46 -05:00
|
|
|
}
|
|
|
|
|
|
2017-03-16 14:16:31 -04:00
|
|
|
// inHeapOrStack is a variant of inheap that returns true for pointers
|
|
|
|
|
// into any allocated heap span.
|
|
|
|
|
//
|
runtime: use entire address space on 32 bit
In issue #13992, Russ mentioned that the heap bitmap footprint was
halved but that the bitmap size calculation hadn't been updated. This
presents the opportunity to either halve the bitmap size or double
the addressable virtual space. This CL doubles the addressable virtual
space. On 32 bit this can be tweaked further to allow the bitmap to
cover the entire 4GB virtual address space, removing a failure mode
if the kernel hands out memory with a too low address.
First, fix the calculation and double _MaxArena32 to cover 4GB virtual
memory space with the same bitmap size (256 MB).
Then, allow the fallback mode for the initial memory reservation
on 32 bit (or 64 bit with too little available virtual memory) to not
include space for the arena. mheap.sysAlloc will automatically reserve
additional space when the existing arena is full.
Finally, set arena_start to 0 in 32 bit mode, so that any address is
acceptable for subsequent (additional) reservations.
Before, the bitmap was always located just before arena_start, so
fix the two places relying on that assumption: Point the otherwise unused
mheap.bitmap to one byte after the end of the bitmap, and use it for
bitmap addressing instead of arena_start.
With arena_start set to 0 on 32 bit, the cgoInRange check is no longer a
sufficient check for Go pointers. Introduce and call inHeapOrStack to
check whether a pointer is to the Go heap or stack.
While we're here, remove sysReserveHigh which seems to be unused.
Fixes #13992
Change-Id: I592b513148a50b9d3967b5c5d94b86b3ec39acc2
Reviewed-on: https://go-review.googlesource.com/20471
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-03-09 10:00:12 +01:00
|
|
|
//go:nowritebarrier
|
|
|
|
|
//go:nosplit
|
|
|
|
|
func inHeapOrStack(b uintptr) bool {
|
2017-12-04 11:02:59 -05:00
|
|
|
s := spanOf(b)
|
2016-03-09 10:00:12 +01:00
|
|
|
if s == nil || b < s.base() {
|
|
|
|
|
return false
|
|
|
|
|
}
|
2019-10-23 11:25:38 -04:00
|
|
|
switch s.state.get() {
|
2018-09-26 16:39:02 -04:00
|
|
|
case mSpanInUse, mSpanManual:
|
2016-03-09 10:00:12 +01:00
|
|
|
return b < s.limit
|
|
|
|
|
default:
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2017-12-04 10:58:15 -05:00
|
|
|
// spanOf returns the span of p. If p does not point into the heap
|
|
|
|
|
// arena or no span has ever contained p, spanOf returns nil.
|
|
|
|
|
//
|
|
|
|
|
// If p does not point to allocated memory, this may return a non-nil
|
|
|
|
|
// span that does *not* contain p. If this is a possibility, the
|
|
|
|
|
// caller should either call spanOfHeap or check the span bounds
|
|
|
|
|
// explicitly.
|
2017-12-04 11:02:59 -05:00
|
|
|
//
|
|
|
|
|
// Must be nosplit because it has callers that are nosplit.
|
|
|
|
|
//
|
|
|
|
|
//go:nosplit
|
runtime: eliminate one heapBitsForObject from scanobject
scanobject with ptrmask!=nil is only ever called with the base
pointer of a heap object. Currently, scanobject calls
heapBitsForObject, which goes to a great deal of trouble to check
that the pointer points into the heap and to find the base of the
object it points to, both of which are completely unnecessary in
this case.
Replace this call to heapBitsForObject with much simpler logic to
fetch the span and compute the heap bits.
Benchmark results with five runs:
name old mean new mean delta
BenchmarkBinaryTree17 9.21s × (0.95,1.02) 8.55s × (0.91,1.03) -7.16% (p=0.022)
BenchmarkFannkuch11 2.65s × (1.00,1.00) 2.62s × (1.00,1.00) -1.10% (p=0.000)
BenchmarkFmtFprintfEmpty 73.2ns × (0.99,1.01) 71.7ns × (1.00,1.01) -1.99% (p=0.004)
BenchmarkFmtFprintfString 302ns × (0.99,1.00) 292ns × (0.98,1.02) -3.31% (p=0.020)
BenchmarkFmtFprintfInt 281ns × (0.98,1.01) 279ns × (0.96,1.02) ~ (p=0.596)
BenchmarkFmtFprintfIntInt 482ns × (0.98,1.01) 488ns × (0.95,1.02) ~ (p=0.419)
BenchmarkFmtFprintfPrefixedInt 382ns × (0.99,1.01) 365ns × (0.96,1.02) -4.35% (p=0.015)
BenchmarkFmtFprintfFloat 475ns × (0.99,1.01) 472ns × (1.00,1.00) ~ (p=0.108)
BenchmarkFmtManyArgs 1.89µs × (1.00,1.01) 1.90µs × (0.94,1.02) ~ (p=0.883)
BenchmarkGobDecode 22.4ms × (0.99,1.01) 21.9ms × (0.92,1.04) ~ (p=0.332)
BenchmarkGobEncode 24.7ms × (0.98,1.02) 23.9ms × (0.87,1.07) ~ (p=0.407)
BenchmarkGzip 397ms × (0.99,1.01) 398ms × (0.99,1.01) ~ (p=0.718)
BenchmarkGunzip 96.7ms × (1.00,1.00) 96.9ms × (1.00,1.00) ~ (p=0.230)
BenchmarkHTTPClientServer 71.5µs × (0.98,1.01) 68.5µs × (0.92,1.06) ~ (p=0.243)
BenchmarkJSONEncode 46.1ms × (0.98,1.01) 44.9ms × (0.98,1.03) -2.51% (p=0.040)
BenchmarkJSONDecode 86.1ms × (0.99,1.01) 86.5ms × (0.99,1.01) ~ (p=0.343)
BenchmarkMandelbrot200 4.12ms × (1.00,1.00) 4.13ms × (1.00,1.00) +0.23% (p=0.000)
BenchmarkGoParse 5.89ms × (0.96,1.03) 5.82ms × (0.96,1.04) ~ (p=0.522)
BenchmarkRegexpMatchEasy0_32 141ns × (0.99,1.01) 142ns × (1.00,1.00) ~ (p=0.178)
BenchmarkRegexpMatchEasy0_1K 408ns × (1.00,1.00) 392ns × (0.99,1.00) -3.83% (p=0.000)
BenchmarkRegexpMatchEasy1_32 122ns × (1.00,1.00) 122ns × (1.00,1.00) ~ (p=0.178)
BenchmarkRegexpMatchEasy1_1K 626ns × (1.00,1.01) 624ns × (0.99,1.00) ~ (p=0.122)
BenchmarkRegexpMatchMedium_32 202ns × (0.99,1.00) 205ns × (0.99,1.01) +1.58% (p=0.001)
BenchmarkRegexpMatchMedium_1K 54.4µs × (1.00,1.00) 55.5µs × (1.00,1.00) +1.86% (p=0.000)
BenchmarkRegexpMatchHard_32 2.68µs × (1.00,1.00) 2.71µs × (1.00,1.00) +0.97% (p=0.002)
BenchmarkRegexpMatchHard_1K 79.8µs × (1.00,1.01) 80.5µs × (1.00,1.01) +0.94% (p=0.003)
BenchmarkRevcomp 590ms × (0.99,1.01) 585ms × (1.00,1.00) ~ (p=0.066)
BenchmarkTemplate 111ms × (0.97,1.02) 112ms × (0.99,1.01) ~ (p=0.201)
BenchmarkTimeParse 392ns × (1.00,1.00) 385ns × (1.00,1.00) -1.69% (p=0.000)
BenchmarkTimeFormat 449ns × (0.98,1.01) 448ns × (0.99,1.01) ~ (p=0.550)
Change-Id: Ie7c3830c481d96c9043e7bf26853c6c1d05dc9f4
Reviewed-on: https://go-review.googlesource.com/9364
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-26 18:27:17 -04:00
|
|
|
func spanOf(p uintptr) *mspan {
|
2018-02-22 20:38:09 -05:00
|
|
|
// This function looks big, but we use a lot of constant
|
|
|
|
|
// folding around arenaL1Bits to get it under the inlining
|
|
|
|
|
// budget. Also, many of the checks here are safety checks
|
|
|
|
|
// that Go needs to do anyway, so the generated code is quite
|
|
|
|
|
// short.
|
2018-02-16 17:53:16 -05:00
|
|
|
ri := arenaIndex(p)
|
2018-02-22 20:38:09 -05:00
    if arenaL1Bits == 0 {
        // If there's no L1, then ri.l1() can't be out of bounds but ri.l2() can.
        if ri.l2() >= uint(len(mheap_.arenas[0])) {
            return nil
        }
    } else {
        // If there's an L1, then ri.l1() can be out of bounds but ri.l2() can't.
        if ri.l1() >= uint(len(mheap_.arenas)) {
            return nil
        }
    }
    l2 := mheap_.arenas[ri.l1()]
    if arenaL1Bits != 0 && l2 == nil { // Should never happen if there's no L1.
        return nil
    }
    ha := l2[ri.l2()]
    if ha == nil {
runtime: eliminate one heapBitsForObject from scanobject
scanobject with ptrmask!=nil is only ever called with the base
pointer of a heap object. Currently, scanobject calls
heapBitsForObject, which goes to a great deal of trouble to check
that the pointer points into the heap and to find the base of the
object it points to, both of which are completely unnecessary in
this case.
Replace this call to heapBitsForObject with much simpler logic to
fetch the span and compute the heap bits.
Benchmark results with five runs:
name old mean new mean delta
BenchmarkBinaryTree17 9.21s × (0.95,1.02) 8.55s × (0.91,1.03) -7.16% (p=0.022)
BenchmarkFannkuch11 2.65s × (1.00,1.00) 2.62s × (1.00,1.00) -1.10% (p=0.000)
BenchmarkFmtFprintfEmpty 73.2ns × (0.99,1.01) 71.7ns × (1.00,1.01) -1.99% (p=0.004)
BenchmarkFmtFprintfString 302ns × (0.99,1.00) 292ns × (0.98,1.02) -3.31% (p=0.020)
BenchmarkFmtFprintfInt 281ns × (0.98,1.01) 279ns × (0.96,1.02) ~ (p=0.596)
BenchmarkFmtFprintfIntInt 482ns × (0.98,1.01) 488ns × (0.95,1.02) ~ (p=0.419)
BenchmarkFmtFprintfPrefixedInt 382ns × (0.99,1.01) 365ns × (0.96,1.02) -4.35% (p=0.015)
BenchmarkFmtFprintfFloat 475ns × (0.99,1.01) 472ns × (1.00,1.00) ~ (p=0.108)
BenchmarkFmtManyArgs 1.89µs × (1.00,1.01) 1.90µs × (0.94,1.02) ~ (p=0.883)
BenchmarkGobDecode 22.4ms × (0.99,1.01) 21.9ms × (0.92,1.04) ~ (p=0.332)
BenchmarkGobEncode 24.7ms × (0.98,1.02) 23.9ms × (0.87,1.07) ~ (p=0.407)
BenchmarkGzip 397ms × (0.99,1.01) 398ms × (0.99,1.01) ~ (p=0.718)
BenchmarkGunzip 96.7ms × (1.00,1.00) 96.9ms × (1.00,1.00) ~ (p=0.230)
BenchmarkHTTPClientServer 71.5µs × (0.98,1.01) 68.5µs × (0.92,1.06) ~ (p=0.243)
BenchmarkJSONEncode 46.1ms × (0.98,1.01) 44.9ms × (0.98,1.03) -2.51% (p=0.040)
BenchmarkJSONDecode 86.1ms × (0.99,1.01) 86.5ms × (0.99,1.01) ~ (p=0.343)
BenchmarkMandelbrot200 4.12ms × (1.00,1.00) 4.13ms × (1.00,1.00) +0.23% (p=0.000)
BenchmarkGoParse 5.89ms × (0.96,1.03) 5.82ms × (0.96,1.04) ~ (p=0.522)
BenchmarkRegexpMatchEasy0_32 141ns × (0.99,1.01) 142ns × (1.00,1.00) ~ (p=0.178)
BenchmarkRegexpMatchEasy0_1K 408ns × (1.00,1.00) 392ns × (0.99,1.00) -3.83% (p=0.000)
BenchmarkRegexpMatchEasy1_32 122ns × (1.00,1.00) 122ns × (1.00,1.00) ~ (p=0.178)
BenchmarkRegexpMatchEasy1_1K 626ns × (1.00,1.01) 624ns × (0.99,1.00) ~ (p=0.122)
BenchmarkRegexpMatchMedium_32 202ns × (0.99,1.00) 205ns × (0.99,1.01) +1.58% (p=0.001)
BenchmarkRegexpMatchMedium_1K 54.4µs × (1.00,1.00) 55.5µs × (1.00,1.00) +1.86% (p=0.000)
BenchmarkRegexpMatchHard_32 2.68µs × (1.00,1.00) 2.71µs × (1.00,1.00) +0.97% (p=0.002)
BenchmarkRegexpMatchHard_1K 79.8µs × (1.00,1.01) 80.5µs × (1.00,1.01) +0.94% (p=0.003)
BenchmarkRevcomp 590ms × (0.99,1.01) 585ms × (1.00,1.00) ~ (p=0.066)
BenchmarkTemplate 111ms × (0.97,1.02) 112ms × (0.99,1.01) ~ (p=0.201)
BenchmarkTimeParse 392ns × (1.00,1.00) 385ns × (1.00,1.00) -1.69% (p=0.000)
BenchmarkTimeFormat 449ns × (0.98,1.01) 448ns × (0.99,1.01) ~ (p=0.550)
Change-Id: Ie7c3830c481d96c9043e7bf26853c6c1d05dc9f4
Reviewed-on: https://go-review.googlesource.com/9364
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-26 18:27:17 -04:00
        return nil
    }
    return ha.spans[(p/pageSize)%pagesPerArena]
}

// spanOfUnchecked is equivalent to spanOf, but the caller must ensure
runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failure modes of heap initialization and
growing. In particular it eliminates any possibility of panicking
with an address space conflict. This can happen for many reasons and
even causes a low but steady rate of TSAN test failures because of
conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
because creating huge address space reservations (particularly on
64-bit) led to huge process VSIZE. This was at best confusing and at
worst conflicted badly with ulimit -v. However, the non-reserved
heap logic is complicated, can race with other mappings in non-pure
Go binaries (e.g., #18976), and requires that the entire heap be
either reserved or non-reserved. We currently maintain the latter
property, but it's quite difficult to convince yourself of that, and
hence difficult to keep correct. This logic is still present, but
will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
space leads to mapping huge (and never-to-be-used) metadata
structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
#20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name old time/op new time/op delta
Template 184ms ± 1% 185ms ± 1% ~ (p=0.065 n=10+9)
Unicode 86.9ms ± 3% 86.3ms ± 1% ~ (p=0.631 n=10+10)
GoTypes 599ms ± 0% 602ms ± 0% +0.56% (p=0.000 n=10+9)
Compiler 2.87s ± 1% 2.89s ± 1% +0.51% (p=0.002 n=9+10)
SSA 7.29s ± 1% 7.25s ± 1% ~ (p=0.182 n=10+9)
Flate 118ms ± 2% 118ms ± 1% ~ (p=0.113 n=9+9)
GoParser 147ms ± 1% 148ms ± 1% +1.07% (p=0.003 n=9+10)
Reflect 401ms ± 1% 404ms ± 1% +0.71% (p=0.003 n=10+9)
Tar 175ms ± 1% 175ms ± 1% ~ (p=0.604 n=9+10)
XML 209ms ± 1% 210ms ± 1% ~ (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name old time/op new time/op delta
Garbage/benchmem-MB=64-12 2.23ms ± 1% 2.25ms ± 1% +0.84% (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name old time/op new time/op delta
Template 183ms ± 1% 185ms ± 1% +1.32% (p=0.000 n=9+9)
Unicode 84.9ms ± 2% 86.3ms ± 1% +1.65% (p=0.000 n=9+10)
GoTypes 595ms ± 1% 602ms ± 0% +1.19% (p=0.000 n=9+9)
Compiler 2.86s ± 0% 2.89s ± 1% +0.91% (p=0.000 n=9+10)
SSA 7.19s ± 0% 7.25s ± 1% +0.75% (p=0.000 n=8+9)
Flate 117ms ± 1% 118ms ± 1% +1.10% (p=0.000 n=10+9)
GoParser 146ms ± 2% 148ms ± 1% +1.48% (p=0.002 n=10+10)
Reflect 398ms ± 1% 404ms ± 1% +1.51% (p=0.000 n=10+9)
Tar 173ms ± 1% 175ms ± 1% +1.17% (p=0.000 n=10+10)
XML 208ms ± 1% 210ms ± 1% +0.62% (p=0.011 n=10+10)
[Geo mean] 369ms 373ms +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name old time/op new time/op delta
Garbage/benchmem-MB=64-12 2.22ms ± 1% 2.25ms ± 1% +1.51% (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
2017-12-19 22:05:23 -08:00
// that p points into an allocated heap arena.
//
// Must be nosplit because it has callers that are nosplit.
//
//go:nosplit
func spanOfUnchecked(p uintptr) *mspan {
    ai := arenaIndex(p)
    return mheap_.arenas[ai.l1()][ai.l2()].spans[(p/pageSize)%pagesPerArena]
}

// spanOfHeap is like spanOf, but returns nil if p does not point to a
// heap object.
//
// Must be nosplit because it has callers that are nosplit.
//
//go:nosplit
func spanOfHeap(p uintptr) *mspan {
    s := spanOf(p)
runtime: atomically set span state and use as publication barrier
When everything is working correctly, any pointer the garbage
collector encounters can only point into a fully initialized heap
span, since the span must have been initialized before that pointer
could escape the heap allocator and become visible to the GC.
However, in various cases, we try to be defensive against bad
pointers. In findObject, this is just a sanity check: we never expect
to find a bad pointer, but programming errors can lead to them. In
spanOfHeap, we don't necessarily trust the pointer and we're trying to
check if it really does point to the heap, though it should always
point to something. Conservative scanning takes this to a new level,
since it can only guess that a word may be a pointer and verify this.
In all of these cases, we have a problem that the span lookup and
check can race with span initialization, since the span becomes
visible to lookups before it's fully initialized.
Furthermore, we're about to start initializing the span without the
heap lock held, which is going to introduce races where accesses were
previously protected by the heap lock.
To address this, this CL makes accesses to mspan.state atomic, and
ensures that the span is fully initialized before setting the state to
mSpanInUse. All loads are now atomic, and in any case where we don't
trust the pointer, it first atomically loads the span state and checks
that it's mSpanInUse, after which it will have synchronized with span
initialization and can safely check the other span fields.
For #10958, #24543, but a good fix in general.
Change-Id: I518b7c63555b02064b98aa5f802c92b758fef853
Reviewed-on: https://go-review.googlesource.com/c/go/+/203286
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
2019-10-23 11:25:38 -04:00
    // s is nil if it's never been allocated. Otherwise, we check
    // its state first because we don't trust this pointer, so we
    // have to synchronize with span initialization. Then, it's
    // still possible we picked up a stale span pointer, so we
    // have to check the span's bounds.
    if s == nil || s.state.get() != mSpanInUse || p < s.base() || p >= s.limit {
        return nil
    }
    return s
}
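The state check in spanOfHeap above is the consumer side of the publish/consume pattern described in the commit message: a span is published by atomically storing its state only after every other field is initialized, and an untrusted reader atomically loads the state before believing anything else. A minimal standalone sketch of that pattern, with invented names (span, stateInUse) rather than the runtime's mspan and mSpanInUse:

package main

import (
    "fmt"
    "sync/atomic"
)

const stateInUse = 1 // stands in for mSpanInUse

type span struct {
    state uint32 // set to stateInUse only after the other fields are filled in
    base  uintptr
    limit uintptr
}

// initSpan fills in all fields and only then flips the state, so any
// reader that observes stateInUse also observes the initialized fields.
func initSpan(s *span, base, npages uintptr) {
    s.base = base
    s.limit = base + npages*8192
    atomic.StoreUint32(&s.state, stateInUse) // publication barrier
}

// lookup mimics spanOfHeap: it does not trust p, so it checks the
// atomically loaded state before believing the span's bounds.
func lookup(s *span, p uintptr) *span {
    if s == nil || atomic.LoadUint32(&s.state) != stateInUse || p < s.base || p >= s.limit {
        return nil
    }
    return s
}

func main() {
    s := &span{}
    fmt.Println(lookup(s, 4096)) // nil: not yet published
    initSpan(s, 0, 2)
    fmt.Println(lookup(s, 4096)) // now visible, bounds check passes
}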
// pageIndexOf returns the arena, page index, and page mask for pointer p.
// The caller must ensure p is in the heap.
func pageIndexOf(p uintptr) (arena *heapArena, pageIdx uintptr, pageMask uint8) {
    ai := arenaIndex(p)
    arena = mheap_.arenas[ai.l1()][ai.l2()]
    pageIdx = ((p / pageSize) / 8) % uintptr(len(arena.pageInUse))
    pageMask = byte(1 << ((p / pageSize) % 8))
    return
}
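To make the bitmap arithmetic in pageIndexOf concrete, here is a tiny standalone example that computes the same byte index and bit mask for a few addresses. The 8 KiB page size and the bitmap length are assumptions for the illustration, not a statement about the real configuration.

package main

import "fmt"

const pageSize = 8192 // assumed for the example

func pageBit(p uintptr, bitmapLen int) (pageIdx uintptr, pageMask uint8) {
    page := p / pageSize
    pageIdx = (page / 8) % uintptr(bitmapLen) // which byte of the bitmap
    pageMask = uint8(1 << (page % 8))         // which bit within that byte
    return
}

func main() {
    for _, p := range []uintptr{0, 8192, 9 * 8192, 100 * 8192} {
        idx, mask := pageBit(p, 1024)
        fmt.Printf("p=%#x -> pageInUse[%d] mask %08b\n", p, idx, mask)
    }
}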

// Initialize the heap.
func (h *mheap) init() {
runtime: static lock ranking for the runtime (enabled by GOEXPERIMENT)
I took some of the infrastructure from Austin's lock logging CR
https://go-review.googlesource.com/c/go/+/192704 (with deadlock
detection from the logs), and developed a setup to give static lock
ranking for runtime locks.
Static lock ranking establishes a documented total ordering among locks,
and then reports an error if the total order is violated. This can
happen if a deadlock happens (by acquiring a sequence of locks in
different orders), or if just one side of a possible deadlock happens.
Lock ordering deadlocks cannot happen as long as the lock ordering is
followed.
Along the way, I found a deadlock involving the new timer code, which Ian fixed
via https://go-review.googlesource.com/c/go/+/207348, as well as two other
potential deadlocks.
See the constants at the top of runtime/lockrank.go to show the static
lock ranking that I ended up with, along with some comments. This is
great documentation of the current intended lock ordering when acquiring
multiple locks in the runtime.
I also added an array lockPartialOrder[] which shows and enforces the
current partial ordering among locks (which is embedded within the total
ordering). This is more specific about the dependencies among locks.
I don't try to check the ranking within a lock class with multiple locks
that can be acquired at the same time (i.e. check the ranking when
multiple hchan locks are acquired).
Currently, I am doing a lockInit() call to set the lock rank of most
locks. Any lock that is not otherwise initialized is assumed to be a
leaf lock (a very high rank lock), so that eliminates the need to do
anything for a bunch of locks (including all architecture-dependent
locks). For two locks, root.lock and notifyList.lock (only in the
runtime/sema.go file), it is not as easy to do lock initialization, so
instead, I am passing the lock rank with the lock calls.
For Windows compilation, I needed to increase the StackGuard size from
896 to 928 because of the new lock-rank checking functions.
Checking of the static lock ranking is enabled by setting
GOEXPERIMENT=staticlockranking before doing a run.
To make sure that the static lock ranking code has no overhead in memory
or CPU when not enabled by GOEXPERIMENT, I changed 'go build/install' so
that it defines a build tag (with the same name) whenever any experiment
has been baked into the toolchain (by checking Expstring()). This allows
me to avoid increasing the size of the 'mutex' type when static lock
ranking is not enabled.
Fixes #38029
Change-Id: I154217ff307c47051f8dae9c2a03b53081acd83a
Reviewed-on: https://go-review.googlesource.com/c/go/+/207619
Reviewed-by: Dan Scales <danscales@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Dan Scales <danscales@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-11-13 17:34:47 -08:00
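For illustration, the core of the rank check can be sketched in ordinary Go. The ranks, types, and the global held slice below are invented for the example (the runtime tracks held locks per M and defines its ranks in runtime/lockrank.go); the sketch only shows the rule being enforced: never acquire a lock whose rank is not strictly greater than every lock already held.

package main

import (
    "fmt"
    "sync"
)

// Hypothetical ranks; lower ranks must be acquired before higher ones.
const (
    rankSched = iota + 1
    rankMheap
    rankMheapSpecial
)

type rankedMutex struct {
    mu   sync.Mutex
    rank int
}

// held records the ranks of locks currently held by this single caller.
var held []int

func lockRanked(l *rankedMutex) {
    for _, r := range held {
        if r >= l.rank {
            panic(fmt.Sprintf("lock ordering violation: acquiring rank %d while holding rank %d", l.rank, r))
        }
    }
    l.mu.Lock()
    held = append(held, l.rank)
}

func unlockRanked(l *rankedMutex) {
    l.mu.Unlock()
    held = held[:len(held)-1] // assumes LIFO unlock order, enough for the sketch
}

func main() {
    sched := &rankedMutex{rank: rankSched}
    heap := &rankedMutex{rank: rankMheap}

    lockRanked(sched) // rank 1 then rank 2: allowed
    lockRanked(heap)
    unlockRanked(heap)
    unlockRanked(sched)

    defer func() { fmt.Println("recovered:", recover()) }()
    lockRanked(heap) // rank 2 then rank 1: reports the ordering bug
    lockRanked(sched)
}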
    lockInit(&h.lock, lockRankMheap)
    lockInit(&h.speciallock, lockRankMheapSpecial)

    h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
    h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
    h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
    h.specialprofilealloc.init(unsafe.Sizeof(specialprofile{}), nil, nil, &memstats.other_sys)
    h.specialReachableAlloc.init(unsafe.Sizeof(specialReachable{}), nil, nil, &memstats.other_sys)
    h.arenaHintAlloc.init(unsafe.Sizeof(arenaHint{}), nil, nil, &memstats.other_sys)

runtime: make fixalloc zero allocations on reuse
Currently fixalloc does not zero memory it reuses. This is dangerous
with the hybrid barrier if the type may contain heap pointers, since
it may cause us to observe a dead heap pointer on reuse. It's also
error-prone since it's the only allocator that doesn't zero on
allocation (mallocgc of course zeroes, but so do persistentalloc and
sysAlloc). It's also largely pointless: for mcache, the caller
immediately memclrs the allocation; and the two specials types are
tiny so there's no real cost to zeroing them.
Change fixalloc to zero allocations by default.
The only type we don't zero by default is mspan. This actually
requires that the span's sweepgen survive across freeing and
reallocating a span. If we were to zero it, the following race would
be possible:
1. The current sweepgen is 2. Span s is on the unswept list.
2. Direct sweeping sweeps span s, finds it's all free, and releases s
to the fixalloc.
3. Thread 1 allocates s from fixalloc. Suppose this zeros s, including
s.sweepgen.
4. Thread 1 calls s.init, which sets s.state to _MSpanDead.
5. On thread 2, background sweeping comes across span s in allspans
and cas's s.sweepgen from 0 (sg-2) to 1 (sg-1). Now it thinks it
owns it for sweeping.
6. Thread 1 continues initializing s.
Everything breaks.
I would like to fix this because it's obviously confusing, but it's a
subtle enough problem that I'm leaving it alone for now. The solution
may be to skip sweepgen 0, but then we have to think about wrap-around
much more carefully.
Updates #17503.
Change-Id: Ie08691feed3abbb06a31381b94beb0a2e36a0613
Reviewed-on: https://go-review.googlesource.com/31368
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
2016-09-25 17:12:43 -04:00
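A toy version of the behavior this commit gives fixalloc, for illustration only: a fixed-size free-list allocator that scrubs recycled blocks by default, with an opt-out flag like the one spanalloc sets just below. The names and the byte-slice representation are invented for the sketch.

package main

import "fmt"

// blockAlloc hands out fixed-size byte blocks from a free list,
// zeroing recycled blocks unless zero is set to false.
type blockAlloc struct {
    size int
    free [][]byte
    zero bool // default true, like fixalloc after this change
}

func newBlockAlloc(size int) *blockAlloc {
    return &blockAlloc{size: size, zero: true}
}

func (a *blockAlloc) alloc() []byte {
    if n := len(a.free); n > 0 {
        b := a.free[n-1]
        a.free = a.free[:n-1]
        if a.zero {
            for i := range b {
                b[i] = 0 // scrub stale contents (and stale pointers) on reuse
            }
        }
        return b
    }
    return make([]byte, a.size) // fresh memory is already zeroed
}

func (a *blockAlloc) freeBlock(b []byte) {
    a.free = append(a.free, b)
}

func main() {
    a := newBlockAlloc(4)
    b := a.alloc()
    b[0] = 42
    a.freeBlock(b)
    fmt.Println(a.alloc()) // [0 0 0 0]: zeroed on reuse

    a.zero = false // the mspan-style opt-out: caller promises this is safe
    c := a.alloc()
    c[0] = 7
    a.freeBlock(c)
    fmt.Println(a.alloc()) // [7 0 0 0]: stale contents survive, like sweepgen
}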
    // Don't zero mspan allocations. Background sweeping can
    // inspect a span concurrently with allocating it, so it's
    // important that the span's sweepgen survive across freeing
    // and re-allocating a span to prevent background sweeping
    // from improperly cas'ing it from 0.
    //
    // This is safe because mspan contains no heap pointers.
    h.spanalloc.zero = false

    // h->mapcache needs no init

    for i := range h.central {
        h.central[i].mcentral.init(spanClass(i))
    }

    h.pages.init(&h.lock, &memstats.gcMiscSys)
}

runtime: implement efficient page reclaimer
When we attempt to allocate an N page span (either for a large
allocation or when an mcentral runs dry), we first try to sweep spans
to release N pages. Currently, this can be extremely expensive:
sweeping a span to emptiness is the hardest thing to ask for and the
sweeper generally doesn't know where to even look for potentially
fruitful results. Since this is on the critical path of many
allocations, this is unfortunate.
This CL changes how we reclaim empty spans. Instead of trying lots of
spans and hoping for the best, it uses the newly introduced span marks
to efficiently find empty spans. The span marks (and in-use bits) are
in a dense bitmap, so these spans can be found with an efficient
sequential memory scan. This approach can scan for unmarked spans at
about 300 GB/ms and can free unmarked spans at about 32 MB/ms. We
could probably significantly improve the rate at which it can free
unmarked spans, but that's a separate issue.
Like the current reclaimer, this is still linear in the number of
spans that are swept, but the constant factor is now so vanishingly
small that it doesn't matter.
The benchmark in #18155 demonstrates both significant page reclaiming
delays, and object reclaiming delays. With "-retain-count=20000000
-preallocate=true -loop-count=3", the benchmark demonstrates several
page reclaiming delays on the order of 40ms. After this change, the
page reclaims are insignificant. The longest sweeps are still ~150ms,
but are object reclaiming delays. We'll address those in the next
several CLs.
Updates #18155.
Fixes #21378 by completely replacing the logic that had that bug.
Change-Id: Iad80eec11d7fc262d02c8f0761ac6998425c4064
Reviewed-on: https://go-review.googlesource.com/c/138959
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
2018-09-27 11:34:07 -04:00
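The dense-bitmap scan described above boils down to a word-at-a-time walk over "in use AND NOT marked" bits. A standalone sketch of that inner loop, with made-up bitmaps standing in for the per-arena pageInUse and mark bits:

package main

import (
    "fmt"
    "math/bits"
)

// reclaimablePages reports the page numbers that start an in-use span
// (inUse bit set) but were not marked by the last GC (marks bit clear):
// exactly the spans worth sweeping for free pages.
func reclaimablePages(inUse, marks []uint64) []int {
    var pages []int
    for i := range inUse {
        w := inUse[i] &^ marks[i] // in use and unmarked
        for w != 0 {
            bit := bits.TrailingZeros64(w)
            pages = append(pages, i*64+bit)
            w &= w - 1 // clear the lowest set bit
        }
    }
    return pages
}

func main() {
    inUse := []uint64{0b1011, 0}
    marks := []uint64{0b0010, 0}
    inUse[1] |= 1 << 5 // a span starting at page 69, untouched by GC
    fmt.Println(reclaimablePages(inUse, marks)) // [0 3 69]
}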
// reclaim sweeps and reclaims at least npage pages into the heap.
// It is called before allocating npage pages to keep growth in check.
//
// reclaim implements the page-reclaimer half of the sweeper.
//
// h.lock must NOT be held.
func (h *mheap) reclaim(npage uintptr) {
    // TODO(austin): Half of the time spent freeing spans is in
    // locking/unlocking the heap (even with low contention). We
    // could make the slow path here several times faster by
    // batching heap frees.

    // Bail early if there's no more reclaim work.
    if atomic.Load64(&h.reclaimIndex) >= 1<<63 {
        return
    }

    // Disable preemption so the GC can't start while we're
    // sweeping, so we can read h.sweepArenas, and so
    // traceGCSweepStart/Done pair on the P.
    mp := acquirem()

    if trace.enabled {
        traceGCSweepStart()
    }

    arenas := h.sweepArenas
    locked := false
    for npage > 0 {
        // Pull from accumulated credit first.
        if credit := atomic.Loaduintptr(&h.reclaimCredit); credit > 0 {
            take := credit
            if take > npage {
                // Take only what we need.
                take = npage
            }
            if atomic.Casuintptr(&h.reclaimCredit, credit, credit-take) {
                npage -= take
            }
            continue
        }

        // Claim a chunk of work.
        idx := uintptr(atomic.Xadd64(&h.reclaimIndex, pagesPerReclaimerChunk) - pagesPerReclaimerChunk)
        if idx/pagesPerArena >= uintptr(len(arenas)) {
            // Page reclaiming is done.
            atomic.Store64(&h.reclaimIndex, 1<<63)
            break
        }

        if !locked {
            // Lock the heap for reclaimChunk.
            lock(&h.lock)
            locked = true
        }

        // Scan this chunk.
        nfound := h.reclaimChunk(arenas, idx, pagesPerReclaimerChunk)
        if nfound <= npage {
            npage -= nfound
        } else {
            // Put spare pages toward global credit.
            atomic.Xadduintptr(&h.reclaimCredit, nfound-npage)
            npage = 0
        }
    }
    if locked {
        unlock(&h.lock)
    }

|
|
|
if trace.enabled {
|
|
|
|
|
traceGCSweepDone()
|
2014-11-11 17:05:02 -05:00
|
|
|
}
|
runtime: implement efficient page reclaimer
When we attempt to allocate an N page span (either for a large
allocation or when an mcentral runs dry), we first try to sweep spans
to release N pages. Currently, this can be extremely expensive:
sweeping a span to emptiness is the hardest thing to ask for and the
sweeper generally doesn't know where to even look for potentially
fruitful results. Since this is on the critical path of many
allocations, this is unfortunate.
This CL changes how we reclaim empty spans. Instead of trying lots of
spans and hoping for the best, it uses the newly introduced span marks
to efficiently find empty spans. The span marks (and in-use bits) are
in a dense bitmap, so these spans can be found with an efficient
sequential memory scan. This approach can scan for unmarked spans at
about 300 GB/ms and can free unmarked spans at about 32 MB/ms. We
could probably significantly improve the rate at which is can free
unmarked spans, but that's a separate issue.
Like the current reclaimer, this is still linear in the number of
spans that are swept, but the constant factor is now so vanishingly
small that it doesn't matter.
The benchmark in #18155 demonstrates both significant page reclaiming
delays, and object reclaiming delays. With "-retain-count=20000000
-preallocate=true -loop-count=3", the benchmark demonstrates several
page reclaiming delays on the order of 40ms. After this change, the
page reclaims are insignificant. The longest sweeps are still ~150ms,
but are object reclaiming delays. We'll address those in the next
several CLs.
Updates #18155.
Fixes #21378 by completely replacing the logic that had that bug.
Change-Id: Iad80eec11d7fc262d02c8f0761ac6998425c4064
Reviewed-on: https://go-review.googlesource.com/c/138959
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
2018-09-27 11:34:07 -04:00
|
|
|
releasem(mp)
|
|
|
|
|
}
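
The else branch above hands surplus pages to other reclaimers through an atomic counter: a reclaimer that frees more than it needed publishes the extra as credit, and a later reclaimer can consume that credit before scanning anything itself. Below is a standalone sketch of the same pattern using sync/atomic rather than the runtime's internal atomics; reclaimCredit and takeCredit are illustrative names, not runtime API.

package main

import (
	"fmt"
	"sync/atomic"
)

// reclaimCredit counts pages freed beyond what earlier reclaimers needed.
var reclaimCredit atomic.Uintptr

// takeCredit consumes up to npages of shared credit and returns how many
// pages it actually took; concurrent callers can never overdraw the counter.
func takeCredit(npages uintptr) uintptr {
	for {
		credit := reclaimCredit.Load()
		if credit == 0 {
			return 0
		}
		take := npages
		if take > credit {
			take = credit
		}
		if reclaimCredit.CompareAndSwap(credit, credit-take) {
			return take
		}
	}
}

func main() {
	reclaimCredit.Add(7)       // a previous reclaimer freed 7 spare pages
	fmt.Println(takeCredit(5)) // 5
	fmt.Println(takeCredit(5)) // 2
	fmt.Println(takeCredit(5)) // 0
}

The compare-and-swap loop makes the "take at most what is available" decision atomic, which is why concurrent reclaimers can share one counter without a lock.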

runtime: implement efficient page reclaimer

When we attempt to allocate an N page span (either for a large allocation or
when an mcentral runs dry), we first try to sweep spans to release N pages.
Currently, this can be extremely expensive: sweeping a span to emptiness is
the hardest thing to ask for and the sweeper generally doesn't know where to
even look for potentially fruitful results. Since this is on the critical path
of many allocations, this is unfortunate.

This CL changes how we reclaim empty spans. Instead of trying lots of spans
and hoping for the best, it uses the newly introduced span marks to
efficiently find empty spans. The span marks (and in-use bits) are in a dense
bitmap, so these spans can be found with an efficient sequential memory scan.
This approach can scan for unmarked spans at about 300 GB/ms and can free
unmarked spans at about 32 MB/ms. We could probably significantly improve the
rate at which it can free unmarked spans, but that's a separate issue.

Like the current reclaimer, this is still linear in the number of spans that
are swept, but the constant factor is now so vanishingly small that it doesn't
matter.

The benchmark in #18155 demonstrates both significant page reclaiming delays
and object reclaiming delays. With "-retain-count=20000000 -preallocate=true
-loop-count=3", the benchmark demonstrates several page reclaiming delays on
the order of 40ms. After this change, the page reclaims are insignificant. The
longest sweeps are still ~150ms, but are object reclaiming delays. We'll
address those in the next several CLs.

Updates #18155.
Fixes #21378 by completely replacing the logic that had that bug.

// reclaimChunk sweeps unmarked spans that start at page indexes [pageIdx, pageIdx+n).
// It returns the number of pages returned to the heap.
//
// h.lock must be held and the caller must be non-preemptible. Note: h.lock may be
// temporarily unlocked and re-locked in order to do sweeping or if tracing is
// enabled.
func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
	// The heap lock must be held because this accesses the
	// heapArena.spans arrays using potentially non-live pointers.
	// In particular, if a span were freed and merged concurrently
	// with this probing heapArena.spans, it would be possible to
	// observe arbitrary, stale span pointers.
	assertLockHeld(&h.lock)

	n0 := n
	var nFreed uintptr
	sl := newSweepLocker()
	for n > 0 {
		ai := arenas[pageIdx/pagesPerArena]
		ha := h.arenas[ai.l1()][ai.l2()]

		// Get a chunk of the bitmap to work on.
		arenaPage := uint(pageIdx % pagesPerArena)
		inUse := ha.pageInUse[arenaPage/8:]
		marked := ha.pageMarks[arenaPage/8:]
		if uintptr(len(inUse)) > n/8 {
			inUse = inUse[:n/8]
			marked = marked[:n/8]
		}

		// Scan this bitmap chunk for spans that are in-use
		// but have no marked objects on them.
		for i := range inUse {
			inUseUnmarked := atomic.Load8(&inUse[i]) &^ marked[i]
			if inUseUnmarked == 0 {
				continue
			}

			for j := uint(0); j < 8; j++ {
				if inUseUnmarked&(1<<j) != 0 {
					s := ha.spans[arenaPage+uint(i)*8+j]
					if s, ok := sl.tryAcquire(s); ok {
						npages := s.npages
						unlock(&h.lock)
						if s.sweep(false) {
							nFreed += npages
						}
						lock(&h.lock)
						// Reload inUse. It's possible nearby
						// spans were freed when we dropped the
						// lock and we don't want to get stale
						// pointers from the spans array.
						inUseUnmarked = atomic.Load8(&inUse[i]) &^ marked[i]
					}
				}
			}
		}

		// Advance.
		pageIdx += uintptr(len(inUse) * 8)
		n -= uintptr(len(inUse) * 8)
	}
	sl.dispose()
	if trace.enabled {
		unlock(&h.lock)
		// Account for pages scanned but not reclaimed.
		traceGCSweepSpan((n0 - nFreed) * pageSize)
		lock(&h.lock)
	}

	assertLockHeld(&h.lock) // Must be locked on return.
	return nFreed
}

runtime: block sweep completion on all sweep paths

The runtime currently has two different notions of sweep completion:
1. All spans are either swept or have begun sweeping.
2. The sweeper has *finished* sweeping all spans.

Most things depend on condition 1. Notably, GC correctness depends on
condition 1, but since all sweep operations are non-preemptible, the STW at
the beginning of GC forces condition 1 to become condition 2. runtime.GC(),
however, depends on condition 2, since the intent is to complete a full GC
cycle, and also update the heap profile (which can only be done after sweeping
is complete).

However, the way we compute condition 2 is racy right now and may in fact only
indicate condition 1. Specifically, sweepone blocks condition 2 until all
sweepone calls are done, but there are many other ways to enter the sweeper
that don't block this. Hence, sweepone may see that there are no more spans in
the sweep list, see that it's the last sweepone, and declare sweeping done,
while there's some other sweeper still working on a span.

Fix this by making sure every entry to the sweeper participates in the
protocol that blocks condition 2. To make sure we get this right, this CL
introduces a type to track sweep blocking and (lightly) enforces span sweep
ownership via the type system. This has the nice side-effect of abstracting
the pattern of acquiring sweep ownership that's currently repeated in many
different places.

Fixes #45315.
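
The heart of reclaimChunk's scan above is a byte-wise walk over two parallel bitmaps: a page is a candidate exactly when its in-use bit is set and its mark bit is clear, which is what inUse[i] &^ marked[i] computes. A self-contained sketch of that filter (the slices and the helper name are illustrative, not the runtime's types):

package main

import (
	"fmt"
	"math/bits"
)

// candidatePages returns the indexes of pages that are in use but unmarked.
func candidatePages(inUse, marked []uint8) []int {
	var out []int
	for i := range inUse {
		// AND NOT: keep bits that are set in inUse and clear in marked.
		b := inUse[i] &^ marked[i]
		for b != 0 {
			j := bits.TrailingZeros8(b) // lowest candidate bit in this byte
			out = append(out, i*8+j)
			b &= b - 1 // clear that bit and keep looking
		}
	}
	return out
}

func main() {
	inUse := []uint8{0b1011_0110, 0b0000_0001}
	marked := []uint8{0b0010_0110, 0b0000_0001}
	fmt.Println(candidatePages(inUse, marked)) // [4 7]
}

Because entire zero bytes are skipped with a single comparison, the scan touches memory sequentially and only descends to per-bit work where there is actually something to reclaim.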

// spanAllocType represents the type of allocation to make, or
// the type of allocation to be freed.
type spanAllocType uint8

const (
	spanAllocHeap          spanAllocType = iota // heap span
	spanAllocStack                              // stack span
	spanAllocPtrScalarBits                      // unrolled GC prog bitmap span
	spanAllocWorkBuf                            // work buf span
)

// manual returns true if the span allocation is manually managed.
func (s spanAllocType) manual() bool {
	return s != spanAllocHeap
}
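
Because manual() is the only distinction the allocator needs, callers can branch on it (or on the specific constant) to decide which in-use statistic a span is charged to. A hedged sketch of that routing, reusing the spanAllocType constants above; the statSink names are invented for illustration and are not the runtime's real accounting fields:

// statSink names which in-use counter a span allocation is charged to.
// These constants are illustrative only.
type statSink int

const (
	statHeapInUse statSink = iota
	statStacksInUse
	statGCMiscInUse
	statWorkBufInUse
)

// sinkFor picks the accounting bucket for a span allocation type.
func sinkFor(typ spanAllocType) statSink {
	switch typ {
	case spanAllocHeap:
		return statHeapInUse // the only GC'd, heap-accounted case
	case spanAllocStack:
		return statStacksInUse
	case spanAllocPtrScalarBits:
		return statGCMiscInUse
	case spanAllocWorkBuf:
		return statWorkBufInUse
	default:
		panic("unknown spanAllocType")
	}
}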

[dev.cc] runtime: delete scalararg, ptrarg; rename onM to systemstack

Scalararg and ptrarg are not "signal safe". Go code filling them out can be
interrupted by a signal, and then the signal handler runs, and if it also ends
up in Go code that uses scalararg or ptrarg, now the old values have been
smashed. For the pieces of code that do need to run in a signal handler, we
introduced onM_signalok, which is really just onM except that the _signalok is
meant to convey that the caller asserts that scalararg and ptrarg will be
restored to their old values after the call (instead of the usual behavior,
zeroing them).

Scalararg and ptrarg are also untyped and therefore error-prone. Go code can
always pass a closure instead of using scalararg and ptrarg; they were only
really necessary for C code. And there's no more C code.

For all these reasons, delete scalararg and ptrarg, converting the few
remaining references to use closures. Once those are gone, there is no need
for a distinction between onM and onM_signalok, so replace both with a single
function equivalent to the current onM_signalok (that is, it can be called on
any of the curg, g0, and gsignal stacks).

The name onM and the phrase 'm stack' are misnomers, because on most systems
an M has two system stacks: the main thread stack and the signal handling
stack. Correct the misnomer by naming the replacement function systemstack.
Fix a few references to "M stack" in code.

The main motivation for this change is to eliminate scalararg/ptrarg. Rick and
I have already seen them cause problems because the calling sequence
m.ptrarg[0] = p is a heap pointer assignment, so it gets a write barrier. The
write barrier also uses onM, so it has all the same problems as if it were
being invoked by a signal handler. We worked around this by saving and
restoring the old values and by calling onM_signalok, but there's no point in
keeping this nice home for bugs around any longer.

This CL also changes funcline to return the file name as a result instead of
filling in a passed-in *string. (The *string signature is left over from when
the code was written in and called from C.) That's arguably an unrelated
change, except that once I had done the ptrarg/scalararg/onM cleanup I started
getting false positives about the *string argument escaping (not allowed in
package runtime). The compiler is wrong, but the easiest fix is to write the
code like Go code instead of like C code. I am a bit worried that the compiler
is wrong because of some use of uninitialized memory in the escape analysis.
If that's the reason, it will go away when we convert the compiler to Go. (And
if not, we'll debug it the next time.)

// alloc allocates a new span of npage pages from the GC'd heap.
//
// spanclass indicates the span's size class and scannability.
//
// If needzero is true, the memory for the returned span will be zeroed.
// The boolean returned indicates whether the returned span contains zeroes,
// either because this was requested, or because it was already zeroed.
func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) (*mspan, bool) {
	// Don't do any operations that lock the heap on the G stack.
	// It might trigger stack growth, and the stack growth code needs
	// to be able to allocate heap.
	var s *mspan
	systemstack(func() {
		// To prevent excessive heap growth, before allocating n pages
		// we need to sweep and reclaim at least n pages.
		if !isSweepDone() {
			h.reclaim(npages)
		}
		s = h.allocSpan(npages, spanAllocHeap, spanclass)
	})

	if s == nil {
		return nil, false
	}
	isZeroed := s.needzero == 0
	if needzero && !isZeroed {
		memclrNoHeapPointers(unsafe.Pointer(s.base()), s.npages<<_PageShift)
		isZeroed = true
	}
	s.needzero = 0
	return s, isZeroed
}
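
The (span, isZeroed) pair is a zero-on-demand contract: memory that is already known to be zero is not cleared again, and memory that is not is cleared exactly once, by whoever asked for zeroed memory. A toy, standalone version of the same contract (block and allocBlock are invented for illustration):

package main

import "fmt"

// block is a toy allocation that remembers whether its memory is known to be zero.
type block struct {
	buf      []byte
	isZeroed bool
}

// allocBlock hands out an n-byte buffer, reusing recycled memory when it can,
// and reports whether the result is already zeroed.
func allocBlock(n int, needzero bool, recycled []byte) block {
	var b block
	if len(recycled) >= n {
		b.buf = recycled[:n] // reused memory: contents are unknown
	} else {
		b.buf = make([]byte, n) // fresh memory: Go zeroes it for us
		b.isZeroed = true
	}
	if needzero && !b.isZeroed {
		for i := range b.buf {
			b.buf[i] = 0 // clear on demand, exactly once
		}
		b.isZeroed = true
	}
	return b
}

func main() {
	dirty := []byte{1, 2, 3, 4}
	fmt.Println(allocBlock(4, true, dirty).buf) // [0 0 0 0] (cleared here)
	fmt.Println(allocBlock(4, true, nil).buf)   // [0 0 0 0] (already zero, no clearing)
}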

// allocManual allocates a manually-managed span of npage pages.
// allocManual returns nil if allocation fails.
//
// allocManual adds the bytes used to *stat, which should be a
// memstats in-use field. Unlike allocations in the GC'd heap, the
// allocation does *not* count toward heap_inuse or heap_sys.
//
// The memory backing the returned span may not be zeroed if
// span.needzero is set.
//
// allocManual must be called on the system stack because it may
// acquire the heap lock via allocSpan. See mheap for details.
//
// If new code is written to call allocManual, do NOT use an
// existing spanAllocType value and instead declare a new one.
//
//go:systemstack
func (h *mheap) allocManual(npages uintptr, typ spanAllocType) *mspan {
	if !typ.manual() {
		throw("manual span allocation called with non-manually-managed type")
	}
	return h.allocSpan(npages, typ, 0)
}

// setSpans modifies the span map so [spanOf(base), spanOf(base+npage*pageSize))
// is s.
func (h *mheap) setSpans(base, npage uintptr, s *mspan) {
	p := base / pageSize
	ai := arenaIndex(base)
	ha := h.arenas[ai.l1()][ai.l2()]
	for n := uintptr(0); n < npage; n++ {
		i := (p + n) % pagesPerArena
		if i == 0 {
			ai = arenaIndex(base + n*pageSize)
			ha = h.arenas[ai.l1()][ai.l2()]
		}
		ha.spans[i] = s
	}
}

runtime: support a two-level arena map

Currently, the heap arena map is a single, large array that covers every
possible arena frame in the entire address space. This is practical up to
about 48 bits of address space with 64 MB arenas. However, there are two
problems with this:

1. mips64, ppc64, and s390x support full 64-bit address spaces (though on
   Linux only s390x has kernel support for 64-bit address spaces). On these
   platforms, it would be good to support these larger address spaces.

2. On Windows, processes are charged for untouched memory, so for processes
   with small heaps, the mostly-untouched 32 MB arena map plus a 64 MB arena
   are significant overhead. Hence, it would be good to reduce both the arena
   map size and the arena size, but with a single-level arena, these are
   inversely proportional.

This CL adds support for a two-level arena map. Arena frame numbers are now
divided into arenaL1Bits of L1 index and arenaL2Bits of L2 index.

At the moment, arenaL1Bits is always 0, so we effectively have a single-level
map. We do a few things so that this has no cost beyond the current
single-level map:

1. We embed the L2 array directly in mheap, so if there's a single entry in
   the L2 array, the representation is identical to the current representation
   and there's no extra level of indirection.

2. Hot code that accesses the arena map is structured so that it optimizes to
   nearly the same machine code as it does currently.

3. We make some small tweaks to hot code paths and to the inliner itself to
   keep some important functions inlined despite their now-larger ASTs. In
   particular, this is necessary for heapBitsForAddr and heapBits.next.

Possibly as a result of some of the tweaks, this actually slightly improves
the performance of the x/benchmarks garbage benchmark:

name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)

(https://perf.golang.org/search?q=upload:20180223.2)

For #23900.
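
The ai.l1()/ai.l2() lookups used throughout these functions are nothing more than a shift and a mask on the arena frame number. A standalone sketch with deliberately tiny, made-up constants so the split is visible (the real arena size and index widths are platform-dependent, and on most 64-bit platforms the L1 width is zero):

package main

import "fmt"

const (
	logArenaBytes = 20 // 1 MB arenas in this toy; real arenas are much larger
	arenaL2Bits   = 8  // toy split; the real L1/L2 widths are platform-dependent
)

// arenaIdx is an arena frame number: the address divided by the arena size.
type arenaIdx uint

func arenaIndex(p uintptr) arenaIdx { return arenaIdx(p >> logArenaBytes) }

// l1 selects the outer slot of the two-level map; l2 selects the entry
// within that slot's chunk.
func (i arenaIdx) l1() uint { return uint(i) >> arenaL2Bits }
func (i arenaIdx) l2() uint { return uint(i) & (1<<arenaL2Bits - 1) }

func main() {
	p := uintptr(3<<(logArenaBytes+arenaL2Bits) | 5<<logArenaBytes)
	ai := arenaIndex(p)
	fmt.Println(ai.l1(), ai.l2()) // 3 5
}

When the L1 width is zero, l1() always returns 0 and the two-level map degenerates to the old single flat array, which is why the common case pays no extra indirection.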

// allocNeedsZero checks if the region of address space [base, base+npage*pageSize),
// assumed to be allocated, needs to be zeroed, updating heap arena metadata for
// future allocations.
//
// This must be called each time pages are allocated from the heap, even if the page
// allocator can otherwise prove the memory it's allocating is already zero because
// the pages are fresh from the operating system. It updates heapArena metadata that
// is critical for future page allocations.
//
// There are no locking constraints on this method.
func (h *mheap) allocNeedsZero(base, npage uintptr) (needZero bool) {
	for npage > 0 {
		ai := arenaIndex(base)
		ha := h.arenas[ai.l1()][ai.l2()]

		zeroedBase := atomic.Loaduintptr(&ha.zeroedBase)
		arenaBase := base % heapArenaBytes
		if arenaBase < zeroedBase {
			// We extended into the non-zeroed part of the
			// arena, so this region needs to be zeroed before use.
			//
			// zeroedBase is monotonically increasing, so if we see this now then
			// we can be sure we need to zero this memory region.
			//
			// We still need to update zeroedBase for this arena, and
			// potentially more arenas.
			needZero = true
		}
		// We may observe arenaBase > zeroedBase if we're racing with one or more
		// allocations which are acquiring memory directly before us in the address
		// space. But, because we know no one else is acquiring *this* memory, it's
		// still safe to not zero.

		// Compute how far into the arena we extend, capped
		// at heapArenaBytes.
		arenaLimit := arenaBase + npage*pageSize
		if arenaLimit > heapArenaBytes {
			arenaLimit = heapArenaBytes
		}
		// Increase ha.zeroedBase so it's >= arenaLimit.
		// We may be racing with other updates.
		for arenaLimit > zeroedBase {
			if atomic.Casuintptr(&ha.zeroedBase, zeroedBase, arenaLimit) {
				break
			}
			zeroedBase = atomic.Loaduintptr(&ha.zeroedBase)
			// Sanity check zeroedBase.
			if zeroedBase <= arenaLimit && zeroedBase > arenaBase {
				// The zeroedBase moved into the space we were trying to
				// claim. That's very bad, and indicates someone allocated
				// the same region we did.
				throw("potentially overlapping in-use allocations detected")
			}
		}

		// Move base forward and subtract from npage to move into
		// the next arena, or finish.
		base += arenaLimit - arenaBase
		npage -= (arenaLimit - arenaBase) / pageSize
	}
	return
}
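
The zeroedBase update above is the standard lock-free pattern for advancing a monotonic watermark: try to CAS the new limit in, and if another allocator got there first, reload and retry only while our limit is still ahead of the published one. A standalone sketch using sync/atomic (advanceWatermark is an illustrative name):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// advanceWatermark raises *w to at least limit. Because the value only ever
// grows, losing a race to a larger value means there is nothing left to do.
func advanceWatermark(w *atomic.Uintptr, limit uintptr) {
	for {
		cur := w.Load()
		if cur >= limit {
			return // someone already covered our range
		}
		if w.CompareAndSwap(cur, limit) {
			return
		}
	}
}

func main() {
	var zeroedBase atomic.Uintptr
	var wg sync.WaitGroup
	for _, limit := range []uintptr{4096, 1024, 8192, 2048} {
		wg.Add(1)
		go func(l uintptr) {
			defer wg.Done()
			advanceWatermark(&zeroedBase, l)
		}(limit)
	}
	wg.Wait()
	fmt.Println(zeroedBase.Load()) // always 8192, regardless of interleaving
}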

// tryAllocMSpan attempts to allocate an mspan object from
// the P-local cache, but may fail.
//
// h.lock need not be held.
//
// The caller must ensure that its P won't change underneath
// it during this function. Currently we enforce this by requiring
// that the function is run on the system stack, because that's
// the only place it is used now. In the future, this requirement
// may be relaxed if its use is necessary elsewhere.
//
//go:systemstack
func (h *mheap) tryAllocMSpan() *mspan {
	pp := getg().m.p.ptr()
	// If we don't have a p or the cache is empty, we can't do
	// anything here.
	if pp == nil || pp.mspancache.len == 0 {
		return nil
	}
	// Pull off the last entry in the cache.
	s := pp.mspancache.buf[pp.mspancache.len-1]
	pp.mspancache.len--
	return s
}

// allocMSpanLocked allocates an mspan object.
//
// h.lock must be held.
//
// allocMSpanLocked must be called on the system stack because
// its caller holds the heap lock. See mheap for details.
// Running on the system stack also ensures that we won't
// switch Ps during this function. See tryAllocMSpan for details.
//
//go:systemstack
func (h *mheap) allocMSpanLocked() *mspan {
	assertLockHeld(&h.lock)

	pp := getg().m.p.ptr()
	if pp == nil {
		// We don't have a p so just do the normal thing.
		return (*mspan)(h.spanalloc.alloc())
	}
	// Refill the cache if necessary.
	if pp.mspancache.len == 0 {
		const refillCount = len(pp.mspancache.buf) / 2
		for i := 0; i < refillCount; i++ {
			pp.mspancache.buf[i] = (*mspan)(h.spanalloc.alloc())
		}
		pp.mspancache.len = refillCount
	}
	// Pull off the last entry in the cache.
	s := pp.mspancache.buf[pp.mspancache.len-1]
	pp.mspancache.len--
	return s
}

// freeMSpanLocked frees an mspan object.
//
// h.lock must be held.
//
// freeMSpanLocked must be called on the system stack because
// its caller holds the heap lock. See mheap for details.
// Running on the system stack also ensures that we won't
// switch Ps during this function. See tryAllocMSpan for details.
//
//go:systemstack
func (h *mheap) freeMSpanLocked(s *mspan) {
	assertLockHeld(&h.lock)

	pp := getg().m.p.ptr()
	// First try to free the mspan directly to the cache.
	if pp != nil && pp.mspancache.len < len(pp.mspancache.buf) {
		pp.mspancache.buf[pp.mspancache.len] = s
		pp.mspancache.len++
		return
	}
	// Failing that (or if we don't have a p), just free it to
	// the heap.
	h.spanalloc.free(unsafe.Pointer(s))
}
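
Taken together, tryAllocMSpan, allocMSpanLocked and freeMSpanLocked implement a small per-P object cache with a locked fallback: pop from a fixed-size local buffer when possible, refill it to half capacity under the lock, and push frees back locally until the buffer is full. A generic sketch of that shape (localCache, tryGet, put and refill are illustrative names, not runtime API):

package main

import "fmt"

const cacheSize = 128

// localCache is a fixed-size, single-owner stack of recycled objects,
// analogous to p.mspancache.
type localCache[T any] struct {
	buf [cacheSize]*T
	len int
}

// tryGet pops a cached object, or returns nil if the cache is empty.
func (c *localCache[T]) tryGet() *T {
	if c.len == 0 {
		return nil
	}
	c.len--
	return c.buf[c.len]
}

// put recycles an object locally; it reports false when the cache is full
// and the object must go back to the shared allocator instead.
func (c *localCache[T]) put(x *T) bool {
	if c.len == len(c.buf) {
		return false
	}
	c.buf[c.len] = x
	c.len++
	return true
}

// refill tops the cache up to half capacity from a slow-path allocator,
// mirroring allocMSpanLocked's refill loop.
func (c *localCache[T]) refill(slowAlloc func() *T) {
	for c.len < cacheSize/2 {
		c.buf[c.len] = slowAlloc()
		c.len++
	}
}

func main() {
	type span struct{ pages int }
	var c localCache[span]
	c.refill(func() *span { return &span{pages: 1} })
	s := c.tryGet()
	fmt.Println(s.pages, c.len) // 1 63
	fmt.Println(c.put(s))       // true
}

Because each cache is owned by exactly one P and only touched on the system stack, none of this needs atomics; the lock is only taken when the cache has to talk to the shared allocator.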

// allocSpan allocates an mspan which owns npages worth of memory.
//
// If typ.manual() == false, allocSpan allocates a heap span of class spanclass
// and updates heap accounting. If typ.manual() == true, allocSpan allocates a
// manually-managed span (spanclass is ignored), and the caller is
// responsible for any accounting related to its use of the span. Either
// way, allocSpan will atomically add the bytes in the newly allocated
// span to *sysStat.
//
// The returned span is fully initialized.
//
// h.lock must not be held.
//
// allocSpan must be called on the system stack both because it acquires
// the heap lock and because it must block GC transitions.
//
//go:systemstack
func (h *mheap) allocSpan(npages uintptr, typ spanAllocType, spanclass spanClass) (s *mspan) {
	// Function-global state.
	gp := getg()
	base, scav := uintptr(0), uintptr(0)

	// On some platforms we need to provide physical page aligned stack
	// allocations. Where the page size is less than the physical page
	// size, we already manage to do this by default.
	needPhysPageAlign := physPageAlignedStacks && typ == spanAllocStack && pageSize < physPageSize

	// If the allocation is small enough, try the page cache!
	// The page cache does not support aligned allocations, so we cannot use
	// it if we need to provide a physical page aligned stack allocation.
	pp := gp.m.p.ptr()
	if !needPhysPageAlign && pp != nil && npages < pageCachePages/4 {
		c := &pp.pcache

		// If the cache is empty, refill it.
		if c.empty() {
			lock(&h.lock)
			*c = h.pages.allocToCache()
			unlock(&h.lock)
		}

		// Try to allocate from the cache.
		base, scav = c.alloc(npages)
		if base != 0 {
			s = h.tryAllocMSpan()
|
			if s != nil {
				goto HaveSpan
			}
			// We have a base but no mspan, so we need
			// to lock the heap.
		}
	}

	// For one reason or another, we couldn't get the
	// whole job done without the heap lock.
	lock(&h.lock)

	if needPhysPageAlign {
		// Overallocate by a physical page to allow for later alignment.
		npages += physPageSize / pageSize
	}

	if base == 0 {
		// Try to acquire a base address.
		base, scav = h.pages.alloc(npages)
		if base == 0 {
			if !h.grow(npages) {
				unlock(&h.lock)
				return nil
			}
			base, scav = h.pages.alloc(npages)
			if base == 0 {
				throw("grew heap, but no adequate free space found")
			}
		}
	}
	if s == nil {
		// We failed to get an mspan earlier, so grab
		// one now that we have the heap lock.
		s = h.allocMSpanLocked()
	}

	if needPhysPageAlign {
		allocBase, allocPages := base, npages
		base = alignUp(allocBase, physPageSize)
		npages -= physPageSize / pageSize

		// Return memory around the aligned allocation.
		spaceBefore := base - allocBase
		if spaceBefore > 0 {
			h.pages.free(allocBase, spaceBefore/pageSize)
		}
		spaceAfter := (allocPages-npages)*pageSize - spaceBefore
		if spaceAfter > 0 {
			h.pages.free(base+npages*pageSize, spaceAfter/pageSize)
		}
	}
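
	// Worked example with hypothetical numbers: with an 8 KiB runtime
	// page size and a 64 KiB physical page size, a 4-page stack request
	// was padded to 12 pages above. If the 12-page run begins 16 KiB
	// below a 64 KiB boundary, base is rounded up by 2 pages, npages
	// drops back to 4, and the 2 pages before plus 6 pages after the
	// aligned region are handed back to the page allocator here.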

	unlock(&h.lock)

HaveSpan:
	// At this point, both s != nil and base != 0, and the heap
	// lock is no longer held. Initialize the span.
	s.init(base, npages)
	if h.allocNeedsZero(base, npages) {
		s.needzero = 1
	}
	nbytes := npages * pageSize
	if typ.manual() {
		s.manualFreeList = 0
		s.nelems = 0
		s.limit = s.base() + s.npages*pageSize
		s.state.set(mSpanManual)
	} else {
		// We must set span properties before the span is published anywhere
		// since we're not holding the heap lock.
		s.spanclass = spanclass
		if sizeclass := spanclass.sizeclass(); sizeclass == 0 {
			s.elemsize = nbytes
			s.nelems = 1
			s.divMul = 0
		} else {
			s.elemsize = uintptr(class_to_size[sizeclass])
			s.nelems = nbytes / s.elemsize
			s.divMul = class_to_divmagic[sizeclass]
		}

		// Initialize mark and allocation structures.
		s.freeindex = 0
		s.allocCache = ^uint64(0) // all 1s indicating all free.
		s.gcmarkBits = newMarkBits(s.nelems)
		s.allocBits = newAllocBits(s.nelems)

		// It's safe to access h.sweepgen without the heap lock because it's
		// only ever updated with the world stopped and we run on the
		// systemstack which blocks a STW transition.
		atomic.Store(&s.sweepgen, h.sweepgen)

		// Now that the span is filled in, set its state. This
		// is a publication barrier for the other fields in
		// the span. While valid pointers into this span
		// should never be visible until the span is returned,
		// if the garbage collector finds an invalid pointer,
		// access to the span may race with initialization of
		// the span. We resolve this race by atomically
		// setting the state after the span is fully
		// initialized, and atomically checking the state in
		// any situation where a pointer is suspect.
		s.state.set(mSpanInUse)
	}

	// Commit and account for any scavenged memory that the span now owns.
	if scav != 0 {
		// sysUsed all the pages that are actually available
		// in the span since some of them might be scavenged.
		sysUsed(unsafe.Pointer(base), nbytes)
		atomic.Xadd64(&memstats.heap_released, -int64(scav))
	}
	// Update stats.
	if typ == spanAllocHeap {
		atomic.Xadd64(&memstats.heap_inuse, int64(nbytes))
	}
	if typ.manual() {
		// Manually managed memory doesn't count toward heap_sys.
		memstats.heap_sys.add(-int64(nbytes))
	}
	// Update consistent stats.
	stats := memstats.heapStats.acquire()
	atomic.Xaddint64(&stats.committed, int64(scav))
	atomic.Xaddint64(&stats.released, -int64(scav))
	switch typ {
	case spanAllocHeap:
		atomic.Xaddint64(&stats.inHeap, int64(nbytes))
	case spanAllocStack:
		atomic.Xaddint64(&stats.inStacks, int64(nbytes))
	case spanAllocPtrScalarBits:
		atomic.Xaddint64(&stats.inPtrScalarBits, int64(nbytes))
	case spanAllocWorkBuf:
		atomic.Xaddint64(&stats.inWorkBufs, int64(nbytes))
	}
	memstats.heapStats.release()

	// Publish the span in various locations.

	// This is safe to call without the lock held because the slots
	// related to this span will only ever be read or modified by
	// this thread until pointers into the span are published (and
	// we execute a publication barrier at the end of this function
	// before that happens) or pageInUse is updated.
	h.setSpans(s.base(), npages, s)

	if !typ.manual() {
		// Mark in-use span in arena page bitmap.
		//
		// This publishes the span to the page sweeper, so
		// it's imperative that the span be completely initialized
		// prior to this line.
		arena, pageIdx, pageMask := pageIndexOf(s.base())
		atomic.Or8(&arena.pageInUse[pageIdx], pageMask)

		// Update related page sweeper stats.
		atomic.Xadd64(&h.pagesInUse, int64(npages))
	}

	// Make sure the newly allocated span will be observed
	// by the GC before pointers into the span are published.
	publicationBarrier()

	return s
}
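
// A note on callers (informational): in this version allocSpan is
// reached via mheap.alloc for ordinary heap spans and via
// mheap.allocManual for manually-managed spans (stacks, GC work
// buffers, and similar); mheap.alloc switches to the system stack
// itself, while allocManual is already restricted to the system stack.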

// Try to add at least npage pages of memory to the heap,
// returning whether it worked.
//
// h.lock must be held.
func (h *mheap) grow(npage uintptr) bool {
	assertLockHeld(&h.lock)

	// We must grow the heap in whole palloc chunks.
	// We call sysMap below but note that because we
	// round up to pallocChunkPages, which is on the order
	// of MiB (generally >= the huge page size), we
	// won't be calling it too much.
	ask := alignUp(npage, pallocChunkPages) * pageSize
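
	// Illustrative example (assuming the 8 KiB runtime page size and
	// 512-page palloc chunks used in this version): ask is always a
	// multiple of 4 MiB, so even a single-page growth request prepares
	// at least 4 MiB of address space here.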

	totalGrowth := uintptr(0)
	// This may overflow because ask could be very large
	// and is otherwise unrelated to h.curArena.base.
	end := h.curArena.base + ask
	nBase := alignUp(end, physPageSize)
	if nBase > h.curArena.end || /* overflow */ end < h.curArena.base {
		// Not enough room in the current arena. Allocate more
		// arena space. This may not be contiguous with the
		// current arena, so we have to request the full ask.
		av, asize := h.sysAlloc(ask)
		if av == nil {
			print("runtime: out of memory: cannot allocate ", ask, "-byte block (", memstats.heap_sys, " in use)\n")
			return false
		}

		if uintptr(av) == h.curArena.end {
			// The new space is contiguous with the old
			// space, so just extend the current space.
			h.curArena.end = uintptr(av) + asize
		} else {
			// The new space is discontiguous. Track what
			// remains of the current space and switch to
			// the new space. This should be rare.
			if size := h.curArena.end - h.curArena.base; size != 0 {
				// Transition this space from Reserved to Prepared and mark it
				// as released since we'll be able to start using it after updating
				// the page allocator and releasing the lock at any time.
				sysMap(unsafe.Pointer(h.curArena.base), size, &memstats.heap_sys)
				// Update stats.
				atomic.Xadd64(&memstats.heap_released, int64(size))
				stats := memstats.heapStats.acquire()
				atomic.Xaddint64(&stats.released, int64(size))
				memstats.heapStats.release()
				// Update the page allocator's structures to make this
				// space ready for allocation.
				h.pages.grow(h.curArena.base, size)
				totalGrowth += size
			}
			// Switch to the new space.
			h.curArena.base = uintptr(av)
			h.curArena.end = uintptr(av) + asize
		}

		// Recalculate nBase.
		// We know this won't overflow, because sysAlloc returned
		// a valid region starting at h.curArena.base which is at
		// least ask bytes in size.
		nBase = alignUp(h.curArena.base+ask, physPageSize)
	}

	// Grow into the current arena.
	v := h.curArena.base
	h.curArena.base = nBase

	// Transition the space we're going to use from Reserved to Prepared.
	sysMap(unsafe.Pointer(v), nBase-v, &memstats.heap_sys)

	// The memory just allocated counts as both released
	// and idle, even though it's not yet backed by spans.
	//
	// The allocation is always aligned to the heap arena
	// size, which is always > physPageSize, so it's safe to
	// just add directly to heap_released.
	atomic.Xadd64(&memstats.heap_released, int64(nBase-v))
	stats := memstats.heapStats.acquire()
	atomic.Xaddint64(&stats.released, int64(nBase-v))
	memstats.heapStats.release()

	// Update the page allocator's structures to make this
	// space ready for allocation.
	h.pages.grow(v, nBase-v)
	totalGrowth += nBase - v

	// We just caused a heap growth, so scavenge down what will soon be used.
	// By scavenging inline we deal with the failure to allocate out of
	// memory fragments by scavenging the memory fragments that are least
	// likely to be re-used.
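	//
	// Illustrative example with hypothetical numbers: with a scavenge
	// goal of 100 MiB, 98 MiB retained, and an 8 MiB growth, the overage
	// is 6 MiB, so we scavenge min(8 MiB, 6 MiB) = 6 MiB rather than the
	// whole growth.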
	if retained := heapRetained(); retained+uint64(totalGrowth) > h.scavengeGoal {
		todo := totalGrowth
		if overage := uintptr(retained + uint64(totalGrowth) - h.scavengeGoal); todo > overage {
			todo = overage
		}
		h.pages.scavenge(todo, false)
	}
	return true
}

// Free the span back into the heap.
func (h *mheap) freeSpan(s *mspan) {
	systemstack(func() {
		lock(&h.lock)
		if msanenabled {
			// Tell msan that this entire span is no longer in use.
			base := unsafe.Pointer(s.base())
			bytes := s.npages << _PageShift
			msanfree(base, bytes)
		}
		h.freeSpanLocked(s, spanAllocHeap)
		unlock(&h.lock)
	})
}

// freeManual frees a manually-managed span returned by allocManual.
// typ must be the same as the spanAllocType passed to the allocManual that
// allocated s.
//
// This must only be called when gcphase == _GCoff. See mSpanState for
// an explanation.
//
// freeManual must be called on the system stack because it acquires
// the heap lock. See mheap for details.
//
//go:systemstack
func (h *mheap) freeManual(s *mspan, typ spanAllocType) {
	s.needzero = 1
	lock(&h.lock)
	h.freeSpanLocked(s, typ)
	unlock(&h.lock)
}

func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) {
	assertLockHeld(&h.lock)

	switch s.state.get() {
	case mSpanManual:
		if s.allocCount != 0 {
			throw("mheap.freeSpanLocked - invalid stack free")
		}
	case mSpanInUse:
		if s.allocCount != 0 || s.sweepgen != h.sweepgen {
			print("mheap.freeSpanLocked - span ", s, " ptr ", hex(s.base()), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n")
			throw("mheap.freeSpanLocked - invalid free")
		}
		atomic.Xadd64(&h.pagesInUse, -int64(s.npages))

		// Clear in-use bit in arena page bitmap.
		arena, pageIdx, pageMask := pageIndexOf(s.base())
		atomic.And8(&arena.pageInUse[pageIdx], ^pageMask)
	default:
		throw("mheap.freeSpanLocked - invalid span state")
	}

	// Update stats.
	//
	// Mirrors the code in allocSpan.
	nbytes := s.npages * pageSize
	if typ == spanAllocHeap {
		atomic.Xadd64(&memstats.heap_inuse, -int64(nbytes))
	}
	if typ.manual() {
		// Manually managed memory doesn't count toward heap_sys, so add it back.
		memstats.heap_sys.add(int64(nbytes))
	}
	// Update consistent stats.
	stats := memstats.heapStats.acquire()
	switch typ {
	case spanAllocHeap:
		atomic.Xaddint64(&stats.inHeap, -int64(nbytes))
	case spanAllocStack:
		atomic.Xaddint64(&stats.inStacks, -int64(nbytes))
	case spanAllocPtrScalarBits:
		atomic.Xaddint64(&stats.inPtrScalarBits, -int64(nbytes))
	case spanAllocWorkBuf:
		atomic.Xaddint64(&stats.inWorkBufs, -int64(nbytes))
	}
	memstats.heapStats.release()

	// Mark the space as free.
	h.pages.free(s.base(), s.npages)

	// Free the span structure. We no longer have a use for it.
	s.state.set(mSpanDead)
	h.freeMSpanLocked(s)
}

// scavengeAll acquires the heap lock (blocking any additional
// manipulation of the page allocator) and iterates over the whole
// heap, scavenging every free page available.
func (h *mheap) scavengeAll() {
	// Disallow malloc or panic while holding the heap lock. We do
	// this here because this is a non-mallocgc entry-point to
	// the mheap API.
	gp := getg()
	gp.m.mallocing++
	lock(&h.lock)
	// Start a new scavenge generation so we have a chance to walk
	// over the whole heap.
	h.pages.scavengeStartGen()
	released := h.pages.scavenge(^uintptr(0), false)
	gen := h.pages.scav.gen
	unlock(&h.lock)
	gp.m.mallocing--

	if debug.scavtrace > 0 {
		printScavTrace(gen, released, true)
	}
}

//go:linkname runtime_debug_freeOSMemory runtime/debug.freeOSMemory
func runtime_debug_freeOSMemory() {
	GC()
	systemstack(func() { mheap_.scavengeAll() })
}
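
// For reference, the user-facing entry point above is
// runtime/debug.FreeOSMemory. A minimal caller (outside the runtime)
// looks like:
//
//	import "runtime/debug"
//
//	func releaseUnusedMemory() {
//		// Forces a garbage collection, then returns as much memory
//		// to the OS as possible.
//		debug.FreeOSMemory()
//	}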

// Initialize a new span with the given start and npages.
func (span *mspan) init(base uintptr, npages uintptr) {
	// span is *not* zeroed.
	span.next = nil
	span.prev = nil
	span.list = nil
	span.startAddr = base
	span.npages = npages
	span.allocCount = 0
	span.spanclass = 0
	span.elemsize = 0
	span.speciallock.key = 0
	span.specials = nil
	span.needzero = 0
	span.freeindex = 0
	span.allocBits = nil
	span.gcmarkBits = nil
	span.state.set(mSpanDead)
	lockInit(&span.speciallock, lockRankMspanSpecial)
}

func (span *mspan) inList() bool {
	return span.list != nil
}

// Initialize an empty doubly-linked list.
func (list *mSpanList) init() {
	list.first = nil
	list.last = nil
}

func (list *mSpanList) remove(span *mspan) {
	if span.list != list {
		print("runtime: failed mSpanList.remove span.npages=", span.npages,
			" span=", span, " prev=", span.prev, " span.list=", span.list, " list=", list, "\n")
		throw("mSpanList.remove")
	}
	if list.first == span {
		list.first = span.next
	} else {
		span.prev.next = span.next
	}
	if list.last == span {
		list.last = span.prev
	} else {
		span.next.prev = span.prev
	}
	span.next = nil
	span.prev = nil
	span.list = nil
}

func (list *mSpanList) isEmpty() bool {
	return list.first == nil
}

func (list *mSpanList) insert(span *mspan) {
	if span.next != nil || span.prev != nil || span.list != nil {
		println("runtime: failed mSpanList.insert", span, span.next, span.prev, span.list)
		throw("mSpanList.insert")
	}
	span.next = list.first
	if list.first != nil {
		// The list contains at least one span; link it in.
		// The last span in the list doesn't change.
		list.first.prev = span
	} else {
		// The list contains no spans, so this is also the last span.
		list.last = span
	}
	list.first = span
	span.list = list
}

func (list *mSpanList) insertBack(span *mspan) {
	if span.next != nil || span.prev != nil || span.list != nil {
		println("runtime: failed mSpanList.insertBack", span, span.next, span.prev, span.list)
		throw("mSpanList.insertBack")
	}
	span.prev = list.last
	if list.last != nil {
		// The list contains at least one span.
		list.last.next = span
	} else {
		// The list contains no spans, so this is also the first span.
		list.first = span
	}
	list.last = span
	span.list = list
}

// takeAll removes all spans from other and inserts them at the front
// of list.
func (list *mSpanList) takeAll(other *mSpanList) {
	if other.isEmpty() {
		return
	}

	// Reparent everything in other to list.
	for s := other.first; s != nil; s = s.next {
		s.list = list
	}

	// Concatenate the lists.
	if list.isEmpty() {
		*list = *other
	} else {
		// Neither list is empty. Put other before list.
		other.last.next = list.first
		list.first.prev = other.last
		list.first = other.first
	}

	other.first, other.last = nil, nil
}
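
// Note: takeAll is O(len(other)) because every moved span must have its
// list pointer repointed at list; the splice of the list heads themselves
// is constant time.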

const (
	_KindSpecialFinalizer = 1
	_KindSpecialProfile   = 2
	// _KindSpecialReachable is a special used for tracking
	// reachability during testing.
	_KindSpecialReachable = 3
	// Note: The finalizer special must be first because if we're freeing
	// an object, a finalizer special will cause the freeing operation
	// to abort, and we want to keep the other special records around
	// if that happens.
)

//go:notinheap
type special struct {
	next   *special // linked list in span
	offset uint16   // span offset of object
	kind   byte     // kind of special
}
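
// The specials list hanging off each mspan is kept sorted by
// (offset, kind); addspecial below preserves that order so the sweeper
// can walk a span's objects and their specials in lockstep.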

// spanHasSpecials marks a span as having specials in the arena bitmap.
func spanHasSpecials(s *mspan) {
	arenaPage := (s.base() / pageSize) % pagesPerArena
	ai := arenaIndex(s.base())
	ha := mheap_.arenas[ai.l1()][ai.l2()]
	atomic.Or8(&ha.pageSpecials[arenaPage/8], uint8(1)<<(arenaPage%8))
}

// spanHasNoSpecials marks a span as having no specials in the arena bitmap.
func spanHasNoSpecials(s *mspan) {
	arenaPage := (s.base() / pageSize) % pagesPerArena
	ai := arenaIndex(s.base())
	ha := mheap_.arenas[ai.l1()][ai.l2()]
	atomic.And8(&ha.pageSpecials[arenaPage/8], ^(uint8(1) << (arenaPage % 8)))
}
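
// Illustrative example: for a span whose base falls on page index 10 of
// its arena, arenaPage is 10, so these helpers set or clear bit 2
// (10%8) of pageSpecials[1] (10/8) for that arena.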

// Adds the special record s to the list of special records for
// the object p. All fields of s should be filled in except for
// offset & next, which this routine will fill in.
// Returns true if the special was successfully added, false otherwise.
// (The add will fail only if a record with the same p and s->kind
// already exists.)
func addspecial(p unsafe.Pointer, s *special) bool {
	span := spanOfHeap(uintptr(p))
	if span == nil {
		throw("addspecial on invalid pointer")
	}

	// Ensure that the span is swept.
	// Sweeping accesses the specials list w/o locks, so we have
	// to synchronize with it. And it's just much safer.
	mp := acquirem()
	span.ensureSwept()

	offset := uintptr(p) - span.base()
	kind := s.kind

	lock(&span.speciallock)

	// Find splice point, check for existing record.
	t := &span.specials
	for {
		x := *t
		if x == nil {
			break
		}
		if offset == uintptr(x.offset) && kind == x.kind {
			unlock(&span.speciallock)
			releasem(mp)
			return false // already exists
		}
		if offset < uintptr(x.offset) || (offset == uintptr(x.offset) && kind < x.kind) {
			break
		}
		t = &x.next
	}

	// Splice in record, fill in offset.
	s.offset = uint16(offset)
	s.next = *t
	*t = s
	spanHasSpecials(span)
	unlock(&span.speciallock)
	releasem(mp)

	return true
}
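
// Informational note: addspecial and removespecial are the shared
// machinery behind the finalizer, memory-profile, and (test-only)
// reachability specials; callers such as addfinalizer allocate the
// kind-specific record and attach it here. See the call sites for the
// authoritative details.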
|
|
|
|
|
|
|
|
|
|
// Removes the Special record of the given kind for the object p.
|
|
|
|
|
// Returns the record if the record existed, nil otherwise.
|
|
|
|
|
// The caller must FixAlloc_Free the result.
|
|
|
|
|
func removespecial(p unsafe.Pointer, kind uint8) *special {
|
2017-12-04 10:58:15 -05:00
|
|
|
span := spanOfHeap(uintptr(p))
|
2014-11-11 17:05:02 -05:00
|
|
|
if span == nil {
|
2014-12-27 20:58:00 -08:00
|
|
|
throw("removespecial on invalid pointer")
|
2014-11-11 17:05:02 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Ensure that the span is swept.
|
	// Sweeping accesses the specials list w/o locks, so we have
	// to synchronize with it. And it's just much safer.
	mp := acquirem()
	span.ensureSwept()

	offset := uintptr(p) - span.base()

	var result *special
	lock(&span.speciallock)
	t := &span.specials
	for {
		s := *t
		if s == nil {
			break
		}
		// This function is used for finalizers only, so we don't check for
		// "interior" specials (p must be exactly equal to s->offset).
		if offset == uintptr(s.offset) && kind == s.kind {
			*t = s.next
			result = s
			break
		}
		t = &s.next
	}
	if span.specials == nil {
		spanHasNoSpecials(span)
	}
	unlock(&span.speciallock)
	releasem(mp)
	return result
}
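
// Once the list is empty, the span's bit in its arena's specials bitmap is
// cleared via spanHasNoSpecials, so the concurrent scan of specials can skip
// this span entirely on later cycles.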

// The described object has a finalizer set for it.
//
// specialfinalizer is allocated from non-GC'd memory, so any heap
// pointers must be specially handled.
//
//go:notinheap
type specialfinalizer struct {
	special special
	fn      *funcval // May be a heap pointer.
	nret    uintptr
	fint    *_type   // May be a heap pointer, but always live.
	ot      *ptrtype // May be a heap pointer, but always live.
}
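
// Roughly speaking, these fields mirror what runtime.SetFinalizer prepares
// before calling addfinalizer: fn is the finalizer closure, nret is the
// rounded-up space needed for its results, fint is the type of its single
// argument, and ot is the pointer type of the object being finalized.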

// Adds a finalizer to the object p. Returns true if it succeeded.
func addfinalizer(p unsafe.Pointer, f *funcval, nret uintptr, fint *_type, ot *ptrtype) bool {
	lock(&mheap_.speciallock)
	s := (*specialfinalizer)(mheap_.specialfinalizeralloc.alloc())
	unlock(&mheap_.speciallock)
	s.special.kind = _KindSpecialFinalizer
	s.fn = f
	s.nret = nret
	s.fint = fint
	s.ot = ot
	if addspecial(p, &s.special) {
		// This is responsible for maintaining the same
		// GC-related invariants as markrootSpans in any
		// situation where it's possible that markrootSpans
		// has already run but mark termination hasn't yet.
		if gcphase != _GCoff {
			base, _, _ := findObject(uintptr(p), 0, 0)
			mp := acquirem()
			gcw := &mp.p.ptr().gcw
			// Mark everything reachable from the object
			// so it's retained for the finalizer.
			scanobject(base, gcw)
			// Mark the finalizer itself, since the
			// special isn't part of the GC'd heap.
			scanblock(uintptr(unsafe.Pointer(&s.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil)
			releasem(mp)
		}
		return true
	}

	// There was an old finalizer
	lock(&mheap_.speciallock)
	mheap_.specialfinalizeralloc.free(unsafe.Pointer(s))
	unlock(&mheap_.speciallock)
	return false
}
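
// A minimal sketch of the caller side (runtime.SetFinalizer, heavily
// simplified): after validating the object and the finalizer's signature, it
// hands these pieces to addfinalizer and treats a false return as a
// duplicate registration, roughly:
//
//	if !addfinalizer(e.data, (*funcval)(f.data), nret, fint, ot) {
//		throw("runtime.SetFinalizer: finalizer already set")
//	}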

// Removes the finalizer (if any) from the object p.
func removefinalizer(p unsafe.Pointer) {
	s := (*specialfinalizer)(unsafe.Pointer(removespecial(p, _KindSpecialFinalizer)))
	if s == nil {
		return // there wasn't a finalizer to remove
	}
	lock(&mheap_.speciallock)
	mheap_.specialfinalizeralloc.free(unsafe.Pointer(s))
	unlock(&mheap_.speciallock)
}

// The described object is being heap profiled.
//
//go:notinheap
type specialprofile struct {
	special special
	b       *bucket
}

// Set the heap profile bucket associated with addr to b.
func setprofilebucket(p unsafe.Pointer, b *bucket) {
	lock(&mheap_.speciallock)
	s := (*specialprofile)(mheap_.specialprofilealloc.alloc())
	unlock(&mheap_.speciallock)
	s.special.kind = _KindSpecialProfile
	s.b = b
	if !addspecial(p, &s.special) {
		throw("setprofilebucket: profile already set")
	}
}
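
// setprofilebucket is reached from the heap profiler's allocation path
// (mProf_Malloc) for sampled allocations, so each object carries at most one
// profile special; a second registration for the same object throws above.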

// specialReachable tracks whether an object is reachable on the next
// GC cycle. This is used by testing.
type specialReachable struct {
	special   special
	done      bool
	reachable bool
}

// specialsIter helps iterate over specials lists.
type specialsIter struct {
	pprev **special
	s     *special
}

func newSpecialsIter(span *mspan) specialsIter {
	return specialsIter{&span.specials, span.specials}
}

func (i *specialsIter) valid() bool {
	return i.s != nil
}

func (i *specialsIter) next() {
	i.pprev = &i.s.next
	i.s = *i.pprev
}

// unlinkAndNext removes the current special from the list and moves
// the iterator to the next special. It returns the unlinked special.
func (i *specialsIter) unlinkAndNext() *special {
	cur := i.s
	i.s = cur.next
	*i.pprev = i.s
	return cur
}
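
// A rough sketch of how the sweeper consumes this iterator (simplified, not
// the verbatim code in mspan.sweep; objectIsDead, objPointer, and size stand
// in for the sweeper's real liveness check and object address/size): specials
// of dead objects are unlinked and handed to freeSpecial, the rest are
// skipped over.
//
//	siter := newSpecialsIter(span)
//	for siter.valid() {
//		if objectIsDead(siter.s) {
//			special := siter.unlinkAndNext()
//			freeSpecial(special, objPointer, size)
//		} else {
//			siter.next()
//		}
//	}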

// freeSpecial performs any cleanup on special s and deallocates it.
// s must already be unlinked from the specials list.
func freeSpecial(s *special, p unsafe.Pointer, size uintptr) {
	switch s.kind {
	case _KindSpecialFinalizer:
		sf := (*specialfinalizer)(unsafe.Pointer(s))
		queuefinalizer(p, sf.fn, sf.nret, sf.fint, sf.ot)
		lock(&mheap_.speciallock)
		mheap_.specialfinalizeralloc.free(unsafe.Pointer(sf))
		unlock(&mheap_.speciallock)
	case _KindSpecialProfile:
		sp := (*specialprofile)(unsafe.Pointer(s))
		mProf_Free(sp.b, size)
		lock(&mheap_.speciallock)
		mheap_.specialprofilealloc.free(unsafe.Pointer(sp))
		unlock(&mheap_.speciallock)
	case _KindSpecialReachable:
		sp := (*specialReachable)(unsafe.Pointer(s))
		sp.done = true
		// The creator frees these.
	default:
		throw("bad special kind")
		panic("not reached")
	}
}
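
// Note that for finalizer specials freeSpecial does not execute the
// finalizer itself: queuefinalizer only queues it for the finalizer
// goroutine, so sweeping never runs user code directly.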

// gcBits is an alloc/mark bitmap. This is always used as *gcBits.
//
//go:notinheap
type gcBits uint8

// bytep returns a pointer to the n'th byte of b.
func (b *gcBits) bytep(n uintptr) *uint8 {
	return addb((*uint8)(b), n)
}

// bitp returns a pointer to the byte containing bit n and a mask for
// selecting that bit from *bytep.
func (b *gcBits) bitp(n uintptr) (bytep *uint8, mask uint8) {
	return b.bytep(n / 8), 1 << (n % 8)
}
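
// Typical use of bitp (a sketch of the pattern used by the span's markBits
// helpers, not a new API): fetch the byte pointer and mask once, then test
// or set the bit.
//
//	bytep, mask := bits.bitp(n)
//	isMarked := *bytep&mask != 0 // test bit n
//	atomic.Or8(bytep, mask)      // set bit n atomically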

const gcBitsChunkBytes = uintptr(64 << 10)
const gcBitsHeaderBytes = unsafe.Sizeof(gcBitsHeader{})

type gcBitsHeader struct {
	free uintptr // free is the index into bits of the next free byte.
	next uintptr // *gcBits triggers recursive type bug. (issue 14620)
}

//go:notinheap
type gcBitsArena struct {
	// gcBitsHeader // side step recursive type bug (issue 14620) by including fields by hand.
	free uintptr // free is the index into bits of the next free byte; read/write atomically
	next *gcBitsArena
	bits [gcBitsChunkBytes - gcBitsHeaderBytes]gcBits
}

var gcBitsArenas struct {
	lock     mutex
	free     *gcBitsArena
	next     *gcBitsArena // Read atomically. Write atomically under lock.
	current  *gcBitsArena
	previous *gcBitsArena
}

// tryAlloc allocates from b or returns nil if b does not have enough room.
// This is safe to call concurrently.
func (b *gcBitsArena) tryAlloc(bytes uintptr) *gcBits {
	if b == nil || atomic.Loaduintptr(&b.free)+bytes > uintptr(len(b.bits)) {
		return nil
	}
	// Try to allocate from this block.
	end := atomic.Xadduintptr(&b.free, bytes)
	if end > uintptr(len(b.bits)) {
		return nil
	}
	// There was enough room.
	start := end - bytes
	return &b.bits[start]
}
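
// The bump allocation above is lock-free: racing callers may push b.free
// past len(b.bits), in which case the losers return nil and fall back to the
// locked path in newMarkBits. The overshot bytes at the end of the arena are
// simply left unused.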

// newMarkBits returns a pointer to 8 byte aligned bytes
// to be used for a span's mark bits.
func newMarkBits(nelems uintptr) *gcBits {
	blocksNeeded := uintptr((nelems + 63) / 64)
	bytesNeeded := blocksNeeded * 8

	// Try directly allocating from the current head arena.
	head := (*gcBitsArena)(atomic.Loadp(unsafe.Pointer(&gcBitsArenas.next)))
	if p := head.tryAlloc(bytesNeeded); p != nil {
		return p
	}

	// There's not enough room in the head arena. We may need to
	// allocate a new arena.
	lock(&gcBitsArenas.lock)
	// Try the head arena again, since it may have changed. Now
	// that we hold the lock, the list head can't change, but its
	// free position still can.
	if p := gcBitsArenas.next.tryAlloc(bytesNeeded); p != nil {
		unlock(&gcBitsArenas.lock)
		return p
	}

	// Allocate a new arena. This may temporarily drop the lock.
	fresh := newArenaMayUnlock()
	// If newArenaMayUnlock dropped the lock, another thread may
	// have put a fresh arena on the "next" list. Try allocating
	// from next again.
	if p := gcBitsArenas.next.tryAlloc(bytesNeeded); p != nil {
		// Put fresh back on the free list.
		// TODO: Mark it "already zeroed"
		fresh.next = gcBitsArenas.free
		gcBitsArenas.free = fresh
		unlock(&gcBitsArenas.lock)
		return p
	}

	// Allocate from the fresh arena. We haven't linked it in yet, so
	// this cannot race and is guaranteed to succeed.
	p := fresh.tryAlloc(bytesNeeded)
	if p == nil {
		throw("markBits overflow")
	}

	// Add the fresh arena to the "next" list.
	fresh.next = gcBitsArenas.next
	atomic.StorepNoWB(unsafe.Pointer(&gcBitsArenas.next), unsafe.Pointer(fresh))

	unlock(&gcBitsArenas.lock)
	return p
}
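
// Sizing example (arithmetic only): a span with nelems = 512 objects needs
// (512+63)/64 = 8 64-bit blocks, i.e. 64 bytes of mark bits, while a span
// with nelems = 10 still rounds up to a single block of 8 bytes.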

// newAllocBits returns a pointer to 8 byte aligned bytes
// to be used for this span's alloc bits.
// newAllocBits is used to provide newly initialized spans
// allocation bits. For spans not being initialized the
// mark bits are repurposed as allocation bits when
// the span is swept.
func newAllocBits(nelems uintptr) *gcBits {
	return newMarkBits(nelems)
}

// nextMarkBitArenaEpoch establishes a new epoch for the arenas
// holding the mark bits. The arenas are named relative to the
// current GC cycle which is demarcated by the call to finishweep_m.
//
// All current spans have been swept.
// During that sweep each span allocated room for its gcmarkBits in
// gcBitsArenas.next block. gcBitsArenas.next becomes the gcBitsArenas.current
// where the GC will mark objects and after each span is swept these bits
// will be used to allocate objects.
// gcBitsArenas.current becomes gcBitsArenas.previous where the span's
// gcAllocBits live until all the spans have been swept during this GC cycle.
// The span's sweep extinguishes all the references to gcBitsArenas.previous
// by pointing gcAllocBits into the gcBitsArenas.current.
// The gcBitsArenas.previous is released to the gcBitsArenas.free list.
func nextMarkBitArenaEpoch() {
	lock(&gcBitsArenas.lock)
	if gcBitsArenas.previous != nil {
		if gcBitsArenas.free == nil {
			gcBitsArenas.free = gcBitsArenas.previous
		} else {
			// Find end of previous arenas.
			last := gcBitsArenas.previous
			for last = gcBitsArenas.previous; last.next != nil; last = last.next {
			}
			last.next = gcBitsArenas.free
			gcBitsArenas.free = gcBitsArenas.previous
		}
	}
	gcBitsArenas.previous = gcBitsArenas.current
	gcBitsArenas.current = gcBitsArenas.next
	atomic.StorepNoWB(unsafe.Pointer(&gcBitsArenas.next), nil) // newMarkBits calls newArena when needed
	unlock(&gcBitsArenas.lock)
}
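
// In short, the rotation each cycle is: previous arenas are returned to the
// free list, current becomes previous, next becomes current, and next is
// reset to nil so the first newMarkBits call of the new cycle allocates a
// fresh arena via newArenaMayUnlock.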

// newArenaMayUnlock allocates and zeroes a gcBits arena.
// The caller must hold gcBitsArena.lock. This may temporarily release it.
func newArenaMayUnlock() *gcBitsArena {
	var result *gcBitsArena
	if gcBitsArenas.free == nil {
		unlock(&gcBitsArenas.lock)
		result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gcMiscSys))
		if result == nil {
			throw("runtime: cannot allocate memory")
		}
		lock(&gcBitsArenas.lock)
	} else {
		result = gcBitsArenas.free
		gcBitsArenas.free = gcBitsArenas.free.next
		memclrNoHeapPointers(unsafe.Pointer(result), gcBitsChunkBytes)
	}
	result.next = nil
	// If result.bits is not 8 byte aligned adjust index so
	// that &result.bits[result.free] is 8 byte aligned.
	if uintptr(unsafe.Offsetof(gcBitsArena{}.bits))&7 == 0 {
		result.free = 0
	} else {
		result.free = 8 - (uintptr(unsafe.Pointer(&result.bits[0])) & 7)
	}
	return result
}