| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | // Copyright 2014 The Go Authors. All rights reserved. | 
					
						
							|  |  |  | // Use of this source code is governed by a BSD-style | 
					
						
							|  |  |  | // license that can be found in the LICENSE file. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | // Memory allocator. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This was originally based on tcmalloc, but has diverged quite a bit. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | // http://goog-perftools.sourceforge.net/doc/tcmalloc.html | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // The main allocator works in runs of pages. | 
					
						
							|  |  |  | // Small allocation sizes (up to and including 32 kB) are | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | // rounded to one of about 70 size classes, each of which | 
					
						
							|  |  |  | // has its own free set of objects of exactly that size. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | // Any free page of memory can be split into a set of objects | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | // of one size class, which are then managed using a free bitmap. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | // | 
					
						
							|  |  |  | // The allocator's data structures are: | 
					
						
							|  |  |  | // | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | //	fixalloc: a free-list allocator for fixed-size off-heap objects, | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | //		used to manage storage used by the allocator. | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | //	mheap: the malloc heap, managed at page (8192-byte) granularity. | 
					
						
							| 
									
										
										
										
											2019-10-28 20:38:59 +00:00
										 |  |  | //	mspan: a run of in-use pages managed by the mheap. | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | //	mcentral: collects all spans of a given size class. | 
					
						
							|  |  |  | //	mcache: a per-P cache of mspans with free space. | 
					
						
							|  |  |  | //	mstats: allocation statistics. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | // | 
					
						
							|  |  |  | // Allocating a small object proceeds up a hierarchy of caches: | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | //	1. Round the size up to one of the small size classes | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | //	   and look in the corresponding mspan in this P's mcache. | 
					
						
							|  |  |  | //	   Scan the mspan's free bitmap to find a free slot. | 
					
						
							|  |  |  | //	   If there is a free slot, allocate it. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | //	   This can all be done without acquiring a lock. | 
					
						
							|  |  |  | // | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | //	2. If the mspan has no free slots, obtain a new mspan | 
					
						
							|  |  |  | //	   from the mcentral's list of mspans of the required size | 
					
						
							|  |  |  | //	   class that have free space. | 
					
						
							|  |  |  | //	   Obtaining a whole span amortizes the cost of locking | 
					
						
							|  |  |  | //	   the mcentral. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | //	3. If the mcentral's mspan list is empty, obtain a run | 
					
						
							|  |  |  | //	   of pages from the mheap to use for the mspan. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | //	4. If the mheap is empty or has no page runs large enough, | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | //	   allocate a new group of pages (at least 1MB) from the | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | //	   operating system. Allocating a large run of pages | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | //	   amortizes the cost of talking to the operating system. | 
					
						
							|  |  |  | // | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | // Sweeping an mspan and freeing objects on it proceeds up a similar | 
					
						
							|  |  |  | // hierarchy: | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | //	1. If the mspan is being swept in response to allocation, it | 
					
						
							|  |  |  | //	   is returned to the mcache to satisfy the allocation. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | //	2. Otherwise, if the mspan still has allocated objects in it, | 
					
						
							|  |  |  | //	   it is placed on the mcentral free list for the mspan's size | 
					
						
							|  |  |  | //	   class. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2019-10-28 20:38:59 +00:00
										 |  |  | //	3. Otherwise, if all objects in the mspan are free, the mspan's | 
					
						
							|  |  |  | //	   pages are returned to the mheap and the mspan is now dead. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | // Allocating and freeing a large object uses the mheap | 
					
						
							|  |  |  | // directly, bypassing the mcache and mcentral. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2020-02-05 01:21:11 +00:00
										 |  |  | // If mspan.needzero is false, then free object slots in the mspan are | 
					
						
							|  |  |  | // already zeroed. Otherwise if needzero is true, objects are zeroed as | 
					
						
							|  |  |  | // they are allocated. There are various benefits to delaying zeroing | 
					
						
							|  |  |  | // this way: | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | //	1. Stack frame allocation can avoid zeroing altogether. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | //	2. It exhibits better temporal locality, since the program is | 
					
						
							|  |  |  | //	   probably about to write to the memory. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2016-09-23 11:47:24 -04:00
										 |  |  | //	3. We don't zero pages that never get reused. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | // Virtual memory layout | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // The heap consists of a set of arenas, which are 64MB on 64-bit and | 
					
						
							|  |  |  | // 4MB on 32-bit (heapArenaBytes). Each arena's start address is also | 
					
						
							|  |  |  | // aligned to the arena size. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // Each arena has an associated heapArena object that stores the | 
					
						
							|  |  |  | // metadata for that arena: the heap bitmap for all words in the arena | 
					
						
							|  |  |  | // and the span map for all pages in the arena. heapArena objects are | 
					
						
							|  |  |  | // themselves allocated off-heap. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // Since arenas are aligned, the address space can be viewed as a | 
					
						
							| 
									
										
										
										
											2018-02-22 12:35:30 -05:00
										 |  |  | // series of arena frames. The arena map (mheap_.arenas) maps from | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | // arena frame number to *heapArena, or nil for parts of the address | 
					
						
							| 
									
										
											  
											
												runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.
2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-22 20:38:09 -05:00
										 |  |  | // space not backed by the Go heap. The arena map is structured as a | 
					
						
							|  |  |  | // two-level array consisting of a "L1" arena map and many "L2" arena | 
					
						
							|  |  |  | // maps; however, since arenas are large, on many architectures, the | 
					
						
							|  |  |  | // arena map consists of a single, large L2 map. | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2018-02-22 12:35:30 -05:00
										 |  |  | // The arena map covers the entire possible address space, allowing | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | // the Go heap to use any part of the address space. The allocator | 
					
						
							|  |  |  | // attempts to keep arenas contiguous so that large spans (and hence | 
					
						
							|  |  |  | // large objects) can cross arenas. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | package runtime | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-11-11 12:39:30 -05:00
										 |  |  | import ( | 
					
						
							| 
									
										
										
										
											2021-06-17 19:10:18 +00:00
										 |  |  | 	"internal/goarch" | 
					
						
							| 
									
										
											  
											
												runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size
classes with allocation headers for objects that contain pointers. The
header consists of a single pointer to a type. All allocations with
headers are treated as implicitly containing one or more instances of
the type in the header.
As the name implies, headers are usually stored as the first word of an
object. There are two additional exceptions to where headers are stored
and how they're used.
Objects smaller than 512 bytes do not have headers. Instead, a heap
bitmap is reserved at the end of spans for objects of this size. A full
word of overhead is too much for these small objects. The bitmap is of
the same format of the old bitmap, minus the noMorePtrs bits which are
unnecessary. All the objects <512 bytes have a bitmap less than a
pointer-word in size, and that was the granularity at which noMorePtrs
could stop scanning early anyway.
Objects that are larger than 32 KiB (which have their own span) have
their headers stored directly in the span, to allow power-of-two-sized
allocations to not spill over into an extra page.
The full implementation is behind GOEXPERIMENT=allocheaders.
The purpose of this change is performance. First and foremost, with
headers we no longer have to unroll pointer/scalar data at allocation
time for most size classes. Small size classes still need some
unrolling, but their bitmaps are small so we can optimize that case
fairly well. Larger objects effectively have their pointer/scalar data
unrolled on-demand from type data, which is much more compactly
represented and results in less TLB pressure. Furthermore, since the
headers are usually right next to the object and where we're about to
start scanning, we get an additional temporal locality benefit in the
data cache when looking up type metadata. The pointer/scalar data is
now effectively unrolled on-demand, but it's also simpler to unroll than
before; that unrolled data is never written anywhere, and for arrays we
get the benefit of retreading the same data per element, as opposed to
looking it up from scratch for each pointer-word of bitmap. Lastly,
because we no longer have a heap bitmap that spans the entire heap,
there's a flat 1.5% memory use reduction. This is balanced slightly by
some objects possibly being bumped up a size class, but most objects are
not tightly optimized to size class sizes so there's some memory to
spare, making the header basically free in those cases.
See the follow-up CL which turns on this experiment by default for
benchmark results. (CL 538217.)
Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
											
										 
											2022-09-11 04:07:41 +00:00
										 |  |  | 	"internal/goexperiment" | 
					
						
							| 
									
										
										
										
											2021-06-16 21:52:24 +00:00
										 |  |  | 	"internal/goos" | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 	"runtime/internal/atomic" | 
					
						
							| 
									
										
										
										
											2018-10-22 20:47:54 +02:00
										 |  |  | 	"runtime/internal/math" | 
					
						
							| 
									
										
										
										
											2015-11-11 12:39:30 -05:00
										 |  |  | 	"runtime/internal/sys" | 
					
						
							|  |  |  | 	"unsafe" | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | const ( | 
					
						
							| 
									
										
										
										
											2014-09-16 10:22:15 -04:00
										 |  |  | 	maxTinySize   = _TinySize | 
					
						
							|  |  |  | 	tinySizeClass = _TinySizeClass | 
					
						
							|  |  |  | 	maxSmallSize  = _MaxSmallSize | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-16 10:22:15 -04:00
										 |  |  | 	pageShift = _PageShift | 
					
						
							|  |  |  | 	pageSize  = _PageSize | 
					
						
							| 
									
										
										
										
											2014-08-07 13:34:30 +04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-10-26 21:25:56 -07:00
										 |  |  | 	_PageSize = 1 << _PageShift | 
					
						
							|  |  |  | 	_PageMask = _PageSize - 1 | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	// _64bit = 1 on 64-bit systems, 0 on 32-bit systems | 
					
						
							|  |  |  | 	_64bit = 1 << (^uintptr(0) >> 63) / 2 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// Tiny allocator parameters, see "Tiny allocator" comment in malloc.go. | 
					
						
							|  |  |  | 	_TinySize      = 16 | 
					
						
							| 
									
										
										
										
											2016-06-17 09:33:33 -04:00
										 |  |  | 	_TinySizeClass = int8(2) | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-04 21:03:26 +00:00
										 |  |  | 	_FixAllocChunk = 16 << 10 // Chunk size for FixAlloc | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	// Per-P, per order stack segment cache size. | 
					
						
							|  |  |  | 	_StackCacheSize = 32 * 1024 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-03-01 23:21:55 +00:00
										 |  |  | 	// Number of orders that get caching. Order 0 is FixedStack | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 	// and each successive order is twice as large. | 
					
						
							| 
									
										
										
										
											2016-03-01 23:21:55 +00:00
										 |  |  | 	// We want to cache 2KB, 4KB, 8KB, and 16KB stacks. Larger stacks | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 	// will be allocated directly. | 
					
						
							|  |  |  | 	// Since FixedStack is different on different systems, we | 
					
						
							|  |  |  | 	// must vary NumStackOrders to keep the same maximum cached size. | 
					
						
							|  |  |  | 	//   OS               | FixedStack | NumStackOrders | 
					
						
							|  |  |  | 	//   -----------------+------------+--------------- | 
					
						
							|  |  |  | 	//   linux/darwin/bsd | 2KB        | 4 | 
					
						
							|  |  |  | 	//   windows/32       | 4KB        | 3 | 
					
						
							|  |  |  | 	//   windows/64       | 8KB        | 2 | 
					
						
							|  |  |  | 	//   plan9            | 4KB        | 3 | 
					
						
							| 
									
										
										
										
											2021-06-16 21:52:24 +00:00
										 |  |  | 	_NumStackOrders = 4 - goarch.PtrSize/4*goos.IsWindows - 1*goos.IsPlan9 | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-02-20 11:59:02 -05:00
										 |  |  | 	// heapAddrBits is the number of bits in a heap address. On | 
					
						
							|  |  |  | 	// amd64, addresses are sign-extended beyond heapAddrBits. On | 
					
						
							|  |  |  | 	// other arches, they are zero-extended. | 
					
						
							| 
									
										
										
										
											2017-12-08 22:57:53 -05:00
										 |  |  | 	// | 
					
						
							| 
									
										
										
										
											2018-10-01 09:58:40 +02:00
										 |  |  | 	// On most 64-bit platforms, we limit this to 48 bits based on a | 
					
						
							| 
									
										
										
										
											2018-02-20 11:59:02 -05:00
										 |  |  | 	// combination of hardware and OS limitations. | 
					
						
							|  |  |  | 	// | 
					
						
							|  |  |  | 	// amd64 hardware limits addresses to 48 bits, sign-extended | 
					
						
							|  |  |  | 	// to 64 bits. Addresses where the top 16 bits are not either | 
					
						
							|  |  |  | 	// all 0 or all 1 are "non-canonical" and invalid. Because of | 
					
						
							|  |  |  | 	// these "negative" addresses, we offset addresses by 1<<47 | 
					
						
							|  |  |  | 	// (arenaBaseOffset) on amd64 before computing indexes into | 
					
						
							|  |  |  | 	// the heap arenas index. In 2017, amd64 hardware added | 
					
						
							|  |  |  | 	// support for 57 bit addresses; however, currently only Linux | 
					
						
							|  |  |  | 	// supports this extension and the kernel will never choose an | 
					
						
							|  |  |  | 	// address above 1<<47 unless mmap is called with a hint | 
					
						
							|  |  |  | 	// address above 1<<47 (which we never do). | 
					
						
							|  |  |  | 	// | 
					
						
							|  |  |  | 	// arm64 hardware (as of ARMv8) limits user addresses to 48 | 
					
						
							|  |  |  | 	// bits, in the range [0, 1<<48). | 
					
						
							|  |  |  | 	// | 
					
						
							|  |  |  | 	// ppc64, mips64, and s390x support arbitrary 64 bit addresses | 
					
						
							| 
									
										
										
										
											2018-10-01 09:58:40 +02:00
										 |  |  | 	// in hardware. On Linux, Go leans on stricter OS limits. Based | 
					
						
							|  |  |  | 	// on Linux's processor.h, the user address space is limited as | 
					
						
							|  |  |  | 	// follows on 64-bit architectures: | 
					
						
							| 
									
										
										
										
											2018-01-01 17:53:59 -05:00
										 |  |  | 	// | 
					
						
							|  |  |  | 	// Architecture  Name              Maximum Value (exclusive) | 
					
						
							|  |  |  | 	// --------------------------------------------------------------------- | 
					
						
							|  |  |  | 	// amd64         TASK_SIZE_MAX     0x007ffffffff000 (47 bit addresses) | 
					
						
							|  |  |  | 	// arm64         TASK_SIZE_64      0x01000000000000 (48 bit addresses) | 
					
						
							|  |  |  | 	// ppc64{,le}    TASK_SIZE_USER64  0x00400000000000 (46 bit addresses) | 
					
						
							|  |  |  | 	// mips64{,le}   TASK_SIZE64       0x00010000000000 (40 bit addresses) | 
					
						
							|  |  |  | 	// s390x         TASK_SIZE         1<<64 (64 bit addresses) | 
					
						
							|  |  |  | 	// | 
					
						
							| 
									
										
										
										
											2018-02-20 11:59:02 -05:00
										 |  |  | 	// These limits may increase over time, but are currently at | 
					
						
							|  |  |  | 	// most 48 bits except on s390x. On all architectures, Linux | 
					
						
							|  |  |  | 	// starts placing mmap'd regions at addresses that are | 
					
						
							|  |  |  | 	// significantly below 48 bits, so even if it's possible to | 
					
						
							|  |  |  | 	// exceed Go's 48 bit limit, it's extremely unlikely in | 
					
						
							|  |  |  | 	// practice. | 
					
						
							| 
									
										
										
										
											2017-12-08 22:57:53 -05:00
										 |  |  | 	// | 
					
						
							|  |  |  | 	// On 32-bit platforms, we accept the full 32-bit address | 
					
						
							|  |  |  | 	// space because doing so is cheap. | 
					
						
							|  |  |  | 	// mips32 only has access to the low 2GB of virtual memory, so | 
					
						
							|  |  |  | 	// we further limit it to 31 bits. | 
					
						
							| 
									
										
										
										
											2018-03-31 23:14:17 +02:00
										 |  |  | 	// | 
					
						
							| 
									
										
										
										
											2020-09-17 10:53:10 -04:00
										 |  |  | 	// On ios/arm64, although 64-bit pointers are presumably | 
					
						
							| 
									
										
										
										
											2021-08-23 17:27:40 +00:00
										 |  |  | 	// available, pointers are truncated to 33 bits in iOS <14. | 
					
						
							|  |  |  | 	// Furthermore, only the top 4 GiB of the address space are | 
					
						
							|  |  |  | 	// actually available to the application. In iOS >=14, more | 
					
						
							|  |  |  | 	// of the address space is available, and the OS can now | 
					
						
							|  |  |  | 	// provide addresses outside of those 33 bits. Pick 40 bits | 
					
						
							|  |  |  | 	// as a reasonable balance between address space usage by the | 
					
						
							|  |  |  | 	// page allocator, and flexibility for what mmap'd regions | 
					
						
							|  |  |  | 	// we'll accept for the heap. We can't just move to the full | 
					
						
							|  |  |  | 	// 48 bits because this uses too much address space for older | 
					
						
							|  |  |  | 	// iOS versions. | 
					
						
							|  |  |  | 	// TODO(mknyszek): Once iOS <14 is deprecated, promote ios/arm64 | 
					
						
							|  |  |  | 	// to a 48-bit address space like every other arm64 platform. | 
					
						
							| 
									
										
										
										
											2019-11-06 23:56:03 +00:00
										 |  |  | 	// | 
					
						
							| 
									
										
										
										
											2018-03-31 23:14:17 +02:00
										 |  |  | 	// WebAssembly currently has a limit of 4GB linear memory. | 
					
						
							| 
									
										
										
										
											2021-08-23 17:27:40 +00:00
										 |  |  | 	heapAddrBits = (_64bit*(1-goarch.IsWasm)*(1-goos.IsIos*goarch.IsArm64))*48 + (1-_64bit+goarch.IsWasm)*(32-(goarch.IsMips+goarch.IsMipsle)) + 40*goos.IsIos*goarch.IsArm64 | 
					
						
							| 
									
										
										
										
											2018-01-01 21:51:47 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	// maxAlloc is the maximum size of an allocation. On 64-bit, | 
					
						
							| 
									
										
										
										
											2018-02-20 11:59:02 -05:00
										 |  |  | 	// it's theoretically possible to allocate 1<<heapAddrBits bytes. On | 
					
						
							|  |  |  | 	// 32-bit, however, this is one less than 1<<32 because the | 
					
						
							| 
									
										
										
										
											2018-01-01 21:51:47 -05:00
										 |  |  | 	// number of bytes in the address space doesn't actually fit | 
					
						
							|  |  |  | 	// in a uintptr. | 
					
						
							| 
									
										
										
										
											2018-02-20 11:59:02 -05:00
										 |  |  | 	maxAlloc = (1 << heapAddrBits) - (1-_64bit)*1 | 
					
						
							| 
									
										
										
										
											2017-12-08 22:57:53 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: reduce arena size to 4MB on 64-bit Windows
Currently, we use 64MB heap arenas on 64-bit platforms. This works
well on UNIX-like OSes because they treat untouched pages as
essentially free. However, on Windows, committed memory is charged
against a process whether or not it has demand-faulted physical pages
in. Hence, on Windows, even a process with a tiny heap will commit
64MB for one heap arena, plus another 32MB for the arena map. Things
are much worse under the race detector, which increases the heap
commitment by a factor of 5.5X, leading to 384MB of committed memory
at runtime init.
Fix this by reducing the heap arena size to 4MB on Windows.
To counterbalance the effect of increasing the arena map size by a
factor of 16, and to further reduce the impact of the commitment for
the arena map, we switch from a single entry L1 arena map to a 64
entry L1 arena map.
Compared to the original arena design, this slows down the
x/benchmarks garbage benchmark by 0.49% (the slow down of this commit
alone is 1.59%, but the previous commit bought us a 1% speed-up):
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.29ms ± 1%  +0.49%  (p=0.000 n=17+18)
(https://perf.golang.org/search?q=upload:20180223.1)
(This was measured on linux/amd64 by modifying its arena configuration
as above.)
Fixes #23900.
Change-Id: I6b7fa5ecebee2947bf20cfeb78c248809469c6b1
Reviewed-on: https://go-review.googlesource.com/96780
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-23 12:03:00 -05:00
										 |  |  | 	// The number of bits in a heap address, the size of heap | 
					
						
							|  |  |  | 	// arenas, and the L1 and L2 arena map sizes are related by | 
					
						
							|  |  |  | 	// | 
					
						
							| 
									
										
										
										
											2018-10-04 10:46:20 -04:00
										 |  |  | 	//   (1 << addr bits) = arena size * L1 entries * L2 entries | 
					
						
							| 
									
										
											  
											
												runtime: reduce arena size to 4MB on 64-bit Windows
Currently, we use 64MB heap arenas on 64-bit platforms. This works
well on UNIX-like OSes because they treat untouched pages as
essentially free. However, on Windows, committed memory is charged
against a process whether or not it has demand-faulted physical pages
in. Hence, on Windows, even a process with a tiny heap will commit
64MB for one heap arena, plus another 32MB for the arena map. Things
are much worse under the race detector, which increases the heap
commitment by a factor of 5.5X, leading to 384MB of committed memory
at runtime init.
Fix this by reducing the heap arena size to 4MB on Windows.
To counterbalance the effect of increasing the arena map size by a
factor of 16, and to further reduce the impact of the commitment for
the arena map, we switch from a single entry L1 arena map to a 64
entry L1 arena map.
Compared to the original arena design, this slows down the
x/benchmarks garbage benchmark by 0.49% (the slow down of this commit
alone is 1.59%, but the previous commit bought us a 1% speed-up):
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.29ms ± 1%  +0.49%  (p=0.000 n=17+18)
(https://perf.golang.org/search?q=upload:20180223.1)
(This was measured on linux/amd64 by modifying its arena configuration
as above.)
Fixes #23900.
Change-Id: I6b7fa5ecebee2947bf20cfeb78c248809469c6b1
Reviewed-on: https://go-review.googlesource.com/96780
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-23 12:03:00 -05:00
										 |  |  | 	// | 
					
						
							|  |  |  | 	// Currently, we balance these as follows: | 
					
						
							|  |  |  | 	// | 
					
						
							| 
									
										
										
										
											2018-10-04 10:46:20 -04:00
										 |  |  | 	//       Platform  Addr bits  Arena size  L1 entries   L2 entries | 
					
						
							|  |  |  | 	// --------------  ---------  ----------  ----------  ----------- | 
					
						
							|  |  |  | 	//       */64-bit         48        64MB           1    4M (32MB) | 
					
						
							|  |  |  | 	// windows/64-bit         48         4MB          64    1M  (8MB) | 
					
						
							| 
									
										
										
										
											2020-11-16 22:03:17 +00:00
										 |  |  | 	//      ios/arm64         33         4MB           1  2048  (8KB) | 
					
						
							| 
									
										
										
										
											2018-10-04 10:46:20 -04:00
										 |  |  | 	//       */32-bit         32         4MB           1  1024  (4KB) | 
					
						
							|  |  |  | 	//     */mips(le)         31         4MB           1   512  (2KB) | 
					
						
							| 
									
										
											  
											
												runtime: reduce arena size to 4MB on 64-bit Windows
Currently, we use 64MB heap arenas on 64-bit platforms. This works
well on UNIX-like OSes because they treat untouched pages as
essentially free. However, on Windows, committed memory is charged
against a process whether or not it has demand-faulted physical pages
in. Hence, on Windows, even a process with a tiny heap will commit
64MB for one heap arena, plus another 32MB for the arena map. Things
are much worse under the race detector, which increases the heap
commitment by a factor of 5.5X, leading to 384MB of committed memory
at runtime init.
Fix this by reducing the heap arena size to 4MB on Windows.
To counterbalance the effect of increasing the arena map size by a
factor of 16, and to further reduce the impact of the commitment for
the arena map, we switch from a single entry L1 arena map to a 64
entry L1 arena map.
Compared to the original arena design, this slows down the
x/benchmarks garbage benchmark by 0.49% (the slow down of this commit
alone is 1.59%, but the previous commit bought us a 1% speed-up):
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.29ms ± 1%  +0.49%  (p=0.000 n=17+18)
(https://perf.golang.org/search?q=upload:20180223.1)
(This was measured on linux/amd64 by modifying its arena configuration
as above.)
Fixes #23900.
Change-Id: I6b7fa5ecebee2947bf20cfeb78c248809469c6b1
Reviewed-on: https://go-review.googlesource.com/96780
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-23 12:03:00 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-12-08 22:57:53 -05:00
										 |  |  | 	// heapArenaBytes is the size of a heap arena. The heap | 
					
						
							|  |  |  | 	// consists of mappings of size heapArenaBytes, aligned to | 
					
						
							|  |  |  | 	// heapArenaBytes. The initial heap mapping is one arena. | 
					
						
							| 
									
										
											  
											
												runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.
2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-22 20:38:09 -05:00
										 |  |  | 	// | 
					
						
							| 
									
										
											  
											
												runtime: reduce arena size to 4MB on 64-bit Windows
Currently, we use 64MB heap arenas on 64-bit platforms. This works
well on UNIX-like OSes because they treat untouched pages as
essentially free. However, on Windows, committed memory is charged
against a process whether or not it has demand-faulted physical pages
in. Hence, on Windows, even a process with a tiny heap will commit
64MB for one heap arena, plus another 32MB for the arena map. Things
are much worse under the race detector, which increases the heap
commitment by a factor of 5.5X, leading to 384MB of committed memory
at runtime init.
Fix this by reducing the heap arena size to 4MB on Windows.
To counterbalance the effect of increasing the arena map size by a
factor of 16, and to further reduce the impact of the commitment for
the arena map, we switch from a single entry L1 arena map to a 64
entry L1 arena map.
Compared to the original arena design, this slows down the
x/benchmarks garbage benchmark by 0.49% (the slow down of this commit
alone is 1.59%, but the previous commit bought us a 1% speed-up):
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.29ms ± 1%  +0.49%  (p=0.000 n=17+18)
(https://perf.golang.org/search?q=upload:20180223.1)
(This was measured on linux/amd64 by modifying its arena configuration
as above.)
Fixes #23900.
Change-Id: I6b7fa5ecebee2947bf20cfeb78c248809469c6b1
Reviewed-on: https://go-review.googlesource.com/96780
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-23 12:03:00 -05:00
										 |  |  | 	// This is currently 64MB on 64-bit non-Windows and 4MB on | 
					
						
							|  |  |  | 	// 32-bit and on Windows. We use smaller arenas on Windows | 
					
						
							|  |  |  | 	// because all committed memory is charged to the process, | 
					
						
							|  |  |  | 	// even if it's not touched. Hence, for processes with small | 
					
						
							|  |  |  | 	// heaps, the mapped arena space needs to be commensurate. | 
					
						
							|  |  |  | 	// This is particularly important with the race detector, | 
					
						
							|  |  |  | 	// since it significantly amplifies the cost of committed | 
					
						
							|  |  |  | 	// memory. | 
					
						
							| 
									
										
											  
											
												runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.
2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-22 20:38:09 -05:00
										 |  |  | 	heapArenaBytes = 1 << logHeapArenaBytes | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-29 13:21:44 -07:00
										 |  |  | 	heapArenaWords = heapArenaBytes / goarch.PtrSize | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.
2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-22 20:38:09 -05:00
										 |  |  | 	// logHeapArenaBytes is log_2 of heapArenaBytes. For clarity, | 
					
						
							|  |  |  | 	// prefer using heapArenaBytes where possible (we need the | 
					
						
							|  |  |  | 	// constant to compute some other constants). | 
					
						
							| 
									
										
										
										
											2021-06-16 21:57:58 +00:00
										 |  |  | 	logHeapArenaBytes = (6+20)*(_64bit*(1-goos.IsWindows)*(1-goarch.IsWasm)*(1-goos.IsIos*goarch.IsArm64)) + (2+20)*(_64bit*goos.IsWindows) + (2+20)*(1-_64bit) + (2+20)*goarch.IsWasm + (2+20)*goos.IsIos*goarch.IsArm64 | 
					
						
							| 
									
										
										
										
											2017-12-08 22:57:53 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-29 13:21:44 -07:00
										 |  |  | 	// heapArenaBitmapWords is the size of each heap arena's bitmap in uintptrs. | 
					
						
							|  |  |  | 	heapArenaBitmapWords = heapArenaWords / (8 * goarch.PtrSize) | 
					
						
							| 
									
										
										
										
											2017-12-08 22:57:53 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-12-13 16:09:02 -05:00
										 |  |  | 	pagesPerArena = heapArenaBytes / pageSize | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.
2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-22 20:38:09 -05:00
										 |  |  | 	// arenaL1Bits is the number of bits of the arena number | 
					
						
							|  |  |  | 	// covered by the first level arena map. | 
					
						
							|  |  |  | 	// | 
					
						
							|  |  |  | 	// This number should be small, since the first level arena | 
					
						
							|  |  |  | 	// map requires PtrSize*(1<<arenaL1Bits) of space in the | 
					
						
							|  |  |  | 	// binary's BSS. It can be zero, in which case the first level | 
					
						
							|  |  |  | 	// index is effectively unused. There is a performance benefit | 
					
						
							|  |  |  | 	// to this, since the generated code can be more efficient, | 
					
						
							|  |  |  | 	// but comes at the cost of having a large L2 mapping. | 
					
						
							| 
									
										
											  
											
												runtime: reduce arena size to 4MB on 64-bit Windows
Currently, we use 64MB heap arenas on 64-bit platforms. This works
well on UNIX-like OSes because they treat untouched pages as
essentially free. However, on Windows, committed memory is charged
against a process whether or not it has demand-faulted physical pages
in. Hence, on Windows, even a process with a tiny heap will commit
64MB for one heap arena, plus another 32MB for the arena map. Things
are much worse under the race detector, which increases the heap
commitment by a factor of 5.5X, leading to 384MB of committed memory
at runtime init.
Fix this by reducing the heap arena size to 4MB on Windows.
To counterbalance the effect of increasing the arena map size by a
factor of 16, and to further reduce the impact of the commitment for
the arena map, we switch from a single entry L1 arena map to a 64
entry L1 arena map.
Compared to the original arena design, this slows down the
x/benchmarks garbage benchmark by 0.49% (the slow down of this commit
alone is 1.59%, but the previous commit bought us a 1% speed-up):
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.29ms ± 1%  +0.49%  (p=0.000 n=17+18)
(https://perf.golang.org/search?q=upload:20180223.1)
(This was measured on linux/amd64 by modifying its arena configuration
as above.)
Fixes #23900.
Change-Id: I6b7fa5ecebee2947bf20cfeb78c248809469c6b1
Reviewed-on: https://go-review.googlesource.com/96780
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-23 12:03:00 -05:00
										 |  |  | 	// | 
					
						
							|  |  |  | 	// We use the L1 map on 64-bit Windows because the arena size | 
					
						
							|  |  |  | 	// is small, but the address space is still 48 bits, and | 
					
						
							|  |  |  | 	// there's a high cost to having a large L2. | 
					
						
							| 
									
										
										
										
											2021-06-16 21:52:24 +00:00
										 |  |  | 	arenaL1Bits = 6 * (_64bit * goos.IsWindows) | 
					
						
							| 
									
										
											  
											
												runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.
2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-22 20:38:09 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	// arenaL2Bits is the number of bits of the arena number | 
					
						
							|  |  |  | 	// covered by the second level arena index. | 
					
						
							|  |  |  | 	// | 
					
						
							|  |  |  | 	// The size of each arena map allocation is proportional to | 
					
						
							|  |  |  | 	// 1<<arenaL2Bits, so it's important that this not be too | 
					
						
							|  |  |  | 	// large. 48 bits leads to 32MB arena index allocations, which | 
					
						
							|  |  |  | 	// is about the practical threshold. | 
					
						
							|  |  |  | 	arenaL2Bits = heapAddrBits - logHeapArenaBytes - arenaL1Bits | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// arenaL1Shift is the number of bits to shift an arena frame | 
					
						
							|  |  |  | 	// number by to compute an index into the first level arena map. | 
					
						
							|  |  |  | 	arenaL1Shift = arenaL2Bits | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// arenaBits is the total bits in a combined arena map index. | 
					
						
							|  |  |  | 	// This is split between the index into the L1 arena map and | 
					
						
							|  |  |  | 	// the L2 arena map. | 
					
						
							|  |  |  | 	arenaBits = arenaL1Bits + arenaL2Bits | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: offset the heap arena index by 2^47 on amd64
On amd64, the virtual address space, when interpreted as signed
values, is [-2^47, 2^47). Currently, we only support heap addresses in
the "positive" half of this, [0, 2^47). This suffices for linux/amd64
and windows/amd64, but solaris/amd64 can map user addresses in the
negative part of this range. Specifically, addresses
0xFFFF8000'00000000 to 0xFFFFFD80'00000000 are part of user space.
This leads to "memory allocated by OS not in usable address space"
panic, since we don't map heap arena index space for these addresses.
Fix this by offsetting addresses when computing arena indexes so that
arena entry 0 corresponds to address -2^47 on amd64. We already map
enough arena space for 2^48 heap addresses on 64-bit (because arm64's
virtual address space is [0, 2^48)), so we don't need to grow any
structures to support this.
A different approach would be to simply mask out the top 16 bits.
However, there are two advantages to the offset approach: 1) invalid
heap addresses continue to naturally map to invalid arena indexes so
we don't need extra checks and 2) it perturbs the mapping of addresses
to arena indexes more, which helps check that we don't accidentally
compute incorrect arena indexes somewhere that happen to be right most
of the time.
Several comments and constant names are now somewhat misleading. We'll
fix that in the next CL. This CL is the core change the arena
indexing.
Fixes #23862.
Change-Id: Idb8e299fded04593a286b01a9582da6ddbac2f9a
Reviewed-on: https://go-review.googlesource.com/95497
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-19 16:10:58 -05:00
										 |  |  | 	// arenaBaseOffset is the pointer value that corresponds to | 
					
						
							| 
									
										
										
										
											2018-02-22 12:35:30 -05:00
										 |  |  | 	// index 0 in the heap arena map. | 
					
						
							| 
									
										
											  
											
												runtime: offset the heap arena index by 2^47 on amd64
On amd64, the virtual address space, when interpreted as signed
values, is [-2^47, 2^47). Currently, we only support heap addresses in
the "positive" half of this, [0, 2^47). This suffices for linux/amd64
and windows/amd64, but solaris/amd64 can map user addresses in the
negative part of this range. Specifically, addresses
0xFFFF8000'00000000 to 0xFFFFFD80'00000000 are part of user space.
This leads to "memory allocated by OS not in usable address space"
panic, since we don't map heap arena index space for these addresses.
Fix this by offsetting addresses when computing arena indexes so that
arena entry 0 corresponds to address -2^47 on amd64. We already map
enough arena space for 2^48 heap addresses on 64-bit (because arm64's
virtual address space is [0, 2^48)), so we don't need to grow any
structures to support this.
A different approach would be to simply mask out the top 16 bits.
However, there are two advantages to the offset approach: 1) invalid
heap addresses continue to naturally map to invalid arena indexes so
we don't need extra checks and 2) it perturbs the mapping of addresses
to arena indexes more, which helps check that we don't accidentally
compute incorrect arena indexes somewhere that happen to be right most
of the time.
Several comments and constant names are now somewhat misleading. We'll
fix that in the next CL. This CL is the core change the arena
indexing.
Fixes #23862.
Change-Id: Idb8e299fded04593a286b01a9582da6ddbac2f9a
Reviewed-on: https://go-review.googlesource.com/95497
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-19 16:10:58 -05:00
										 |  |  | 	// | 
					
						
							|  |  |  | 	// On amd64, the address space is 48 bits, sign extended to 64 | 
					
						
							|  |  |  | 	// bits. This offset lets us handle "negative" addresses (or | 
					
						
							|  |  |  | 	// high addresses if viewed as unsigned). | 
					
						
							|  |  |  | 	// | 
					
						
							| 
									
										
										
										
											2019-11-13 13:46:42 +01:00
										 |  |  | 	// On aix/ppc64, this offset allows to keep the heapAddrBits to | 
					
						
							| 
									
										
										
										
											2021-05-26 15:41:27 +00:00
										 |  |  | 	// 48. Otherwise, it would be 60 in order to handle mmap addresses | 
					
						
							| 
									
										
										
										
											2019-11-13 13:46:42 +01:00
										 |  |  | 	// (in range 0x0a00000000000000 - 0x0afffffffffffff). But in this | 
					
						
							|  |  |  | 	// case, the memory reserved in (s *pageAlloc).init for chunks | 
					
						
							|  |  |  | 	// is causing important slowdowns. | 
					
						
							|  |  |  | 	// | 
					
						
							| 
									
										
											  
											
												runtime: offset the heap arena index by 2^47 on amd64
On amd64, the virtual address space, when interpreted as signed
values, is [-2^47, 2^47). Currently, we only support heap addresses in
the "positive" half of this, [0, 2^47). This suffices for linux/amd64
and windows/amd64, but solaris/amd64 can map user addresses in the
negative part of this range. Specifically, addresses
0xFFFF8000'00000000 to 0xFFFFFD80'00000000 are part of user space.
This leads to "memory allocated by OS not in usable address space"
panic, since we don't map heap arena index space for these addresses.
Fix this by offsetting addresses when computing arena indexes so that
arena entry 0 corresponds to address -2^47 on amd64. We already map
enough arena space for 2^48 heap addresses on 64-bit (because arm64's
virtual address space is [0, 2^48)), so we don't need to grow any
structures to support this.
A different approach would be to simply mask out the top 16 bits.
However, there are two advantages to the offset approach: 1) invalid
heap addresses continue to naturally map to invalid arena indexes so
we don't need extra checks and 2) it perturbs the mapping of addresses
to arena indexes more, which helps check that we don't accidentally
compute incorrect arena indexes somewhere that happen to be right most
of the time.
Several comments and constant names are now somewhat misleading. We'll
fix that in the next CL. This CL is the core change the arena
indexing.
Fixes #23862.
Change-Id: Idb8e299fded04593a286b01a9582da6ddbac2f9a
Reviewed-on: https://go-review.googlesource.com/95497
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-19 16:10:58 -05:00
										 |  |  | 	// On other platforms, the user address space is contiguous | 
					
						
							|  |  |  | 	// and starts at 0, so no offset is necessary. | 
					
						
							| 
									
										
										
										
											2021-06-16 21:57:58 +00:00
										 |  |  | 	arenaBaseOffset = 0xffff800000000000*goarch.IsAmd64 + 0x0a00000000000000*goos.IsAix | 
					
						
							| 
									
										
										
										
											2020-06-26 16:38:40 -07:00
										 |  |  | 	// A typed version of this constant that will make it into DWARF (for viewcore). | 
					
						
							|  |  |  | 	arenaBaseOffsetUintptr = uintptr(arenaBaseOffset) | 
					
						
							| 
									
										
											  
											
												runtime: offset the heap arena index by 2^47 on amd64
On amd64, the virtual address space, when interpreted as signed
values, is [-2^47, 2^47). Currently, we only support heap addresses in
the "positive" half of this, [0, 2^47). This suffices for linux/amd64
and windows/amd64, but solaris/amd64 can map user addresses in the
negative part of this range. Specifically, addresses
0xFFFF8000'00000000 to 0xFFFFFD80'00000000 are part of user space.
This leads to "memory allocated by OS not in usable address space"
panic, since we don't map heap arena index space for these addresses.
Fix this by offsetting addresses when computing arena indexes so that
arena entry 0 corresponds to address -2^47 on amd64. We already map
enough arena space for 2^48 heap addresses on 64-bit (because arm64's
virtual address space is [0, 2^48)), so we don't need to grow any
structures to support this.
A different approach would be to simply mask out the top 16 bits.
However, there are two advantages to the offset approach: 1) invalid
heap addresses continue to naturally map to invalid arena indexes so
we don't need extra checks and 2) it perturbs the mapping of addresses
to arena indexes more, which helps check that we don't accidentally
compute incorrect arena indexes somewhere that happen to be right most
of the time.
Several comments and constant names are now somewhat misleading. We'll
fix that in the next CL. This CL is the core change the arena
indexing.
Fixes #23862.
Change-Id: Idb8e299fded04593a286b01a9582da6ddbac2f9a
Reviewed-on: https://go-review.googlesource.com/95497
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-19 16:10:58 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 	// Max number of threads to run garbage collection. | 
					
						
							|  |  |  | 	// 2, 3, and 4 are all plausible maximums depending | 
					
						
							| 
									
										
										
										
											2016-03-01 23:21:55 +00:00
										 |  |  | 	// on the hardware details of the machine. The garbage | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 	// collector scales well to 32 cpus. | 
					
						
							|  |  |  | 	_MaxGcproc = 32 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-01-06 09:44:41 -05:00
										 |  |  | 	// minLegalPointer is the smallest possible legal pointer. | 
					
						
							|  |  |  | 	// This is the smallest possible architectural page size, | 
					
						
							|  |  |  | 	// since we assume that the first page is never mapped. | 
					
						
							|  |  |  | 	// | 
					
						
							|  |  |  | 	// This should agree with minZeroPage in the compiler. | 
					
						
							|  |  |  | 	minLegalPointer uintptr = 4096 | 
					
						
							| 
									
										
										
										
											2023-01-03 17:59:48 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	// minHeapForMetadataHugePages sets a threshold on when certain kinds of | 
					
						
							|  |  |  | 	// heap metadata, currently the arenas map L2 entries and page alloc bitmap | 
					
						
							|  |  |  | 	// mappings, are allowed to be backed by huge pages. If the heap goal ever | 
					
						
							|  |  |  | 	// exceeds this threshold, then huge pages are enabled. | 
					
						
							|  |  |  | 	// | 
					
						
							|  |  |  | 	// These numbers are chosen with the assumption that huge pages are on the | 
					
						
							|  |  |  | 	// order of a few MiB in size. | 
					
						
							|  |  |  | 	// | 
					
						
							|  |  |  | 	// The kind of metadata this applies to has a very low overhead when compared | 
					
						
							|  |  |  | 	// to address space used, but their constant overheads for small heaps would | 
					
						
							|  |  |  | 	// be very high if they were to be backed by huge pages (e.g. a few MiB makes | 
					
						
							|  |  |  | 	// a huge difference for an 8 MiB heap, but barely any difference for a 1 GiB | 
					
						
							|  |  |  | 	// heap). The benefit of huge pages is also not worth it for small heaps, | 
					
						
							|  |  |  | 	// because only a very, very small part of the metadata is used for small heaps. | 
					
						
							|  |  |  | 	// | 
					
						
							|  |  |  | 	// N.B. If the heap goal exceeds the threshold then shrinks to a very small size | 
					
						
							|  |  |  | 	// again, then huge pages will still be enabled for this mapping. The reason is that | 
					
						
							|  |  |  | 	// there's no point unless we're also returning the physical memory for these | 
					
						
							|  |  |  | 	// metadata mappings back to the OS. That would be quite complex to do in general | 
					
						
							|  |  |  | 	// as the heap is likely fragmented after a reduction in heap size. | 
					
						
							|  |  |  | 	minHeapForMetadataHugePages = 1 << 30 | 
					
						
							| 
									
										
										
										
											2016-10-26 21:25:56 -07:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-07-18 21:40:02 -04:00
										 |  |  | // physPageSize is the size in bytes of the OS's physical pages. | 
					
						
							|  |  |  | // Mapping and unmapping operations must be done at multiples of | 
					
						
							|  |  |  | // physPageSize. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This must be set by the OS init code (typically in osinit) before | 
					
						
							|  |  |  | // mallocinit. | 
					
						
							|  |  |  | var physPageSize uintptr | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-03-29 20:03:29 +00:00
										 |  |  | // physHugePageSize is the size in bytes of the OS's default physical huge | 
					
						
							| 
									
										
										
										
											2019-07-16 20:36:33 +00:00
										 |  |  | // page size whose allocation is opaque to the application. It is assumed | 
					
						
							|  |  |  | // and verified to be a power of two. | 
					
						
							| 
									
										
										
										
											2019-03-29 20:03:29 +00:00
										 |  |  | // | 
					
						
							|  |  |  | // If set, this must be set by the OS init code (typically in osinit) before | 
					
						
							|  |  |  | // mallocinit. However, setting it at all is optional, and leaving the default | 
					
						
							|  |  |  | // value is always safe (though potentially less efficient). | 
					
						
							| 
									
										
										
										
											2019-07-16 20:36:33 +00:00
										 |  |  | // | 
					
						
							|  |  |  | // Since physHugePageSize is always assumed to be a power of two, | 
					
						
							|  |  |  | // physHugePageShift is defined as physHugePageSize == 1 << physHugePageShift. | 
					
						
							|  |  |  | // The purpose of physHugePageShift is to avoid doing divisions in | 
					
						
							|  |  |  | // performance critical functions. | 
					
						
							|  |  |  | var ( | 
					
						
							|  |  |  | 	physHugePageSize  uintptr | 
					
						
							|  |  |  | 	physHugePageShift uint | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2019-03-29 20:03:29 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | func mallocinit() { | 
					
						
							|  |  |  | 	if class_to_size[_TinySizeClass] != _TinySize { | 
					
						
							|  |  |  | 		throw("bad TinySizeClass") | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-04-29 13:21:44 -07:00
										 |  |  | 	if heapArenaBitmapWords&(heapArenaBitmapWords-1) != 0 { | 
					
						
							| 
									
										
										
										
											2017-12-08 22:57:53 -05:00
										 |  |  | 		// heapBits expects modular arithmetic on bitmap | 
					
						
							|  |  |  | 		// addresses to work. | 
					
						
							| 
									
										
										
										
											2022-04-29 13:21:44 -07:00
										 |  |  | 		throw("heapArenaBitmapWords not a power of 2") | 
					
						
							| 
									
										
										
										
											2017-12-08 22:57:53 -05:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-07-18 21:40:02 -04:00
										 |  |  | 	// Check physPageSize. | 
					
						
							|  |  |  | 	if physPageSize == 0 { | 
					
						
							|  |  |  | 		// The OS init code failed to fetch the physical page size. | 
					
						
							|  |  |  | 		throw("failed to get system page size") | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2019-11-07 21:14:37 +00:00
										 |  |  | 	if physPageSize > maxPhysPageSize { | 
					
						
							|  |  |  | 		print("system page size (", physPageSize, ") is larger than maximum page size (", maxPhysPageSize, ")\n") | 
					
						
							|  |  |  | 		throw("bad system page size") | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2016-07-18 12:24:02 -04:00
										 |  |  | 	if physPageSize < minPhysPageSize { | 
					
						
							|  |  |  | 		print("system page size (", physPageSize, ") is smaller than minimum page size (", minPhysPageSize, ")\n") | 
					
						
							|  |  |  | 		throw("bad system page size") | 
					
						
							| 
									
										
										
										
											2016-07-18 21:40:02 -04:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2016-07-18 12:24:02 -04:00
										 |  |  | 	if physPageSize&(physPageSize-1) != 0 { | 
					
						
							|  |  |  | 		print("system page size (", physPageSize, ") must be a power of 2\n") | 
					
						
							|  |  |  | 		throw("bad system page size") | 
					
						
							| 
									
										
										
										
											2016-07-18 21:40:02 -04:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2019-07-16 20:36:33 +00:00
										 |  |  | 	if physHugePageSize&(physHugePageSize-1) != 0 { | 
					
						
							|  |  |  | 		print("system huge page size (", physHugePageSize, ") must be a power of 2\n") | 
					
						
							|  |  |  | 		throw("bad system huge page size") | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2019-11-07 21:14:37 +00:00
										 |  |  | 	if physHugePageSize > maxPhysHugePageSize { | 
					
						
							|  |  |  | 		// physHugePageSize is greater than the maximum supported huge page size. | 
					
						
							|  |  |  | 		// Don't throw here, like in the other cases, since a system configured | 
					
						
							|  |  |  | 		// in this way isn't wrong, we just don't have the code to support them. | 
					
						
							|  |  |  | 		// Instead, silently set the huge page size to zero. | 
					
						
							|  |  |  | 		physHugePageSize = 0 | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2019-07-16 20:36:33 +00:00
										 |  |  | 	if physHugePageSize != 0 { | 
					
						
							|  |  |  | 		// Since physHugePageSize is a power of 2, it suffices to increase | 
					
						
							|  |  |  | 		// physHugePageShift until 1<<physHugePageShift == physHugePageSize. | 
					
						
							|  |  |  | 		for 1<<physHugePageShift != physHugePageSize { | 
					
						
							|  |  |  | 			physHugePageShift++ | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
											  
											
												runtime: add bitmap-based markrootSpans implementation
Currently markrootSpans, the scanning routine which scans span specials
(particularly finalizers) as roots, uses sweepSpans to shard work and
find spans to mark.
However, as part of a future CL to change span ownership and how
mcentral works, we want to avoid having markrootSpans use the sweep bufs
to find specials, so in this change we introduce a new mechanism.
Much like for the page reclaimer, we set up a per-page bitmap where the
first page for a span is marked if the span contains any specials, and
unmarked if it has no specials. This bitmap is updated by addspecial,
removespecial, and during sweeping.
markrootSpans then shards this bitmap into mark work and markers iterate
over the bitmap looking for spans with specials to mark. Unlike the page
reclaimer, we don't need to use the pageInUse bits because having a
special implies that a span is in-use.
While in terms of computational complexity this design is technically
worse, because it needs to iterate over the mapped heap, in practice
this iteration is very fast (we can skip over large swathes of the heap
very quickly) and we only look at spans that have any specials at all,
rather than having to touch each span.
This new implementation of markrootSpans is behind a feature flag called
go115NewMarkrootSpans.
Updates #37487.
Change-Id: I8ea07b6c11059f6d412fe419e0ab512d989377b8
Reviewed-on: https://go-review.googlesource.com/c/go/+/221178
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
											
										 
											2020-02-20 20:03:39 +00:00
										 |  |  | 	if pagesPerArena%pagesPerSpanRoot != 0 { | 
					
						
							|  |  |  | 		print("pagesPerArena (", pagesPerArena, ") is not divisible by pagesPerSpanRoot (", pagesPerSpanRoot, ")\n") | 
					
						
							|  |  |  | 		throw("bad pagesPerSpanRoot") | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	if pagesPerArena%pagesPerReclaimerChunk != 0 { | 
					
						
							|  |  |  | 		print("pagesPerArena (", pagesPerArena, ") is not divisible by pagesPerReclaimerChunk (", pagesPerReclaimerChunk, ")\n") | 
					
						
							|  |  |  | 		throw("bad pagesPerReclaimerChunk") | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
											  
											
												runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size
classes with allocation headers for objects that contain pointers. The
header consists of a single pointer to a type. All allocations with
headers are treated as implicitly containing one or more instances of
the type in the header.
As the name implies, headers are usually stored as the first word of an
object. There are two additional exceptions to where headers are stored
and how they're used.
Objects smaller than 512 bytes do not have headers. Instead, a heap
bitmap is reserved at the end of spans for objects of this size. A full
word of overhead is too much for these small objects. The bitmap is of
the same format of the old bitmap, minus the noMorePtrs bits which are
unnecessary. All the objects <512 bytes have a bitmap less than a
pointer-word in size, and that was the granularity at which noMorePtrs
could stop scanning early anyway.
Objects that are larger than 32 KiB (which have their own span) have
their headers stored directly in the span, to allow power-of-two-sized
allocations to not spill over into an extra page.
The full implementation is behind GOEXPERIMENT=allocheaders.
The purpose of this change is performance. First and foremost, with
headers we no longer have to unroll pointer/scalar data at allocation
time for most size classes. Small size classes still need some
unrolling, but their bitmaps are small so we can optimize that case
fairly well. Larger objects effectively have their pointer/scalar data
unrolled on-demand from type data, which is much more compactly
represented and results in less TLB pressure. Furthermore, since the
headers are usually right next to the object and where we're about to
start scanning, we get an additional temporal locality benefit in the
data cache when looking up type metadata. The pointer/scalar data is
now effectively unrolled on-demand, but it's also simpler to unroll than
before; that unrolled data is never written anywhere, and for arrays we
get the benefit of retreading the same data per element, as opposed to
looking it up from scratch for each pointer-word of bitmap. Lastly,
because we no longer have a heap bitmap that spans the entire heap,
there's a flat 1.5% memory use reduction. This is balanced slightly by
some objects possibly being bumped up a size class, but most objects are
not tightly optimized to size class sizes so there's some memory to
spare, making the header basically free in those cases.
See the follow-up CL which turns on this experiment by default for
benchmark results. (CL 538217.)
Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
											
										 
											2022-09-11 04:07:41 +00:00
										 |  |  | 	if goexperiment.AllocHeaders { | 
					
						
							|  |  |  | 		// Check that the minimum size (exclusive) for a malloc header is also | 
					
						
							|  |  |  | 		// a size class boundary. This is important to making sure checks align | 
					
						
							|  |  |  | 		// across different parts of the runtime. | 
					
						
							|  |  |  | 		minSizeForMallocHeaderIsSizeClass := false | 
					
						
							|  |  |  | 		for i := 0; i < len(class_to_size); i++ { | 
					
						
							|  |  |  | 			if minSizeForMallocHeader == uintptr(class_to_size[i]) { | 
					
						
							|  |  |  | 				minSizeForMallocHeaderIsSizeClass = true | 
					
						
							|  |  |  | 				break | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		if !minSizeForMallocHeaderIsSizeClass { | 
					
						
							|  |  |  | 			throw("min size of malloc header is not a size class boundary") | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		// Check that the pointer bitmap for all small sizes without a malloc header | 
					
						
							|  |  |  | 		// fits in a word. | 
					
						
							|  |  |  | 		if minSizeForMallocHeader/goarch.PtrSize > 8*goarch.PtrSize { | 
					
						
							|  |  |  | 			throw("max pointer/scan bitmap size for headerless objects is too large") | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2016-07-18 21:40:02 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-04-14 14:36:12 -07:00
										 |  |  | 	if minTagBits > taggedPointerBits { | 
					
						
							|  |  |  | 		throw("taggedPointerbits too small") | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 	// Initialize the heap. | 
					
						
							|  |  |  | 	mheap_.init() | 
					
						
							| 
									
										
										
										
											2019-11-04 14:25:22 -08:00
										 |  |  | 	mcache0 = allocmcache() | 
					
						
							| 
									
										
											  
											
												runtime:  static lock ranking for the runtime (enabled by GOEXPERIMENT)
I took some of the infrastructure from Austin's lock logging CR
https://go-review.googlesource.com/c/go/+/192704 (with deadlock
detection from the logs), and developed a setup to give static lock
ranking for runtime locks.
Static lock ranking establishes a documented total ordering among locks,
and then reports an error if the total order is violated. This can
happen if a deadlock happens (by acquiring a sequence of locks in
different orders), or if just one side of a possible deadlock happens.
Lock ordering deadlocks cannot happen as long as the lock ordering is
followed.
Along the way, I found a deadlock involving the new timer code, which Ian fixed
via https://go-review.googlesource.com/c/go/+/207348, as well as two other
potential deadlocks.
See the constants at the top of runtime/lockrank.go to show the static
lock ranking that I ended up with, along with some comments. This is
great documentation of the current intended lock ordering when acquiring
multiple locks in the runtime.
I also added an array lockPartialOrder[] which shows and enforces the
current partial ordering among locks (which is embedded within the total
ordering). This is more specific about the dependencies among locks.
I don't try to check the ranking within a lock class with multiple locks
that can be acquired at the same time (i.e. check the ranking when
multiple hchan locks are acquired).
Currently, I am doing a lockInit() call to set the lock rank of most
locks. Any lock that is not otherwise initialized is assumed to be a
leaf lock (a very high rank lock), so that eliminates the need to do
anything for a bunch of locks (including all architecture-dependent
locks). For two locks, root.lock and notifyList.lock (only in the
runtime/sema.go file), it is not as easy to do lock initialization, so
instead, I am passing the lock rank with the lock calls.
For Windows compilation, I needed to increase the StackGuard size from
896 to 928 because of the new lock-rank checking functions.
Checking of the static lock ranking is enabled by setting
GOEXPERIMENT=staticlockranking before doing a run.
To make sure that the static lock ranking code has no overhead in memory
or CPU when not enabled by GOEXPERIMENT, I changed 'go build/install' so
that it defines a build tag (with the same name) whenever any experiment
has been baked into the toolchain (by checking Expstring()). This allows
me to avoid increasing the size of the 'mutex' type when static lock
ranking is not enabled.
Fixes #38029
Change-Id: I154217ff307c47051f8dae9c2a03b53081acd83a
Reviewed-on: https://go-review.googlesource.com/c/go/+/207619
Reviewed-by: Dan Scales <danscales@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Dan Scales <danscales@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
											
										 
											2019-11-13 17:34:47 -08:00
										 |  |  | 	lockInit(&gcBitsArenas.lock, lockRankGcBitsArenas) | 
					
						
							| 
									
										
											  
											
												runtime: split mprof locks
The profiles for memory allocations, sync.Mutex contention, and general
blocking store their data in a shared hash table. The bookkeeping work
at the end of a garbage collection cycle involves maintenance on each
memory allocation record. Previously, a single lock guarded access to
the hash table and the contents of all records. When a program has
allocated memory at a large number of unique call stacks, the
maintenance following every garbage collection can hold that lock for
several milliseconds. That can prevent progress on all other goroutines
by delaying acquirep's call to mcache.prepareForSweep, which needs the
lock in mProf_Free to report when a profiled allocation is no longer in
use. With no user goroutines making progress, it is in effect a
multi-millisecond GC-related stop-the-world pause.
Split the lock so the call to mProf_Flush no longer delays each P's call
to mProf_Free: mProf_Free uses a lock on the memory records' N+1 cycle,
and mProf_Flush uses locks on the memory records' accumulator and their
N cycle. mProf_Malloc also no longer competes with mProf_Flush, as it
uses a lock on the memory records' N+2 cycle. The profiles for
sync.Mutex contention and general blocking now share a separate lock,
and another lock guards insertions to the shared hash table (uncommon in
the steady-state). Consumers of each type of profile take the matching
accumulator lock, so will observe consistent count and magnitude values
for each record.
For #45894
Change-Id: I615ff80618d10e71025423daa64b0b7f9dc57daa
Reviewed-on: https://go-review.googlesource.com/c/go/+/399956
Reviewed-by: Carlos Amedee <carlos@golang.org>
Run-TryBot: Rhys Hiltner <rhys@justin.tv>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
											
										 
											2022-04-01 12:56:49 -07:00
										 |  |  | 	lockInit(&profInsertLock, lockRankProfInsert) | 
					
						
							|  |  |  | 	lockInit(&profBlockLock, lockRankProfBlock) | 
					
						
							|  |  |  | 	lockInit(&profMemActiveLock, lockRankProfMemActive) | 
					
						
							|  |  |  | 	for i := range profMemFutureLock { | 
					
						
							|  |  |  | 		lockInit(&profMemFutureLock[i], lockRankProfMemFuture) | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
											  
											
												runtime:  static lock ranking for the runtime (enabled by GOEXPERIMENT)
I took some of the infrastructure from Austin's lock logging CR
https://go-review.googlesource.com/c/go/+/192704 (with deadlock
detection from the logs), and developed a setup to give static lock
ranking for runtime locks.
Static lock ranking establishes a documented total ordering among locks,
and then reports an error if the total order is violated. This can
happen if a deadlock happens (by acquiring a sequence of locks in
different orders), or if just one side of a possible deadlock happens.
Lock ordering deadlocks cannot happen as long as the lock ordering is
followed.
Along the way, I found a deadlock involving the new timer code, which Ian fixed
via https://go-review.googlesource.com/c/go/+/207348, as well as two other
potential deadlocks.
See the constants at the top of runtime/lockrank.go to show the static
lock ranking that I ended up with, along with some comments. This is
great documentation of the current intended lock ordering when acquiring
multiple locks in the runtime.
I also added an array lockPartialOrder[] which shows and enforces the
current partial ordering among locks (which is embedded within the total
ordering). This is more specific about the dependencies among locks.
I don't try to check the ranking within a lock class with multiple locks
that can be acquired at the same time (i.e. check the ranking when
multiple hchan locks are acquired).
Currently, I am doing a lockInit() call to set the lock rank of most
locks. Any lock that is not otherwise initialized is assumed to be a
leaf lock (a very high rank lock), so that eliminates the need to do
anything for a bunch of locks (including all architecture-dependent
locks). For two locks, root.lock and notifyList.lock (only in the
runtime/sema.go file), it is not as easy to do lock initialization, so
instead, I am passing the lock rank with the lock calls.
For Windows compilation, I needed to increase the StackGuard size from
896 to 928 because of the new lock-rank checking functions.
Checking of the static lock ranking is enabled by setting
GOEXPERIMENT=staticlockranking before doing a run.
To make sure that the static lock ranking code has no overhead in memory
or CPU when not enabled by GOEXPERIMENT, I changed 'go build/install' so
that it defines a build tag (with the same name) whenever any experiment
has been baked into the toolchain (by checking Expstring()). This allows
me to avoid increasing the size of the 'mutex' type when static lock
ranking is not enabled.
Fixes #38029
Change-Id: I154217ff307c47051f8dae9c2a03b53081acd83a
Reviewed-on: https://go-review.googlesource.com/c/go/+/207619
Reviewed-by: Dan Scales <danscales@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Dan Scales <danscales@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
											
										 
											2019-11-13 17:34:47 -08:00
										 |  |  | 	lockInit(&globalAlloc.mutex, lockRankGlobalAlloc) | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 	// Create initial arena growth hints. | 
					
						
							| 
									
										
										
										
											2021-06-16 23:05:44 +00:00
										 |  |  | 	if goarch.PtrSize == 8 { | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		// On a 64-bit machine, we pick the following hints | 
					
						
							|  |  |  | 		// because: | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 		// | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		// 1. Starting from the middle of the address space | 
					
						
							|  |  |  | 		// makes it easier to grow out a contiguous range | 
					
						
							|  |  |  | 		// without running in to some other mapping. | 
					
						
							|  |  |  | 		// | 
					
						
							|  |  |  | 		// 2. This makes Go heap addresses more easily | 
					
						
							|  |  |  | 		// recognizable when debugging. | 
					
						
							|  |  |  | 		// | 
					
						
							|  |  |  | 		// 3. Stack scanning in gccgo is still conservative, | 
					
						
							|  |  |  | 		// so it's important that addresses be distinguishable | 
					
						
							|  |  |  | 		// from other data. | 
					
						
							|  |  |  | 		// | 
					
						
							|  |  |  | 		// Starting at 0x00c0 means that the valid memory addresses | 
					
						
							|  |  |  | 		// will begin 0x00c0, 0x00c1, ... | 
					
						
							|  |  |  | 		// In little-endian, that's c0 00, c1 00, ... None of those are valid | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 		// UTF-8 sequences, and they are otherwise as far away from | 
					
						
							| 
									
										
										
										
											2016-03-01 23:21:55 +00:00
										 |  |  | 		// ff (likely a common byte) as possible. If that fails, we try other 0xXXc0 | 
					
						
							|  |  |  | 		// addresses. An earlier attempt to use 0x11f8 caused out of memory errors | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 		// on OS X during thread allocations.  0x00c0 causes conflicts with | 
					
						
							|  |  |  | 		// AddressSanitizer which reserves all memory up to 0x0100. | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		// These choices reduce the odds of a conservative garbage collector | 
					
						
							| 
									
										
										
										
											2015-06-08 00:14:08 -04:00
										 |  |  | 		// not collecting memory because some non-pointer block of memory | 
					
						
							|  |  |  | 		// had a bit pattern that matched a memory address. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 		// | 
					
						
							| 
									
										
										
										
											2015-03-08 14:20:20 +01:00
										 |  |  | 		// However, on arm64, we ignore all this advice above and slam the | 
					
						
							|  |  |  | 		// allocation at 0x40 << 32 because when using 4k pages with 3-level | 
					
						
							|  |  |  | 		// translation buffers, the user address space is limited to 39 bits | 
					
						
							| 
									
										
										
										
											2020-09-17 10:53:10 -04:00
										 |  |  | 		// On ios/arm64, the address space is even smaller. | 
					
						
							| 
									
										
										
										
											2019-11-04 04:58:37 +11:00
										 |  |  | 		// | 
					
						
							| 
									
										
										
										
											2018-10-01 09:58:40 +02:00
										 |  |  | 		// On AIX, mmaps starts at 0x0A00000000000000 for 64-bit. | 
					
						
							|  |  |  | 		// processes. | 
					
						
							| 
									
										
											  
											
												runtime: add safe arena support to the runtime
This change adds an API to the runtime for arenas. A later CL can
potentially export it as an experimental API, but for now, just the
runtime implementation will suffice.
The purpose of arenas is to improve efficiency, primarily by allowing
for an application to manually free memory, thereby delaying garbage
collection. It comes with other potential performance benefits, such as
better locality, a better allocation strategy, and better handling of
interior pointers by the GC.
This implementation is based on one by danscales@google.com with a few
significant differences:
* The implementation lives entirely in the runtime (all layers).
* Arena chunks are the minimum of 8 MiB or the heap arena size. This
  choice is made because in practice 64 MiB appears to be way too large
  of an area for most real-world use-cases.
* Arena chunks are not unmapped, instead they're placed on an evacuation
  list and when there are no pointers left pointing into them, they're
  allowed to be reused.
* Reusing partially-used arena chunks no longer tries to find one used
  by the same P first; it just takes the first one available.
* In order to ensure worst-case fragmentation is never worse than 25%,
  only types and slice backing stores whose sizes are 1/4th the size of
  a chunk or less may be used. Previously larger sizes, up to the size
  of the chunk, were allowed.
* ASAN, MSAN, and the race detector are fully supported.
* Sets arena chunks to fault that were deferred at the end of mark
  termination (a non-public patch once did this; I don't see a reason
  not to continue that).
For #51317.
Change-Id: I83b1693a17302554cb36b6daa4e9249a81b1644f
Reviewed-on: https://go-review.googlesource.com/c/go/+/423359
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
											
										 
											2022-08-12 21:40:46 +00:00
										 |  |  | 		// | 
					
						
							|  |  |  | 		// Space mapped for user arenas comes immediately after the range | 
					
						
							|  |  |  | 		// originally reserved for the regular heap when race mode is not | 
					
						
							|  |  |  | 		// enabled because user arena chunks can never be used for regular heap | 
					
						
							|  |  |  | 		// allocations and we want to avoid fragmenting the address space. | 
					
						
							|  |  |  | 		// | 
					
						
							|  |  |  | 		// In race mode we have no choice but to just use the same hints because | 
					
						
							|  |  |  | 		// the race detector requires that the heap be mapped contiguously. | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		for i := 0x7f; i >= 0; i-- { | 
					
						
							|  |  |  | 			var p uintptr | 
					
						
							| 
									
										
										
										
											2015-04-10 22:14:43 -04:00
										 |  |  | 			switch { | 
					
						
							| 
									
										
										
										
											2020-10-29 15:50:53 -04:00
										 |  |  | 			case raceenabled: | 
					
						
							|  |  |  | 				// The TSAN runtime requires the heap | 
					
						
							|  |  |  | 				// to be in the range [0x00c000000000, | 
					
						
							|  |  |  | 				// 0x00e000000000). | 
					
						
							|  |  |  | 				p = uintptr(i)<<32 | uintptrMask&(0x00c0<<32) | 
					
						
							|  |  |  | 				if p >= uintptrMask&0x00e000000000 { | 
					
						
							|  |  |  | 					continue | 
					
						
							|  |  |  | 				} | 
					
						
							| 
									
										
										
										
											2020-09-17 10:53:10 -04:00
										 |  |  | 			case GOARCH == "arm64" && GOOS == "ios": | 
					
						
							| 
									
										
										
										
											2015-04-10 22:14:43 -04:00
										 |  |  | 				p = uintptr(i)<<40 | uintptrMask&(0x0013<<28) | 
					
						
							|  |  |  | 			case GOARCH == "arm64": | 
					
						
							| 
									
										
										
										
											2015-03-08 14:20:20 +01:00
										 |  |  | 				p = uintptr(i)<<40 | uintptrMask&(0x0040<<32) | 
					
						
							| 
									
										
										
										
											2018-10-01 09:58:40 +02:00
										 |  |  | 			case GOOS == "aix": | 
					
						
							|  |  |  | 				if i == 0 { | 
					
						
							|  |  |  | 					// We don't use addresses directly after 0x0A00000000000000 | 
					
						
							|  |  |  | 					// to avoid collisions with others mmaps done by non-go programs. | 
					
						
							|  |  |  | 					continue | 
					
						
							|  |  |  | 				} | 
					
						
							|  |  |  | 				p = uintptr(i)<<40 | uintptrMask&(0xa0<<52) | 
					
						
							| 
									
										
										
										
											2015-04-10 22:14:43 -04:00
										 |  |  | 			default: | 
					
						
							| 
									
										
										
										
											2015-03-08 14:20:20 +01:00
										 |  |  | 				p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32) | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
											  
											
												runtime: add safe arena support to the runtime
This change adds an API to the runtime for arenas. A later CL can
potentially export it as an experimental API, but for now, just the
runtime implementation will suffice.
The purpose of arenas is to improve efficiency, primarily by allowing
for an application to manually free memory, thereby delaying garbage
collection. It comes with other potential performance benefits, such as
better locality, a better allocation strategy, and better handling of
interior pointers by the GC.
This implementation is based on one by danscales@google.com with a few
significant differences:
* The implementation lives entirely in the runtime (all layers).
* Arena chunks are the minimum of 8 MiB or the heap arena size. This
  choice is made because in practice 64 MiB appears to be way too large
  of an area for most real-world use-cases.
* Arena chunks are not unmapped, instead they're placed on an evacuation
  list and when there are no pointers left pointing into them, they're
  allowed to be reused.
* Reusing partially-used arena chunks no longer tries to find one used
  by the same P first; it just takes the first one available.
* In order to ensure worst-case fragmentation is never worse than 25%,
  only types and slice backing stores whose sizes are 1/4th the size of
  a chunk or less may be used. Previously larger sizes, up to the size
  of the chunk, were allowed.
* ASAN, MSAN, and the race detector are fully supported.
* Sets arena chunks to fault that were deferred at the end of mark
  termination (a non-public patch once did this; I don't see a reason
  not to continue that).
For #51317.
Change-Id: I83b1693a17302554cb36b6daa4e9249a81b1644f
Reviewed-on: https://go-review.googlesource.com/c/go/+/423359
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
											
										 
											2022-08-12 21:40:46 +00:00
										 |  |  | 			// Switch to generating hints for user arenas if we've gone | 
					
						
							|  |  |  | 			// through about half the hints. In race mode, take only about | 
					
						
							|  |  |  | 			// a quarter; we don't have very much space to work with. | 
					
						
							|  |  |  | 			hintList := &mheap_.arenaHints | 
					
						
							|  |  |  | 			if (!raceenabled && i > 0x3f) || (raceenabled && i > 0x5f) { | 
					
						
							|  |  |  | 				hintList = &mheap_.userArena.arenaHints | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 			hint := (*arenaHint)(mheap_.arenaHintAlloc.alloc()) | 
					
						
							|  |  |  | 			hint.addr = p | 
					
						
							| 
									
										
											  
											
												runtime: add safe arena support to the runtime
This change adds an API to the runtime for arenas. A later CL can
potentially export it as an experimental API, but for now, just the
runtime implementation will suffice.
The purpose of arenas is to improve efficiency, primarily by allowing
for an application to manually free memory, thereby delaying garbage
collection. It comes with other potential performance benefits, such as
better locality, a better allocation strategy, and better handling of
interior pointers by the GC.
This implementation is based on one by danscales@google.com with a few
significant differences:
* The implementation lives entirely in the runtime (all layers).
* Arena chunks are the minimum of 8 MiB or the heap arena size. This
  choice is made because in practice 64 MiB appears to be way too large
  of an area for most real-world use-cases.
* Arena chunks are not unmapped, instead they're placed on an evacuation
  list and when there are no pointers left pointing into them, they're
  allowed to be reused.
* Reusing partially-used arena chunks no longer tries to find one used
  by the same P first; it just takes the first one available.
* In order to ensure worst-case fragmentation is never worse than 25%,
  only types and slice backing stores whose sizes are 1/4th the size of
  a chunk or less may be used. Previously larger sizes, up to the size
  of the chunk, were allowed.
* ASAN, MSAN, and the race detector are fully supported.
* Sets arena chunks to fault that were deferred at the end of mark
  termination (a non-public patch once did this; I don't see a reason
  not to continue that).
For #51317.
Change-Id: I83b1693a17302554cb36b6daa4e9249a81b1644f
Reviewed-on: https://go-review.googlesource.com/c/go/+/423359
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
											
										 
											2022-08-12 21:40:46 +00:00
										 |  |  | 			hint.next, *hintList = *hintList, hint | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		} | 
					
						
							|  |  |  | 	} else { | 
					
						
							|  |  |  | 		// On a 32-bit machine, we're much more concerned | 
					
						
							|  |  |  | 		// about keeping the usable heap contiguous. | 
					
						
							|  |  |  | 		// Hence: | 
					
						
							|  |  |  | 		// | 
					
						
							|  |  |  | 		// 1. We reserve space for all heapArenas up front so | 
					
						
							|  |  |  | 		// they don't get interleaved with the heap. They're | 
					
						
							|  |  |  | 		// ~258MB, so this isn't too bad. (We could reserve a | 
					
						
							|  |  |  | 		// smaller amount of space up front if this is a | 
					
						
							|  |  |  | 		// problem.) | 
					
						
							|  |  |  | 		// | 
					
						
							|  |  |  | 		// 2. We hint the heap to start right above the end of | 
					
						
							|  |  |  | 		// the binary so we have the best chance of keeping it | 
					
						
							|  |  |  | 		// contiguous. | 
					
						
							|  |  |  | 		// | 
					
						
							|  |  |  | 		// 3. We try to stake out a reasonably large initial | 
					
						
							|  |  |  | 		// heap reservation. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-10-01 09:58:40 +02:00
										 |  |  | 		const arenaMetaSize = (1 << arenaBits) * unsafe.Sizeof(heapArena{}) | 
					
						
							| 
									
										
											  
											
												runtime: remove non-reserved heap logic
Currently large sysReserve calls on some OSes don't actually reserve
the memory, but just check that it can be reserved. This was important
when we called sysReserve to "reserve" many gigabytes for the heap up
front, but now that we map memory in small increments as we need it,
this complication is no longer necessary.
This has one curious side benefit: currently, on Linux, allocations
that are large enough to be rejected by mmap wind up freezing the
application for a long time before it panics. This happens because
sysReserve doesn't reserve the memory, so sysMap calls mmap_fixed,
which calls mmap, which fails because the mapping is too large.
However, mmap_fixed doesn't inspect *why* mmap fails, so it falls back
to probing every page in the desired region individually with mincore
before performing an (otherwise dangerous) MAP_FIXED mapping, which
will also fail. This takes a long time for a large region. Now this
logic is gone, so the mmap failure leads to an immediate panic.
Updates #10460.
Change-Id: I8efe88c611871cdb14f99fadd09db83e0161ca2e
Reviewed-on: https://go-review.googlesource.com/85888
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-30 19:35:46 -05:00
										 |  |  | 		meta := uintptr(sysReserve(nil, arenaMetaSize)) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		if meta != 0 { | 
					
						
							| 
									
										
										
										
											2020-11-16 21:57:32 +00:00
										 |  |  | 			mheap_.heapArenaAlloc.init(meta, arenaMetaSize, true) | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-04-06 14:32:37 -04:00
										 |  |  | 		// We want to start the arena low, but if we're linked | 
					
						
							|  |  |  | 		// against C code, it's possible global constructors | 
					
						
							|  |  |  | 		// have called malloc and adjusted the process' brk. | 
					
						
							|  |  |  | 		// Query the brk so we can avoid trying to map the | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		// region over it (which will cause the kernel to put | 
					
						
							|  |  |  | 		// the region somewhere else, likely at a high | 
					
						
							| 
									
										
										
										
											2017-04-06 14:32:37 -04:00
										 |  |  | 		// address). | 
					
						
							|  |  |  | 		procBrk := sbrk0() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		// If we ask for the end of the data segment but the | 
					
						
							|  |  |  | 		// operating system requires a little more space | 
					
						
							|  |  |  | 		// before we can start allocating, it will give out a | 
					
						
							|  |  |  | 		// slightly higher pointer. Except QEMU, which is | 
					
						
							|  |  |  | 		// buggy, as usual: it won't adjust the pointer | 
					
						
							|  |  |  | 		// upward. So adjust it upward a little bit ourselves: | 
					
						
							|  |  |  | 		// 1/4 MB to get away from the running binary image. | 
					
						
							|  |  |  | 		p := firstmoduledata.end | 
					
						
							|  |  |  | 		if p < procBrk { | 
					
						
							|  |  |  | 			p = procBrk | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		if mheap_.heapArenaAlloc.next <= p && p < mheap_.heapArenaAlloc.end { | 
					
						
							|  |  |  | 			p = mheap_.heapArenaAlloc.end | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2019-06-28 16:44:07 +00:00
										 |  |  | 		p = alignUp(p+(256<<10), heapArenaBytes) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		// Because we're worried about fragmentation on | 
					
						
							|  |  |  | 		// 32-bit, we try to make a large initial reservation. | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 		arenaSizes := []uintptr{ | 
					
						
							|  |  |  | 			512 << 20, | 
					
						
							|  |  |  | 			256 << 20, | 
					
						
							| 
									
										
										
										
											2015-02-24 11:11:56 -05:00
										 |  |  | 			128 << 20, | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 		} | 
					
						
							|  |  |  | 		for _, arenaSize := range arenaSizes { | 
					
						
							| 
									
										
											  
											
												runtime: remove non-reserved heap logic
Currently large sysReserve calls on some OSes don't actually reserve
the memory, but just check that it can be reserved. This was important
when we called sysReserve to "reserve" many gigabytes for the heap up
front, but now that we map memory in small increments as we need it,
this complication is no longer necessary.
This has one curious side benefit: currently, on Linux, allocations
that are large enough to be rejected by mmap wind up freezing the
application for a long time before it panics. This happens because
sysReserve doesn't reserve the memory, so sysMap calls mmap_fixed,
which calls mmap, which fails because the mapping is too large.
However, mmap_fixed doesn't inspect *why* mmap fails, so it falls back
to probing every page in the desired region individually with mincore
before performing an (otherwise dangerous) MAP_FIXED mapping, which
will also fail. This takes a long time for a large region. Now this
logic is gone, so the mmap failure leads to an immediate panic.
Updates #10460.
Change-Id: I8efe88c611871cdb14f99fadd09db83e0161ca2e
Reviewed-on: https://go-review.googlesource.com/85888
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-30 19:35:46 -05:00
										 |  |  | 			a, size := sysReserveAligned(unsafe.Pointer(p), arenaSize, heapArenaBytes) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 			if a != nil { | 
					
						
							| 
									
										
										
										
											2020-11-16 21:57:32 +00:00
										 |  |  | 				mheap_.arena.init(uintptr(a), size, false) | 
					
						
							| 
									
										
										
										
											2020-04-30 19:35:12 +00:00
										 |  |  | 				p = mheap_.arena.end // For hint below | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 				break | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		hint := (*arenaHint)(mheap_.arenaHintAlloc.alloc()) | 
					
						
							|  |  |  | 		hint.addr = p | 
					
						
							|  |  |  | 		hint.next, mheap_.arenaHints = mheap_.arenaHints, hint | 
					
						
							| 
									
										
											  
											
												runtime: add safe arena support to the runtime
This change adds an API to the runtime for arenas. A later CL can
potentially export it as an experimental API, but for now, just the
runtime implementation will suffice.
The purpose of arenas is to improve efficiency, primarily by allowing
for an application to manually free memory, thereby delaying garbage
collection. It comes with other potential performance benefits, such as
better locality, a better allocation strategy, and better handling of
interior pointers by the GC.
This implementation is based on one by danscales@google.com with a few
significant differences:
* The implementation lives entirely in the runtime (all layers).
* Arena chunks are the minimum of 8 MiB or the heap arena size. This
  choice is made because in practice 64 MiB appears to be way too large
  of an area for most real-world use-cases.
* Arena chunks are not unmapped, instead they're placed on an evacuation
  list and when there are no pointers left pointing into them, they're
  allowed to be reused.
* Reusing partially-used arena chunks no longer tries to find one used
  by the same P first; it just takes the first one available.
* In order to ensure worst-case fragmentation is never worse than 25%,
  only types and slice backing stores whose sizes are 1/4th the size of
  a chunk or less may be used. Previously larger sizes, up to the size
  of the chunk, were allowed.
* ASAN, MSAN, and the race detector are fully supported.
* Sets arena chunks to fault that were deferred at the end of mark
  termination (a non-public patch once did this; I don't see a reason
  not to continue that).
For #51317.
Change-Id: I83b1693a17302554cb36b6daa4e9249a81b1644f
Reviewed-on: https://go-review.googlesource.com/c/go/+/423359
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
											
										 
											2022-08-12 21:40:46 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 		// Place the hint for user arenas just after the large reservation. | 
					
						
							|  |  |  | 		// | 
					
						
							|  |  |  | 		// While this potentially competes with the hint above, in practice we probably | 
					
						
							|  |  |  | 		// aren't going to be getting this far anyway on 32-bit platforms. | 
					
						
							|  |  |  | 		userArenaHint := (*arenaHint)(mheap_.arenaHintAlloc.alloc()) | 
					
						
							|  |  |  | 		userArenaHint.addr = p | 
					
						
							|  |  |  | 		userArenaHint.next, mheap_.userArena.arenaHints = mheap_.userArena.arenaHints, userArenaHint | 
					
						
							| 
									
										
										
										
											2017-12-08 22:57:53 -05:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2022-11-11 22:16:20 +00:00
										 |  |  | 	// Initialize the memory limit here because the allocator is going to look at it | 
					
						
							|  |  |  | 	// but we haven't called gcinit yet and we're definitely going to allocate memory before then. | 
					
						
							|  |  |  | 	gcController.memoryLimit.Store(maxInt64) | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | // sysAlloc allocates heap arena space for at least n bytes. The | 
					
						
							|  |  |  | // returned pointer is always heapArenaBytes-aligned and backed by | 
					
						
							|  |  |  | // h.arenas metadata. The returned size is always a multiple of | 
					
						
							|  |  |  | // heapArenaBytes. sysAlloc returns nil on failure. | 
					
						
							| 
									
										
										
										
											2016-04-28 11:19:53 -04:00
										 |  |  | // There is no corresponding free function. | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2022-09-20 18:33:03 +00:00
										 |  |  | // hintList is a list of hint addresses for where to allocate new | 
					
						
							|  |  |  | // heap arenas. It must be non-nil. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // register indicates whether the heap arena should be registered | 
					
						
							|  |  |  | // in allArenas. | 
					
						
							|  |  |  | // | 
					
						
							| 
									
										
										
										
											2020-11-16 21:57:32 +00:00
										 |  |  | // sysAlloc returns a memory region in the Reserved state. This region must | 
					
						
							|  |  |  | // be transitioned to Prepared and then Ready before use. | 
					
						
							| 
									
										
										
										
											2019-05-14 16:39:43 +00:00
										 |  |  | // | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | // h must be locked. | 
					
						
							| 
									
										
										
										
											2022-09-20 18:33:03 +00:00
										 |  |  | func (h *mheap) sysAlloc(n uintptr, hintList **arenaHint, register bool) (v unsafe.Pointer, size uintptr) { | 
					
						
							| 
									
										
										
										
											2020-08-21 11:59:55 -04:00
										 |  |  | 	assertLockHeld(&h.lock) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-06-28 16:44:07 +00:00
										 |  |  | 	n = alignUp(n, heapArenaBytes) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-20 18:33:03 +00:00
										 |  |  | 	if hintList == &h.arenaHints { | 
					
						
							|  |  |  | 		// First, try the arena pre-reservation. | 
					
						
							|  |  |  | 		// Newly-used mappings are considered released. | 
					
						
							|  |  |  | 		// | 
					
						
							|  |  |  | 		// Only do this if we're using the regular heap arena hints. | 
					
						
							|  |  |  | 		// This behavior is only for the heap. | 
					
						
							|  |  |  | 		v = h.arena.alloc(n, heapArenaBytes, &gcController.heapReleased) | 
					
						
							|  |  |  | 		if v != nil { | 
					
						
							|  |  |  | 			size = n | 
					
						
							|  |  |  | 			goto mapped | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// Try to grow the heap at a hint address. | 
					
						
							| 
									
										
										
										
											2022-09-20 18:33:03 +00:00
										 |  |  | 	for *hintList != nil { | 
					
						
							|  |  |  | 		hint := *hintList | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		p := hint.addr | 
					
						
							|  |  |  | 		if hint.down { | 
					
						
							|  |  |  | 			p -= n | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2018-02-16 17:53:16 -05:00
										 |  |  | 		if p+n < p { | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 			// We can't use this, so don't ask. | 
					
						
							|  |  |  | 			v = nil | 
					
						
							| 
									
										
											  
											
												runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.
2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-22 20:38:09 -05:00
										 |  |  | 		} else if arenaIndex(p+n-1) >= 1<<arenaBits { | 
					
						
							| 
									
										
										
										
											2018-02-16 17:53:16 -05:00
										 |  |  | 			// Outside addressable heap. Can't use. | 
					
						
							|  |  |  | 			v = nil | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		} else { | 
					
						
							| 
									
										
											  
											
												runtime: remove non-reserved heap logic
Currently large sysReserve calls on some OSes don't actually reserve
the memory, but just check that it can be reserved. This was important
when we called sysReserve to "reserve" many gigabytes for the heap up
front, but now that we map memory in small increments as we need it,
this complication is no longer necessary.
This has one curious side benefit: currently, on Linux, allocations
that are large enough to be rejected by mmap wind up freezing the
application for a long time before it panics. This happens because
sysReserve doesn't reserve the memory, so sysMap calls mmap_fixed,
which calls mmap, which fails because the mapping is too large.
However, mmap_fixed doesn't inspect *why* mmap fails, so it falls back
to probing every page in the desired region individually with mincore
before performing an (otherwise dangerous) MAP_FIXED mapping, which
will also fail. This takes a long time for a large region. Now this
logic is gone, so the mmap failure leads to an immediate panic.
Updates #10460.
Change-Id: I8efe88c611871cdb14f99fadd09db83e0161ca2e
Reviewed-on: https://go-review.googlesource.com/85888
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-30 19:35:46 -05:00
										 |  |  | 			v = sysReserve(unsafe.Pointer(p), n) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		} | 
					
						
							|  |  |  | 		if p == uintptr(v) { | 
					
						
							|  |  |  | 			// Success. Update the hint. | 
					
						
							|  |  |  | 			if !hint.down { | 
					
						
							|  |  |  | 				p += n | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 			} | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 			hint.addr = p | 
					
						
							|  |  |  | 			size = n | 
					
						
							|  |  |  | 			break | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		// Failed. Discard this hint and try the next. | 
					
						
							|  |  |  | 		// | 
					
						
							|  |  |  | 		// TODO: This would be cleaner if sysReserve could be | 
					
						
							|  |  |  | 		// told to only return the requested address. In | 
					
						
							|  |  |  | 		// particular, this is already how Windows behaves, so | 
					
						
							| 
									
										
										
										
											2019-05-14 16:39:43 +00:00
										 |  |  | 		// it would simplify things there. | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		if v != nil { | 
					
						
							| 
									
										
											  
											
												runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
											
										 
											2022-03-15 02:48:18 +00:00
										 |  |  | 			sysFreeOS(v, n) | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2022-09-20 18:33:03 +00:00
										 |  |  | 		*hintList = hint.next | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		h.arenaHintAlloc.free(unsafe.Pointer(hint)) | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 	if size == 0 { | 
					
						
							| 
									
										
										
										
											2018-04-04 12:54:47 -04:00
										 |  |  | 		if raceenabled { | 
					
						
							|  |  |  | 			// The race detector assumes the heap lives in | 
					
						
							|  |  |  | 			// [0x00c000000000, 0x00e000000000), but we | 
					
						
							|  |  |  | 			// just ran out of hints in this region. Give | 
					
						
							|  |  |  | 			// a nice failure. | 
					
						
							|  |  |  | 			throw("too many address space collisions for -race mode") | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		// All of the hints failed, so we'll take any | 
					
						
							|  |  |  | 		// (sufficiently aligned) address the kernel will give | 
					
						
							|  |  |  | 		// us. | 
					
						
							| 
									
										
											  
											
												runtime: remove non-reserved heap logic
Currently large sysReserve calls on some OSes don't actually reserve
the memory, but just check that it can be reserved. This was important
when we called sysReserve to "reserve" many gigabytes for the heap up
front, but now that we map memory in small increments as we need it,
this complication is no longer necessary.
This has one curious side benefit: currently, on Linux, allocations
that are large enough to be rejected by mmap wind up freezing the
application for a long time before it panics. This happens because
sysReserve doesn't reserve the memory, so sysMap calls mmap_fixed,
which calls mmap, which fails because the mapping is too large.
However, mmap_fixed doesn't inspect *why* mmap fails, so it falls back
to probing every page in the desired region individually with mincore
before performing an (otherwise dangerous) MAP_FIXED mapping, which
will also fail. This takes a long time for a large region. Now this
logic is gone, so the mmap failure leads to an immediate panic.
Updates #10460.
Change-Id: I8efe88c611871cdb14f99fadd09db83e0161ca2e
Reviewed-on: https://go-review.googlesource.com/85888
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-30 19:35:46 -05:00
										 |  |  | 		v, size = sysReserveAligned(nil, n, heapArenaBytes) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		if v == nil { | 
					
						
							|  |  |  | 			return nil, 0 | 
					
						
							| 
									
										
											  
											
												runtime: accept non-monotonic arena allocation on 32-bit
Currently, the heap arena allocator allocates monotonically increasing
addresses. This is fine on 64-bit where we stake out a giant block of
the address space for ourselves and start at the beginning of it, but
on 32-bit the arena starts at address 0 but we start allocating from
wherever the OS feels like giving us memory. We can generally hint the
OS to start us at a low address, but this doesn't always work.
As a result, on 32-bit, if the OS gives us an arena block that's lower
than the current block we're allocating from, we simply say "thanks
but no thanks", return the whole (256MB!) block of memory, and then
take a fallback path that mmaps just the amount of memory we need
(which may be as little as 8K).
We have to do this because mheap_.arena_used is *both* the highest
used address in the arena and the next address we allocate from.
Fix all of this by separating the second role of arena_used out into a
new field called arena_alloc. This lets us accept any arena block the
OS gives us. This also slightly changes the invariants around
arena_end. Previously, we ensured arena_used <= arena_end, but this
was related to arena_used's second role, so the new invariant is
arena_alloc <= arena_end. As a result, we no longer necessarily update
arena_end when we're updating arena_used.
Fixes #20259 properly. (Unlike the original fix, this one should not
be cherry-picked to Go 1.8.)
This is reasonably low risk. I verified several key properties of the
32-bit code path with both 4K and 64K physical pages using a symbolic
model and the change does not materially affect 64-bit (arena_used ==
arena_alloc on 64-bit). The only oddity is that we no longer call
setArenaUsed with racemap == false to indicate that we're creating a
hole in the address space, but this only happened in a 32-bit-only
code path, and the race detector require 64-bit, so this never
mattered anyway.
Change-Id: Ib1334007933e615166bac4159bf357ae06ec6a25
Reviewed-on: https://go-review.googlesource.com/44010
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-05-23 17:07:26 -04:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		// Create new hints for extending this region. | 
					
						
							|  |  |  | 		hint := (*arenaHint)(h.arenaHintAlloc.alloc()) | 
					
						
							|  |  |  | 		hint.addr, hint.down = uintptr(v), true | 
					
						
							|  |  |  | 		hint.next, mheap_.arenaHints = mheap_.arenaHints, hint | 
					
						
							|  |  |  | 		hint = (*arenaHint)(h.arenaHintAlloc.alloc()) | 
					
						
							|  |  |  | 		hint.addr = uintptr(v) + size | 
					
						
							|  |  |  | 		hint.next, mheap_.arenaHints = mheap_.arenaHints, hint | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-02-16 17:53:16 -05:00
										 |  |  | 	// Check for bad pointers or pointers we can't use. | 
					
						
							|  |  |  | 	{ | 
					
						
							|  |  |  | 		var bad string | 
					
						
							|  |  |  | 		p := uintptr(v) | 
					
						
							|  |  |  | 		if p+size < p { | 
					
						
							|  |  |  | 			bad = "region exceeds uintptr range" | 
					
						
							| 
									
										
											  
											
												runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.
2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-22 20:38:09 -05:00
										 |  |  | 		} else if arenaIndex(p) >= 1<<arenaBits { | 
					
						
							| 
									
										
										
										
											2018-02-16 17:53:16 -05:00
										 |  |  | 			bad = "base outside usable address space" | 
					
						
							| 
									
										
											  
											
												runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.
2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-22 20:38:09 -05:00
										 |  |  | 		} else if arenaIndex(p+size-1) >= 1<<arenaBits { | 
					
						
							| 
									
										
										
										
											2018-02-16 17:53:16 -05:00
										 |  |  | 			bad = "end outside usable address space" | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		if bad != "" { | 
					
						
							|  |  |  | 			// This should be impossible on most architectures, | 
					
						
							|  |  |  | 			// but it would be really confusing to debug. | 
					
						
							|  |  |  | 			print("runtime: memory allocated by OS [", hex(p), ", ", hex(p+size), ") not in usable address space: ", bad, "\n") | 
					
						
							|  |  |  | 			throw("memory reservation exceeds address space limit") | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 	if uintptr(v)&(heapArenaBytes-1) != 0 { | 
					
						
							|  |  |  | 		throw("misrounded allocation in sysAlloc") | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | mapped: | 
					
						
							|  |  |  | 	// Create arena metadata. | 
					
						
							| 
									
										
										
										
											2018-02-16 17:53:16 -05:00
										 |  |  | 	for ri := arenaIndex(uintptr(v)); ri <= arenaIndex(uintptr(v)+size-1); ri++ { | 
					
						
							| 
									
										
											  
											
												runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.
2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-22 20:38:09 -05:00
										 |  |  | 		l2 := h.arenas[ri.l1()] | 
					
						
							|  |  |  | 		if l2 == nil { | 
					
						
							|  |  |  | 			// Allocate an L2 arena map. | 
					
						
							| 
									
										
											  
											
												runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
											
										 
											2022-03-15 02:48:18 +00:00
										 |  |  | 			// | 
					
						
							|  |  |  | 			// Use sysAllocOS instead of sysAlloc or persistentalloc because there's no | 
					
						
							|  |  |  | 			// statistic we can comfortably account for this space in. With this structure, | 
					
						
							|  |  |  | 			// we rely on demand paging to avoid large overheads, but tracking which memory | 
					
						
							|  |  |  | 			// is paged in is too expensive. Trying to account for the whole region means | 
					
						
							|  |  |  | 			// that it will appear like an enormous memory overhead in statistics, even though | 
					
						
							|  |  |  | 			// it is not. | 
					
						
							|  |  |  | 			l2 = (*[1 << arenaL2Bits]*heapArena)(sysAllocOS(unsafe.Sizeof(*l2))) | 
					
						
							| 
									
										
											  
											
												runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.
2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-22 20:38:09 -05:00
										 |  |  | 			if l2 == nil { | 
					
						
							|  |  |  | 				throw("out of memory allocating heap arena map") | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
										
										
											2023-01-03 17:59:48 +00:00
										 |  |  | 			if h.arenasHugePages { | 
					
						
							|  |  |  | 				sysHugePage(unsafe.Pointer(l2), unsafe.Sizeof(*l2)) | 
					
						
							|  |  |  | 			} else { | 
					
						
							|  |  |  | 				sysNoHugePage(unsafe.Pointer(l2), unsafe.Sizeof(*l2)) | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
											  
											
												runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.
2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-22 20:38:09 -05:00
										 |  |  | 			atomic.StorepNoWB(unsafe.Pointer(&h.arenas[ri.l1()]), unsafe.Pointer(l2)) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		if l2[ri.l2()] != nil { | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 			throw("arena already initialized") | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		var r *heapArena | 
					
						
							| 
									
										
										
										
											2021-06-16 23:05:44 +00:00
										 |  |  | 		r = (*heapArena)(h.heapArenaAlloc.alloc(unsafe.Sizeof(*r), goarch.PtrSize, &memstats.gcMiscSys)) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		if r == nil { | 
					
						
							| 
									
										
										
										
											2021-06-16 23:05:44 +00:00
										 |  |  | 			r = (*heapArena)(persistentalloc(unsafe.Sizeof(*r), goarch.PtrSize, &memstats.gcMiscSys)) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 			if r == nil { | 
					
						
							|  |  |  | 				throw("out of memory allocating heap arena metadata") | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-09-20 18:33:03 +00:00
										 |  |  | 		// Register the arena in allArenas if requested. | 
					
						
							|  |  |  | 		if register { | 
					
						
							|  |  |  | 			if len(h.allArenas) == cap(h.allArenas) { | 
					
						
							|  |  |  | 				size := 2 * uintptr(cap(h.allArenas)) * goarch.PtrSize | 
					
						
							|  |  |  | 				if size == 0 { | 
					
						
							|  |  |  | 					size = physPageSize | 
					
						
							|  |  |  | 				} | 
					
						
							|  |  |  | 				newArray := (*notInHeap)(persistentalloc(size, goarch.PtrSize, &memstats.gcMiscSys)) | 
					
						
							|  |  |  | 				if newArray == nil { | 
					
						
							|  |  |  | 					throw("out of memory allocating allArenas") | 
					
						
							|  |  |  | 				} | 
					
						
							|  |  |  | 				oldSlice := h.allArenas | 
					
						
							|  |  |  | 				*(*notInHeapSlice)(unsafe.Pointer(&h.allArenas)) = notInHeapSlice{newArray, len(h.allArenas), int(size / goarch.PtrSize)} | 
					
						
							|  |  |  | 				copy(h.allArenas, oldSlice) | 
					
						
							|  |  |  | 				// Do not free the old backing array because | 
					
						
							|  |  |  | 				// there may be concurrent readers. Since we | 
					
						
							|  |  |  | 				// double the array each time, this can lead | 
					
						
							|  |  |  | 				// to at most 2x waste. | 
					
						
							| 
									
										
										
										
											2018-09-26 14:20:58 -04:00
										 |  |  | 			} | 
					
						
							| 
									
										
										
										
											2022-09-20 18:33:03 +00:00
										 |  |  | 			h.allArenas = h.allArenas[:len(h.allArenas)+1] | 
					
						
							|  |  |  | 			h.allArenas[len(h.allArenas)-1] = ri | 
					
						
							| 
									
										
										
										
											2018-09-26 14:20:58 -04:00
										 |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		// Store atomically just in case an object from the | 
					
						
							|  |  |  | 		// new heap arena becomes visible before the heap lock | 
					
						
							|  |  |  | 		// is released (which shouldn't happen, but there's | 
					
						
							|  |  |  | 		// little downside to this). | 
					
						
							| 
									
										
											  
											
												runtime: support a two-level arena map
Currently, the heap arena map is a single, large array that covers
every possible arena frame in the entire address space. This is
practical up to about 48 bits of address space with 64 MB arenas.
However, there are two problems with this:
1. mips64, ppc64, and s390x support full 64-bit address spaces (though
   on Linux only s390x has kernel support for 64-bit address spaces).
   On these platforms, it would be good to support these larger
   address spaces.
2. On Windows, processes are charged for untouched memory, so for
   processes with small heaps, the mostly-untouched 32 MB arena map
   plus a 64 MB arena are significant overhead. Hence, it would be
   good to reduce both the arena map size and the arena size, but with
   a single-level arena, these are inversely proportional.
This CL adds support for a two-level arena map. Arena frame numbers
are now divided into arenaL1Bits of L1 index and arenaL2Bits of L2
index.
At the moment, arenaL1Bits is always 0, so we effectively have a
single level map. We do a few things so that this has no cost beyond
the current single-level map:
1. We embed the L2 array directly in mheap, so if there's a single
   entry in the L2 array, the representation is identical to the
   current representation and there's no extra level of indirection.
2. Hot code that accesses the arena map is structured so that it
   optimizes to nearly the same machine code as it does currently.
3. We make some small tweaks to hot code paths and to the inliner
   itself to keep some important functions inlined despite their
   now-larger ASTs. In particular, this is necessary for
   heapBitsForAddr and heapBits.next.
Possibly as a result of some of the tweaks, this actually slightly
improves the performance of the x/benchmarks garbage benchmark:
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.28ms ± 1%  2.26ms ± 1%  -1.07%  (p=0.000 n=17+19)
(https://perf.golang.org/search?q=upload:20180223.2)
For #23900.
Change-Id: If5164e0961754f97eb9eca58f837f36d759505ff
Reviewed-on: https://go-review.googlesource.com/96779
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2018-02-22 20:38:09 -05:00
										 |  |  | 		atomic.StorepNoWB(unsafe.Pointer(&l2[ri.l2()]), unsafe.Pointer(r)) | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 	// Tell the race detector about the new heap memory. | 
					
						
							|  |  |  | 	if raceenabled { | 
					
						
							|  |  |  | 		racemapshadow(v, size) | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 	return | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // sysReserveAligned is like sysReserve, but the returned pointer is | 
					
						
							|  |  |  | // aligned to align bytes. It may reserve either n or n+align bytes, | 
					
						
							|  |  |  | // so it returns the size that was reserved. | 
					
						
							| 
									
										
											  
											
												runtime: remove non-reserved heap logic
Currently large sysReserve calls on some OSes don't actually reserve
the memory, but just check that it can be reserved. This was important
when we called sysReserve to "reserve" many gigabytes for the heap up
front, but now that we map memory in small increments as we need it,
this complication is no longer necessary.
This has one curious side benefit: currently, on Linux, allocations
that are large enough to be rejected by mmap wind up freezing the
application for a long time before it panics. This happens because
sysReserve doesn't reserve the memory, so sysMap calls mmap_fixed,
which calls mmap, which fails because the mapping is too large.
However, mmap_fixed doesn't inspect *why* mmap fails, so it falls back
to probing every page in the desired region individually with mincore
before performing an (otherwise dangerous) MAP_FIXED mapping, which
will also fail. This takes a long time for a large region. Now this
logic is gone, so the mmap failure leads to an immediate panic.
Updates #10460.
Change-Id: I8efe88c611871cdb14f99fadd09db83e0161ca2e
Reviewed-on: https://go-review.googlesource.com/85888
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-30 19:35:46 -05:00
										 |  |  | func sysReserveAligned(v unsafe.Pointer, size, align uintptr) (unsafe.Pointer, uintptr) { | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 	// Since the alignment is rather large in uses of this | 
					
						
							|  |  |  | 	// function, we're not likely to get it by chance, so we ask | 
					
						
							|  |  |  | 	// for a larger region and remove the parts we don't need. | 
					
						
							|  |  |  | 	retries := 0 | 
					
						
							|  |  |  | retry: | 
					
						
							| 
									
										
											  
											
												runtime: remove non-reserved heap logic
Currently large sysReserve calls on some OSes don't actually reserve
the memory, but just check that it can be reserved. This was important
when we called sysReserve to "reserve" many gigabytes for the heap up
front, but now that we map memory in small increments as we need it,
this complication is no longer necessary.
This has one curious side benefit: currently, on Linux, allocations
that are large enough to be rejected by mmap wind up freezing the
application for a long time before it panics. This happens because
sysReserve doesn't reserve the memory, so sysMap calls mmap_fixed,
which calls mmap, which fails because the mapping is too large.
However, mmap_fixed doesn't inspect *why* mmap fails, so it falls back
to probing every page in the desired region individually with mincore
before performing an (otherwise dangerous) MAP_FIXED mapping, which
will also fail. This takes a long time for a large region. Now this
logic is gone, so the mmap failure leads to an immediate panic.
Updates #10460.
Change-Id: I8efe88c611871cdb14f99fadd09db83e0161ca2e
Reviewed-on: https://go-review.googlesource.com/85888
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-30 19:35:46 -05:00
										 |  |  | 	p := uintptr(sysReserve(v, size+align)) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 	switch { | 
					
						
							|  |  |  | 	case p == 0: | 
					
						
							|  |  |  | 		return nil, 0 | 
					
						
							|  |  |  | 	case p&(align-1) == 0: | 
					
						
							|  |  |  | 		return unsafe.Pointer(p), size + align | 
					
						
							|  |  |  | 	case GOOS == "windows": | 
					
						
							|  |  |  | 		// On Windows we can't release pieces of a | 
					
						
							|  |  |  | 		// reservation, so we release the whole thing and | 
					
						
							|  |  |  | 		// re-reserve the aligned sub-region. This may race, | 
					
						
							|  |  |  | 		// so we may have to try again. | 
					
						
							| 
									
										
											  
											
												runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
											
										 
											2022-03-15 02:48:18 +00:00
										 |  |  | 		sysFreeOS(unsafe.Pointer(p), size+align) | 
					
						
							| 
									
										
										
										
											2019-06-28 16:44:07 +00:00
										 |  |  | 		p = alignUp(p, align) | 
					
						
							| 
									
										
											  
											
												runtime: remove non-reserved heap logic
Currently large sysReserve calls on some OSes don't actually reserve
the memory, but just check that it can be reserved. This was important
when we called sysReserve to "reserve" many gigabytes for the heap up
front, but now that we map memory in small increments as we need it,
this complication is no longer necessary.
This has one curious side benefit: currently, on Linux, allocations
that are large enough to be rejected by mmap wind up freezing the
application for a long time before it panics. This happens because
sysReserve doesn't reserve the memory, so sysMap calls mmap_fixed,
which calls mmap, which fails because the mapping is too large.
However, mmap_fixed doesn't inspect *why* mmap fails, so it falls back
to probing every page in the desired region individually with mincore
before performing an (otherwise dangerous) MAP_FIXED mapping, which
will also fail. This takes a long time for a large region. Now this
logic is gone, so the mmap failure leads to an immediate panic.
Updates #10460.
Change-Id: I8efe88c611871cdb14f99fadd09db83e0161ca2e
Reviewed-on: https://go-review.googlesource.com/85888
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-30 19:35:46 -05:00
										 |  |  | 		p2 := sysReserve(unsafe.Pointer(p), size) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		if p != uintptr(p2) { | 
					
						
							|  |  |  | 			// Must have raced. Try again. | 
					
						
							| 
									
										
											  
											
												runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
											
										 
											2022-03-15 02:48:18 +00:00
										 |  |  | 			sysFreeOS(p2, size) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 			if retries++; retries == 100 { | 
					
						
							|  |  |  | 				throw("failed to allocate aligned heap memory; too many retries") | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 			goto retry | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		// Success. | 
					
						
							|  |  |  | 		return p2, size | 
					
						
							|  |  |  | 	default: | 
					
						
							|  |  |  | 		// Trim off the unaligned parts. | 
					
						
							| 
									
										
										
										
											2019-06-28 16:44:07 +00:00
										 |  |  | 		pAligned := alignUp(p, align) | 
					
						
							| 
									
										
											  
											
												runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
											
										 
											2022-03-15 02:48:18 +00:00
										 |  |  | 		sysFreeOS(unsafe.Pointer(p), pAligned-p) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		end := pAligned + size | 
					
						
							|  |  |  | 		endLen := (p + size + align) - end | 
					
						
							|  |  |  | 		if endLen > 0 { | 
					
						
							| 
									
										
											  
											
												runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
											
										 
											2022-03-15 02:48:18 +00:00
										 |  |  | 			sysFreeOS(unsafe.Pointer(end), endLen) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		} | 
					
						
							|  |  |  | 		return unsafe.Pointer(pAligned), size | 
					
						
							| 
									
										
										
										
											2015-02-19 13:38:46 -05:00
										 |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-01-03 17:59:48 +00:00
										 |  |  | // enableMetadataHugePages enables huge pages for various sources of heap metadata. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // A note on latency: for sufficiently small heaps (<10s of GiB) this function will take constant | 
					
						
							|  |  |  | // time, but may take time proportional to the size of the mapped heap beyond that. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // This function is idempotent. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // The heap lock must not be held over this operation, since it will briefly acquire | 
					
						
							|  |  |  | // the heap lock. | 
					
						
							| 
									
										
										
										
											2023-11-10 21:23:38 +00:00
										 |  |  | // | 
					
						
							|  |  |  | // Must be called on the system stack because it acquires the heap lock. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | //go:systemstack | 
					
						
							| 
									
										
										
										
											2023-01-03 17:59:48 +00:00
										 |  |  | func (h *mheap) enableMetadataHugePages() { | 
					
						
							|  |  |  | 	// Enable huge pages for page structure. | 
					
						
							|  |  |  | 	h.pages.enableChunkHugePages() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// Grab the lock and set arenasHugePages if it's not. | 
					
						
							|  |  |  | 	// | 
					
						
							|  |  |  | 	// Once arenasHugePages is set, all new L2 entries will be eligible for | 
					
						
							|  |  |  | 	// huge pages. We'll set all the old entries after we release the lock. | 
					
						
							|  |  |  | 	lock(&h.lock) | 
					
						
							|  |  |  | 	if h.arenasHugePages { | 
					
						
							|  |  |  | 		unlock(&h.lock) | 
					
						
							|  |  |  | 		return | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	h.arenasHugePages = true | 
					
						
							|  |  |  | 	unlock(&h.lock) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// N.B. The arenas L1 map is quite small on all platforms, so it's fine to | 
					
						
							|  |  |  | 	// just iterate over the whole thing. | 
					
						
							|  |  |  | 	for i := range h.arenas { | 
					
						
							|  |  |  | 		l2 := (*[1 << arenaL2Bits]*heapArena)(atomic.Loadp(unsafe.Pointer(&h.arenas[i]))) | 
					
						
							|  |  |  | 		if l2 == nil { | 
					
						
							|  |  |  | 			continue | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		sysHugePage(unsafe.Pointer(l2), unsafe.Sizeof(*l2)) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-16 10:22:15 -04:00
										 |  |  | // base address for all 0-byte allocations | 
					
						
							|  |  |  | var zerobase uintptr | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-11 18:34:18 +00:00
										 |  |  | // nextFreeFast returns the next free object if one is quickly available. | 
					
						
							|  |  |  | // Otherwise it returns 0. | 
					
						
							|  |  |  | func nextFreeFast(s *mspan) gclinkptr { | 
					
						
							| 
									
										
										
										
											2022-10-05 15:29:29 +08:00
										 |  |  | 	theBit := sys.TrailingZeros64(s.allocCache) // Is there a free object in the allocCache? | 
					
						
							| 
									
										
										
										
											2016-03-31 10:45:36 -04:00
										 |  |  | 	if theBit < 64 { | 
					
						
							| 
									
										
										
										
											2022-11-16 17:32:08 -05:00
										 |  |  | 		result := s.freeindex + uint16(theBit) | 
					
						
							| 
									
										
										
										
											2016-03-02 12:15:02 -05:00
										 |  |  | 		if result < s.nelems { | 
					
						
							| 
									
										
										
										
											2022-11-11 18:34:18 +00:00
										 |  |  | 			freeidx := result + 1 | 
					
						
							|  |  |  | 			if freeidx%64 == 0 && freeidx != s.nelems { | 
					
						
							|  |  |  | 				return 0 | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
										
										
											2017-03-14 13:25:12 -07:00
										 |  |  | 			s.allocCache >>= uint(theBit + 1) | 
					
						
							| 
									
										
										
										
											2022-11-11 18:34:18 +00:00
										 |  |  | 			s.freeindex = freeidx | 
					
						
							| 
									
										
										
										
											2016-03-02 12:15:02 -05:00
										 |  |  | 			s.allocCount++ | 
					
						
							| 
									
										
										
										
											2022-11-16 17:32:08 -05:00
										 |  |  | 			return gclinkptr(uintptr(result)*s.elemsize + s.base()) | 
					
						
							| 
									
										
										
										
											2016-03-02 12:15:02 -05:00
										 |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2022-11-11 18:34:18 +00:00
										 |  |  | 	return 0 | 
					
						
							| 
									
										
										
										
											2016-03-02 12:15:02 -05:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-02-08 12:36:23 -05:00
										 |  |  | // nextFree returns the next free object from the cached span if one is available. | 
					
						
							|  |  |  | // Otherwise it refills the cache with a span with an available object and | 
					
						
							|  |  |  | // returns that object along with a flag indicating that this was a heavy | 
					
						
							|  |  |  | // weight allocation. If it is a heavy weight allocation the caller must | 
					
						
							|  |  |  | // determine whether a new GC cycle needs to be started or if the GC is active | 
					
						
							|  |  |  | // whether this goroutine needs to assist the GC. | 
					
						
							| 
									
										
										
										
											2018-09-23 19:12:15 -04:00
										 |  |  | // | 
					
						
							|  |  |  | // Must run in a non-preemptible context since otherwise the owner of | 
					
						
							|  |  |  | // c could change. | 
					
						
							| 
									
										
										
										
											2022-11-11 18:34:18 +00:00
										 |  |  | func (c *mcache) nextFree(spc spanClass) (v gclinkptr, s *mspan, shouldhelpgc bool) { | 
					
						
							| 
									
										
										
										
											2016-02-09 17:53:07 -05:00
										 |  |  | 	s = c.alloc[spc] | 
					
						
							| 
									
										
										
										
											2016-02-11 13:57:58 -05:00
										 |  |  | 	shouldhelpgc = false | 
					
						
							| 
									
										
										
										
											2022-11-11 18:34:18 +00:00
										 |  |  | 	freeIndex := s.nextFreeIndex() | 
					
						
							| 
									
										
										
										
											2016-02-11 13:57:58 -05:00
										 |  |  | 	if freeIndex == s.nelems { | 
					
						
							|  |  |  | 		// The span is full. | 
					
						
							| 
									
										
										
										
											2022-11-16 17:32:08 -05:00
										 |  |  | 		if s.allocCount != s.nelems { | 
					
						
							| 
									
										
										
										
											2016-02-24 14:36:30 -05:00
										 |  |  | 			println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems) | 
					
						
							| 
									
										
										
										
											2016-03-02 12:15:02 -05:00
										 |  |  | 			throw("s.allocCount != s.nelems && freeIndex == s.nelems") | 
					
						
							| 
									
										
										
										
											2016-02-11 13:57:58 -05:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2018-09-23 19:12:15 -04:00
										 |  |  | 		c.refill(spc) | 
					
						
							| 
									
										
										
										
											2016-02-08 12:36:23 -05:00
										 |  |  | 		shouldhelpgc = true | 
					
						
							| 
									
										
										
										
											2016-02-09 17:53:07 -05:00
										 |  |  | 		s = c.alloc[spc] | 
					
						
							| 
									
										
										
										
											2016-02-24 14:36:30 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 		freeIndex = s.nextFreeIndex() | 
					
						
							| 
									
										
										
										
											2016-02-11 13:57:58 -05:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2016-02-24 14:36:30 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-02-11 13:57:58 -05:00
										 |  |  | 	if freeIndex >= s.nelems { | 
					
						
							|  |  |  | 		throw("freeIndex is not valid") | 
					
						
							| 
									
										
										
										
											2016-02-08 12:36:23 -05:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2016-02-11 13:57:58 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-11-16 17:32:08 -05:00
										 |  |  | 	v = gclinkptr(uintptr(freeIndex)*s.elemsize + s.base()) | 
					
						
							| 
									
										
										
										
											2016-02-16 17:16:43 -05:00
										 |  |  | 	s.allocCount++ | 
					
						
							| 
									
										
										
										
											2022-11-16 17:32:08 -05:00
										 |  |  | 	if s.allocCount > s.nelems { | 
					
						
							| 
									
										
										
										
											2016-02-24 14:36:30 -05:00
										 |  |  | 		println("s.allocCount=", s.allocCount, "s.nelems=", s.nelems) | 
					
						
							| 
									
										
										
										
											2016-02-16 17:16:43 -05:00
										 |  |  | 		throw("s.allocCount > s.nelems") | 
					
						
							| 
									
										
										
										
											2016-02-11 13:57:58 -05:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2016-02-08 12:36:23 -05:00
										 |  |  | 	return | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-08-05 17:03:06 +04:00
										 |  |  | // Allocate an object of size bytes. | 
					
						
							|  |  |  | // Small objects are allocated from the per-P cache's free lists. | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | // Large objects (> 32 kB) are allocated straight from the heap. | 
					
						
							| 
									
										
										
										
											2016-04-19 19:35:10 -07:00
										 |  |  | func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { | 
					
						
							| 
									
										
										
										
											2015-03-05 17:33:08 -05:00
										 |  |  | 	if gcphase == _GCmarktermination { | 
					
						
							|  |  |  | 		throw("mallocgc called with gcphase == _GCmarktermination") | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2015-03-08 20:56:15 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 	if size == 0 { | 
					
						
							| 
									
										
										
										
											2014-09-16 10:22:15 -04:00
										 |  |  | 		return unsafe.Pointer(&zerobase) | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2022-07-20 15:06:31 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	// It's possible for any malloc to trigger sweeping, which may in | 
					
						
							|  |  |  | 	// turn queue finalizers. Record this dynamic lock edge. | 
					
						
							|  |  |  | 	lockRankMayQueueFinalizer() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-05 17:52:43 +08:00
										 |  |  | 	userSize := size | 
					
						
							|  |  |  | 	if asanenabled { | 
					
						
							|  |  |  | 		// Refer to ASAN runtime library, the malloc() function allocates extra memory, | 
					
						
							|  |  |  | 		// the redzone, around the user requested memory region. And the redzones are marked | 
					
						
							|  |  |  | 		// as unaddressable. We perform the same operations in Go to detect the overflows or | 
					
						
							|  |  |  | 		// underflows. | 
					
						
							|  |  |  | 		size += computeRZlog(size) | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: implement GODEBUG=inittrace=1 support
Setting inittrace=1 causes the runtime to emit a single line to standard error for
each package with init work, summarizing the execution time and memory allocation.
The emitted debug information for init functions can be used to find bottlenecks
or regressions in Go startup performance.
Packages with no init function work (user defined or compiler generated) are omitted.
Tracing plugin inits is not supported as they can execute concurrently. This would
make the implementation of tracing more complex while adding support for a very rare
use case. Plugin inits can be traced separately by testing a main package importing
the plugins package imports explicitly.
$ GODEBUG=inittrace=1 go test
init internal/bytealg @0.008 ms, 0 ms clock, 0 bytes, 0 allocs
init runtime @0.059 ms, 0.026 ms clock, 0 bytes, 0 allocs
init math @0.19 ms, 0.001 ms clock, 0 bytes, 0 allocs
init errors @0.22 ms, 0.004 ms clock, 0 bytes, 0 allocs
init strconv @0.24 ms, 0.002 ms clock, 32 bytes, 2 allocs
init sync @0.28 ms, 0.003 ms clock, 16 bytes, 1 allocs
init unicode @0.44 ms, 0.11 ms clock, 23328 bytes, 24 allocs
...
Inspired by stapelberg@google.com who instrumented doInit
in a prototype to measure init times with GDB.
Fixes #41378
Change-Id: Ic37c6a0cfc95488de9e737f5e346b8dbb39174e1
Reviewed-on: https://go-review.googlesource.com/c/go/+/254659
Trust: Martin Möhrmann <moehrmann@google.com>
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2020-09-14 16:55:34 +02:00
										 |  |  | 	if debug.malloc { | 
					
						
							|  |  |  | 		if debug.sbrk != 0 { | 
					
						
							|  |  |  | 			align := uintptr(16) | 
					
						
							|  |  |  | 			if typ != nil { | 
					
						
							|  |  |  | 				// TODO(austin): This should be just | 
					
						
							|  |  |  | 				//   align = uintptr(typ.align) | 
					
						
							|  |  |  | 				// but that's only 4 on 32-bit platforms, | 
					
						
							|  |  |  | 				// even if there's a uint64 field in typ (see #599). | 
					
						
							|  |  |  | 				// This causes 64-bit atomic accesses to panic. | 
					
						
							|  |  |  | 				// Hence, we use stricter alignment that matches | 
					
						
							|  |  |  | 				// the normal allocator better. | 
					
						
							|  |  |  | 				if size&7 == 0 { | 
					
						
							|  |  |  | 					align = 8 | 
					
						
							|  |  |  | 				} else if size&3 == 0 { | 
					
						
							|  |  |  | 					align = 4 | 
					
						
							|  |  |  | 				} else if size&1 == 0 { | 
					
						
							|  |  |  | 					align = 2 | 
					
						
							|  |  |  | 				} else { | 
					
						
							|  |  |  | 					align = 1 | 
					
						
							|  |  |  | 				} | 
					
						
							| 
									
										
										
										
											2019-07-18 12:30:37 -04:00
										 |  |  | 			} | 
					
						
							| 
									
										
											  
											
												runtime: implement GODEBUG=inittrace=1 support
Setting inittrace=1 causes the runtime to emit a single line to standard error for
each package with init work, summarizing the execution time and memory allocation.
The emitted debug information for init functions can be used to find bottlenecks
or regressions in Go startup performance.
Packages with no init function work (user defined or compiler generated) are omitted.
Tracing plugin inits is not supported as they can execute concurrently. This would
make the implementation of tracing more complex while adding support for a very rare
use case. Plugin inits can be traced separately by testing a main package importing
the plugins package imports explicitly.
$ GODEBUG=inittrace=1 go test
init internal/bytealg @0.008 ms, 0 ms clock, 0 bytes, 0 allocs
init runtime @0.059 ms, 0.026 ms clock, 0 bytes, 0 allocs
init math @0.19 ms, 0.001 ms clock, 0 bytes, 0 allocs
init errors @0.22 ms, 0.004 ms clock, 0 bytes, 0 allocs
init strconv @0.24 ms, 0.002 ms clock, 32 bytes, 2 allocs
init sync @0.28 ms, 0.003 ms clock, 16 bytes, 1 allocs
init unicode @0.44 ms, 0.11 ms clock, 23328 bytes, 24 allocs
...
Inspired by stapelberg@google.com who instrumented doInit
in a prototype to measure init times with GDB.
Fixes #41378
Change-Id: Ic37c6a0cfc95488de9e737f5e346b8dbb39174e1
Reviewed-on: https://go-review.googlesource.com/c/go/+/254659
Trust: Martin Möhrmann <moehrmann@google.com>
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2020-09-14 16:55:34 +02:00
										 |  |  | 			return persistentalloc(size, align, &memstats.other_sys) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		if inittrace.active && inittrace.id == getg().goid { | 
					
						
							| 
									
										
										
										
											2021-03-29 18:51:06 +08:00
										 |  |  | 			// Init functions are executed sequentially in a single goroutine. | 
					
						
							| 
									
										
											  
											
												runtime: implement GODEBUG=inittrace=1 support
Setting inittrace=1 causes the runtime to emit a single line to standard error for
each package with init work, summarizing the execution time and memory allocation.
The emitted debug information for init functions can be used to find bottlenecks
or regressions in Go startup performance.
Packages with no init function work (user defined or compiler generated) are omitted.
Tracing plugin inits is not supported as they can execute concurrently. This would
make the implementation of tracing more complex while adding support for a very rare
use case. Plugin inits can be traced separately by testing a main package importing
the plugins package imports explicitly.
$ GODEBUG=inittrace=1 go test
init internal/bytealg @0.008 ms, 0 ms clock, 0 bytes, 0 allocs
init runtime @0.059 ms, 0.026 ms clock, 0 bytes, 0 allocs
init math @0.19 ms, 0.001 ms clock, 0 bytes, 0 allocs
init errors @0.22 ms, 0.004 ms clock, 0 bytes, 0 allocs
init strconv @0.24 ms, 0.002 ms clock, 32 bytes, 2 allocs
init sync @0.28 ms, 0.003 ms clock, 16 bytes, 1 allocs
init unicode @0.44 ms, 0.11 ms clock, 23328 bytes, 24 allocs
...
Inspired by stapelberg@google.com who instrumented doInit
in a prototype to measure init times with GDB.
Fixes #41378
Change-Id: Ic37c6a0cfc95488de9e737f5e346b8dbb39174e1
Reviewed-on: https://go-review.googlesource.com/c/go/+/254659
Trust: Martin Möhrmann <moehrmann@google.com>
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2020-09-14 16:55:34 +02:00
										 |  |  | 			inittrace.allocs += 1 | 
					
						
							| 
									
										
										
										
											2015-03-08 20:56:15 -04:00
										 |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-10-04 20:56:11 -07:00
										 |  |  | 	// assistG is the G to charge for this allocation, or nil if | 
					
						
							|  |  |  | 	// GC is not currently active. | 
					
						
							| 
									
										
										
										
											2022-08-13 16:37:09 +00:00
										 |  |  | 	assistG := deductAssistCredit(size) | 
					
						
							| 
									
										
										
										
											2015-10-04 20:56:11 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-01-16 14:43:38 -05:00
										 |  |  | 	// Set mp.mallocing to keep from being preempted by GC. | 
					
						
							|  |  |  | 	mp := acquirem() | 
					
						
							|  |  |  | 	if mp.mallocing != 0 { | 
					
						
							|  |  |  | 		throw("malloc deadlock") | 
					
						
							| 
									
										
										
										
											2014-08-18 16:33:39 +04:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2015-05-12 10:01:37 -07:00
										 |  |  | 	if mp.gsignal == getg() { | 
					
						
							|  |  |  | 		throw("malloc during signal") | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2015-01-16 14:43:38 -05:00
										 |  |  | 	mp.mallocing = 1 | 
					
						
							| 
									
										
										
										
											2014-08-18 16:33:39 +04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-03-08 20:56:15 -04:00
										 |  |  | 	shouldhelpgc := false | 
					
						
							| 
									
										
										
										
											2021-01-05 17:52:43 +08:00
										 |  |  | 	dataSize := userSize | 
					
						
							| 
									
										
										
										
											2021-09-11 20:53:24 +08:00
										 |  |  | 	c := getMCache(mp) | 
					
						
							| 
									
										
										
										
											2020-11-02 16:58:38 +00:00
										 |  |  | 	if c == nil { | 
					
						
							|  |  |  | 		throw("mallocgc called without a P or outside bootstrapping") | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2020-06-05 22:01:25 -04:00
										 |  |  | 	var span *mspan | 
					
						
							| 
									
										
											  
											
												runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size
classes with allocation headers for objects that contain pointers. The
header consists of a single pointer to a type. All allocations with
headers are treated as implicitly containing one or more instances of
the type in the header.
As the name implies, headers are usually stored as the first word of an
object. There are two additional exceptions to where headers are stored
and how they're used.
Objects smaller than 512 bytes do not have headers. Instead, a heap
bitmap is reserved at the end of spans for objects of this size. A full
word of overhead is too much for these small objects. The bitmap is of
the same format of the old bitmap, minus the noMorePtrs bits which are
unnecessary. All the objects <512 bytes have a bitmap less than a
pointer-word in size, and that was the granularity at which noMorePtrs
could stop scanning early anyway.
Objects that are larger than 32 KiB (which have their own span) have
their headers stored directly in the span, to allow power-of-two-sized
allocations to not spill over into an extra page.
The full implementation is behind GOEXPERIMENT=allocheaders.
The purpose of this change is performance. First and foremost, with
headers we no longer have to unroll pointer/scalar data at allocation
time for most size classes. Small size classes still need some
unrolling, but their bitmaps are small so we can optimize that case
fairly well. Larger objects effectively have their pointer/scalar data
unrolled on-demand from type data, which is much more compactly
represented and results in less TLB pressure. Furthermore, since the
headers are usually right next to the object and where we're about to
start scanning, we get an additional temporal locality benefit in the
data cache when looking up type metadata. The pointer/scalar data is
now effectively unrolled on-demand, but it's also simpler to unroll than
before; that unrolled data is never written anywhere, and for arrays we
get the benefit of retreading the same data per element, as opposed to
looking it up from scratch for each pointer-word of bitmap. Lastly,
because we no longer have a heap bitmap that spans the entire heap,
there's a flat 1.5% memory use reduction. This is balanced slightly by
some objects possibly being bumped up a size class, but most objects are
not tightly optimized to size class sizes so there's some memory to
spare, making the header basically free in those cases.
See the follow-up CL which turns on this experiment by default for
benchmark results. (CL 538217.)
Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
											
										 
											2022-09-11 04:07:41 +00:00
										 |  |  | 	var header **_type | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 	var x unsafe.Pointer | 
					
						
							| 
									
										
										
										
											2023-01-20 16:41:57 -05:00
										 |  |  | 	noscan := typ == nil || typ.PtrBytes == 0 | 
					
						
							| 
									
										
										
										
											2020-11-17 19:54:31 -05:00
										 |  |  | 	// In some cases block zeroing can profitably (for latency reduction purposes) | 
					
						
							| 
									
										
											  
											
												runtime: clean up allocation zeroing
Currently, the runtime zeroes allocations in several ways. First, small
object spans are always zeroed if they come from mheap, and their slots
are zeroed later in mallocgc if needed. Second, large object spans
(objects that have their own spans) plumb the need for zeroing down into
mheap. Thirdly, large objects that have no pointers have their zeroing
delayed until after preemption is reenabled, but before returning in
mallocgc.
All of this has two consequences:
1. Spans for small objects that come from mheap are sometimes
   unnecessarily zeroed, even if the mallocgc call that created them
   doesn't need the object slot to be zeroed.
2. This is all messy and difficult to reason about.
This CL simplifies this code, resolving both (1) and (2). First, it
recognizes that zeroing in mheap is unnecessary for small object spans;
mallocgc and its callees in mcache and mcentral, by design, are *always*
able to deal with non-zeroed spans. They must, for they deal with
recycled spans all the time. Once this fact is made clear, the only
remaining use of zeroing in mheap is for large objects.
As a result, this CL lifts mheap zeroing for large objects into
mallocgc, to parallel all the other codepaths in mallocgc. This is makes
the large object allocation code less surprising.
Next, this CL sets the flag for the delayed zeroing explicitly in the one
case where it matters, and inverts and renames the flag from isZeroed to
delayZeroing.
Finally, it adds a check to make sure that only pointer-free allocations
take the delayed zeroing codepath, as an extra safety measure.
Benchmark results: https://perf.golang.org/search?q=upload:20211028.8
Inspired by tapir.liu@gmail.com's CL 343470.
Change-Id: I7e1296adc19ce8a02c8d93a0a5082aefb2673e8f
Reviewed-on: https://go-review.googlesource.com/c/go/+/359477
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: David Chase <drchase@google.com>
											
										 
											2021-10-28 17:52:22 +00:00
										 |  |  | 	// be delayed till preemption is possible; delayedZeroing tracks that state. | 
					
						
							|  |  |  | 	delayedZeroing := false | 
					
						
							| 
									
										
											  
											
												runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size
classes with allocation headers for objects that contain pointers. The
header consists of a single pointer to a type. All allocations with
headers are treated as implicitly containing one or more instances of
the type in the header.
As the name implies, headers are usually stored as the first word of an
object. There are two additional exceptions to where headers are stored
and how they're used.
Objects smaller than 512 bytes do not have headers. Instead, a heap
bitmap is reserved at the end of spans for objects of this size. A full
word of overhead is too much for these small objects. The bitmap is of
the same format of the old bitmap, minus the noMorePtrs bits which are
unnecessary. All the objects <512 bytes have a bitmap less than a
pointer-word in size, and that was the granularity at which noMorePtrs
could stop scanning early anyway.
Objects that are larger than 32 KiB (which have their own span) have
their headers stored directly in the span, to allow power-of-two-sized
allocations to not spill over into an extra page.
The full implementation is behind GOEXPERIMENT=allocheaders.
The purpose of this change is performance. First and foremost, with
headers we no longer have to unroll pointer/scalar data at allocation
time for most size classes. Small size classes still need some
unrolling, but their bitmaps are small so we can optimize that case
fairly well. Larger objects effectively have their pointer/scalar data
unrolled on-demand from type data, which is much more compactly
represented and results in less TLB pressure. Furthermore, since the
headers are usually right next to the object and where we're about to
start scanning, we get an additional temporal locality benefit in the
data cache when looking up type metadata. The pointer/scalar data is
now effectively unrolled on-demand, but it's also simpler to unroll than
before; that unrolled data is never written anywhere, and for arrays we
get the benefit of retreading the same data per element, as opposed to
looking it up from scratch for each pointer-word of bitmap. Lastly,
because we no longer have a heap bitmap that spans the entire heap,
there's a flat 1.5% memory use reduction. This is balanced slightly by
some objects possibly being bumped up a size class, but most objects are
not tightly optimized to size class sizes so there's some memory to
spare, making the header basically free in those cases.
See the follow-up CL which turns on this experiment by default for
benchmark results. (CL 538217.)
Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
											
										 
											2022-09-11 04:07:41 +00:00
										 |  |  | 	// Determine if it's a 'small' object that goes into a size-classed span. | 
					
						
							|  |  |  | 	// | 
					
						
							|  |  |  | 	// Note: This comparison looks a little strange, but it exists to smooth out | 
					
						
							|  |  |  | 	// the crossover between the largest size class and large objects that have | 
					
						
							|  |  |  | 	// their own spans. The small window of object sizes between maxSmallSize-mallocHeaderSize | 
					
						
							|  |  |  | 	// and maxSmallSize will be considered large, even though they might fit in | 
					
						
							|  |  |  | 	// a size class. In practice this is completely fine, since the largest small | 
					
						
							|  |  |  | 	// size class has a single object in it already, precisely to make the transition | 
					
						
							|  |  |  | 	// to large objects smooth. | 
					
						
							|  |  |  | 	if size <= maxSmallSize-mallocHeaderSize { | 
					
						
							| 
									
										
										
										
											2016-04-19 19:35:10 -07:00
										 |  |  | 		if noscan && size < maxTinySize { | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 			// Tiny allocator. | 
					
						
							|  |  |  | 			// | 
					
						
							|  |  |  | 			// Tiny allocator combines several tiny allocation requests | 
					
						
							|  |  |  | 			// into a single memory block. The resulting memory block | 
					
						
							|  |  |  | 			// is freed when all subobjects are unreachable. The subobjects | 
					
						
							| 
									
										
										
										
											2016-04-19 19:35:10 -07:00
										 |  |  | 			// must be noscan (don't have pointers), this ensures that | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 			// the amount of potentially wasted memory is bounded. | 
					
						
							|  |  |  | 			// | 
					
						
							|  |  |  | 			// Size of the memory block used for combining (maxTinySize) is tunable. | 
					
						
							|  |  |  | 			// Current setting is 16 bytes, which relates to 2x worst case memory | 
					
						
							|  |  |  | 			// wastage (when all but one subobjects are unreachable). | 
					
						
							|  |  |  | 			// 8 bytes would result in no wastage at all, but provides less | 
					
						
							|  |  |  | 			// opportunities for combining. | 
					
						
							|  |  |  | 			// 32 bytes provides more opportunities for combining, | 
					
						
							|  |  |  | 			// but can lead to 4x worst case wastage. | 
					
						
							|  |  |  | 			// The best case winning is 8x regardless of block size. | 
					
						
							|  |  |  | 			// | 
					
						
							|  |  |  | 			// Objects obtained from tiny allocator must not be freed explicitly. | 
					
						
							|  |  |  | 			// So when an object will be freed explicitly, we ensure that | 
					
						
							|  |  |  | 			// its size >= maxTinySize. | 
					
						
							|  |  |  | 			// | 
					
						
							|  |  |  | 			// SetFinalizer has a special case for objects potentially coming | 
					
						
							|  |  |  | 			// from tiny allocator, it such case it allows to set finalizers | 
					
						
							|  |  |  | 			// for an inner byte of a memory block. | 
					
						
							|  |  |  | 			// | 
					
						
							|  |  |  | 			// The main targets of tiny allocator are small strings and | 
					
						
							|  |  |  | 			// standalone escaping variables. On a json benchmark | 
					
						
							|  |  |  | 			// the allocator reduces number of allocations by ~12% and | 
					
						
							|  |  |  | 			// reduces heap size by ~20%. | 
					
						
							| 
									
										
										
										
											2015-01-14 14:13:55 -05:00
										 |  |  | 			off := c.tinyoffset | 
					
						
							|  |  |  | 			// Align tiny pointer for required (conservative) alignment. | 
					
						
							|  |  |  | 			if size&7 == 0 { | 
					
						
							| 
									
										
										
										
											2019-06-28 16:44:07 +00:00
										 |  |  | 				off = alignUp(off, 8) | 
					
						
							| 
									
										
										
										
											2021-06-16 23:05:44 +00:00
										 |  |  | 			} else if goarch.PtrSize == 4 && size == 12 { | 
					
						
							| 
									
										
										
										
											2020-09-10 21:20:46 +00:00
										 |  |  | 				// Conservatively align 12-byte objects to 8 bytes on 32-bit | 
					
						
							|  |  |  | 				// systems so that objects whose first field is a 64-bit | 
					
						
							|  |  |  | 				// value is aligned to 8 bytes and does not cause a fault on | 
					
						
							|  |  |  | 				// atomic access. See issue 37262. | 
					
						
							|  |  |  | 				// TODO(mknyszek): Remove this workaround if/when issue 36606 | 
					
						
							|  |  |  | 				// is resolved. | 
					
						
							|  |  |  | 				off = alignUp(off, 8) | 
					
						
							| 
									
										
										
										
											2015-01-14 14:13:55 -05:00
										 |  |  | 			} else if size&3 == 0 { | 
					
						
							| 
									
										
										
										
											2019-06-28 16:44:07 +00:00
										 |  |  | 				off = alignUp(off, 4) | 
					
						
							| 
									
										
										
										
											2015-01-14 14:13:55 -05:00
										 |  |  | 			} else if size&1 == 0 { | 
					
						
							| 
									
										
										
										
											2019-06-28 16:44:07 +00:00
										 |  |  | 				off = alignUp(off, 2) | 
					
						
							| 
									
										
										
										
											2015-01-14 14:13:55 -05:00
										 |  |  | 			} | 
					
						
							| 
									
										
										
										
											2015-11-16 15:31:50 -05:00
										 |  |  | 			if off+size <= maxTinySize && c.tiny != 0 { | 
					
						
							| 
									
										
										
										
											2015-01-14 14:13:55 -05:00
										 |  |  | 				// The object fits into existing tiny block. | 
					
						
							| 
									
										
										
										
											2015-11-16 15:31:50 -05:00
										 |  |  | 				x = unsafe.Pointer(c.tiny + off) | 
					
						
							| 
									
										
										
										
											2015-01-14 14:13:55 -05:00
										 |  |  | 				c.tinyoffset = off + size | 
					
						
							| 
									
										
										
										
											2020-08-04 17:29:03 +00:00
										 |  |  | 				c.tinyAllocs++ | 
					
						
							| 
									
										
										
										
											2015-01-16 14:43:38 -05:00
										 |  |  | 				mp.mallocing = 0 | 
					
						
							|  |  |  | 				releasem(mp) | 
					
						
							| 
									
										
										
										
											2015-01-14 14:13:55 -05:00
										 |  |  | 				return x | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 			} | 
					
						
							|  |  |  | 			// Allocate a new maxTinySize block. | 
					
						
							| 
									
										
										
										
											2020-06-05 22:01:25 -04:00
										 |  |  | 			span = c.alloc[tinySpanClass] | 
					
						
							| 
									
										
										
										
											2022-11-11 18:34:18 +00:00
										 |  |  | 			v := nextFreeFast(span) | 
					
						
							| 
									
										
										
										
											2016-03-02 12:15:02 -05:00
										 |  |  | 			if v == 0 { | 
					
						
							| 
									
										
										
										
											2022-11-11 18:34:18 +00:00
										 |  |  | 				v, span, shouldhelpgc = c.nextFree(tinySpanClass) | 
					
						
							| 
									
										
										
										
											2016-03-02 12:15:02 -05:00
										 |  |  | 			} | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 			x = unsafe.Pointer(v) | 
					
						
							|  |  |  | 			(*[2]uint64)(x)[0] = 0 | 
					
						
							|  |  |  | 			(*[2]uint64)(x)[1] = 0 | 
					
						
							|  |  |  | 			// See if we need to replace the existing tiny block with the new one | 
					
						
							|  |  |  | 			// based on amount of remaining free space. | 
					
						
							| 
									
										
										
										
											2021-04-28 23:07:38 -07:00
										 |  |  | 			if !raceenabled && (size < c.tinyoffset || c.tiny == 0) { | 
					
						
							|  |  |  | 				// Note: disabled when race detector is on, see comment near end of this function. | 
					
						
							| 
									
										
										
										
											2015-11-16 15:31:50 -05:00
										 |  |  | 				c.tiny = uintptr(x) | 
					
						
							| 
									
										
										
										
											2015-01-14 14:13:55 -05:00
										 |  |  | 				c.tinyoffset = size | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 			} | 
					
						
							|  |  |  | 			size = maxTinySize | 
					
						
							|  |  |  | 		} else { | 
					
						
							| 
									
										
											  
											
												runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size
classes with allocation headers for objects that contain pointers. The
header consists of a single pointer to a type. All allocations with
headers are treated as implicitly containing one or more instances of
the type in the header.
As the name implies, headers are usually stored as the first word of an
object. There are two additional exceptions to where headers are stored
and how they're used.
Objects smaller than 512 bytes do not have headers. Instead, a heap
bitmap is reserved at the end of spans for objects of this size. A full
word of overhead is too much for these small objects. The bitmap is of
the same format of the old bitmap, minus the noMorePtrs bits which are
unnecessary. All the objects <512 bytes have a bitmap less than a
pointer-word in size, and that was the granularity at which noMorePtrs
could stop scanning early anyway.
Objects that are larger than 32 KiB (which have their own span) have
their headers stored directly in the span, to allow power-of-two-sized
allocations to not spill over into an extra page.
The full implementation is behind GOEXPERIMENT=allocheaders.
The purpose of this change is performance. First and foremost, with
headers we no longer have to unroll pointer/scalar data at allocation
time for most size classes. Small size classes still need some
unrolling, but their bitmaps are small so we can optimize that case
fairly well. Larger objects effectively have their pointer/scalar data
unrolled on-demand from type data, which is much more compactly
represented and results in less TLB pressure. Furthermore, since the
headers are usually right next to the object and where we're about to
start scanning, we get an additional temporal locality benefit in the
data cache when looking up type metadata. The pointer/scalar data is
now effectively unrolled on-demand, but it's also simpler to unroll than
before; that unrolled data is never written anywhere, and for arrays we
get the benefit of retreading the same data per element, as opposed to
looking it up from scratch for each pointer-word of bitmap. Lastly,
because we no longer have a heap bitmap that spans the entire heap,
there's a flat 1.5% memory use reduction. This is balanced slightly by
some objects possibly being bumped up a size class, but most objects are
not tightly optimized to size class sizes so there's some memory to
spare, making the header basically free in those cases.
See the follow-up CL which turns on this experiment by default for
benchmark results. (CL 538217.)
Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
											
										 
											2022-09-11 04:07:41 +00:00
										 |  |  | 			hasHeader := !noscan && !heapBitsInSpan(size) | 
					
						
							|  |  |  | 			if goexperiment.AllocHeaders && hasHeader { | 
					
						
							|  |  |  | 				size += mallocHeaderSize | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
										
										
											2016-06-27 12:23:39 +02:00
										 |  |  | 			var sizeclass uint8 | 
					
						
							|  |  |  | 			if size <= smallSizeMax-8 { | 
					
						
							| 
									
										
										
										
											2020-03-20 11:37:54 -04:00
										 |  |  | 				sizeclass = size_to_class8[divRoundUp(size, smallSizeDiv)] | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 			} else { | 
					
						
							| 
									
										
										
										
											2020-03-20 11:37:54 -04:00
										 |  |  | 				sizeclass = size_to_class128[divRoundUp(size-smallSizeMax, largeSizeDiv)] | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 			} | 
					
						
							|  |  |  | 			size = uintptr(class_to_size[sizeclass]) | 
					
						
							| 
									
										
										
										
											2016-02-09 17:53:07 -05:00
										 |  |  | 			spc := makeSpanClass(sizeclass, noscan) | 
					
						
							| 
									
										
										
										
											2020-06-05 22:01:25 -04:00
										 |  |  | 			span = c.alloc[spc] | 
					
						
							| 
									
										
										
										
											2022-11-11 18:34:18 +00:00
										 |  |  | 			v := nextFreeFast(span) | 
					
						
							| 
									
										
										
										
											2016-03-02 12:15:02 -05:00
										 |  |  | 			if v == 0 { | 
					
						
							| 
									
										
										
										
											2022-11-11 18:34:18 +00:00
										 |  |  | 				v, span, shouldhelpgc = c.nextFree(spc) | 
					
						
							| 
									
										
										
										
											2016-03-02 12:15:02 -05:00
										 |  |  | 			} | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 			x = unsafe.Pointer(v) | 
					
						
							| 
									
										
											  
											
												[dev.garbage] runtime: reintroduce no-zeroing optimization
Currently we always zero objects when we allocate them. We used to
have an optimization that would not zero objects that had not been
allocated since the whole span was last zeroed (either by getting it
from the system or by getting it from the heap, which does a bulk
zero), but this depended on the sweeper clobbering the first two words
of each object. Hence, we lost this optimization when the bitmap
sweeper went away.
Re-introduce this optimization using a different mechanism. Each span
already keeps a flag indicating that it just came from the OS or was
just bulk zeroed by the mheap. We can simply use this flag to know
when we don't need to zero an object. This is slightly less efficient
than the old optimization: if a span gets allocated and partially
used, then GC happens and the span gets returned to the mcentral, then
the span gets re-acquired, the old optimization knew that it only had
to re-zero the objects that had been reclaimed, whereas this
optimization will re-zero everything. However, in this case, you're
already paying for the garbage collection, and you've only wasted one
zeroing of the span, so in practice there seems to be little
difference. (If we did want to revive the full optimization, each span
could keep track of a frontier beyond which all free slots are zeroed.
I prototyped this and it didn't obvious do any better than the much
simpler approach in this commit.)
This significantly improves BinaryTree17, which is allocation-heavy
(and runs first, so most pages are already zeroed), and slightly
improves everything else.
name              old time/op  new time/op  delta
XBenchGarbage-12  2.15ms ± 1%  2.14ms ± 1%  -0.80%  (p=0.000 n=17+17)
name                      old time/op    new time/op    delta
BinaryTree17-12              2.71s ± 1%     2.56s ± 1%  -5.73%        (p=0.000 n=18+19)
DivconstI64-12              1.70ns ± 1%    1.70ns ± 1%    ~           (p=0.562 n=18+18)
DivconstU64-12              1.74ns ± 2%    1.74ns ± 1%    ~           (p=0.394 n=20+20)
DivconstI32-12              1.74ns ± 0%    1.74ns ± 0%    ~     (all samples are equal)
DivconstU32-12              1.66ns ± 1%    1.66ns ± 0%    ~           (p=0.516 n=15+16)
DivconstI16-12              1.84ns ± 0%    1.84ns ± 0%    ~     (all samples are equal)
DivconstU16-12              1.82ns ± 0%    1.82ns ± 0%    ~     (all samples are equal)
DivconstI8-12               1.79ns ± 0%    1.79ns ± 0%    ~     (all samples are equal)
DivconstU8-12               1.60ns ± 0%    1.60ns ± 1%    ~           (p=0.603 n=17+19)
Fannkuch11-12                2.11s ± 1%     2.11s ± 0%    ~           (p=0.333 n=16+19)
FmtFprintfEmpty-12          45.1ns ± 4%    45.4ns ± 5%    ~           (p=0.111 n=20+20)
FmtFprintfString-12          134ns ± 0%     129ns ± 0%  -3.45%        (p=0.000 n=18+16)
FmtFprintfInt-12             131ns ± 1%     129ns ± 1%  -1.54%        (p=0.000 n=16+18)
FmtFprintfIntInt-12          205ns ± 2%     203ns ± 0%  -0.56%        (p=0.014 n=20+18)
FmtFprintfPrefixedInt-12     200ns ± 2%     197ns ± 1%  -1.48%        (p=0.000 n=20+18)
FmtFprintfFloat-12           256ns ± 1%     256ns ± 0%  -0.21%        (p=0.008 n=18+20)
FmtManyArgs-12               805ns ± 0%     804ns ± 0%  -0.19%        (p=0.001 n=18+18)
GobDecode-12                7.21ms ± 1%    7.14ms ± 1%  -0.92%        (p=0.000 n=19+20)
GobEncode-12                5.88ms ± 1%    5.88ms ± 1%    ~           (p=0.641 n=18+19)
Gzip-12                      218ms ± 1%     218ms ± 1%    ~           (p=0.271 n=19+18)
Gunzip-12                   37.1ms ± 0%    36.9ms ± 0%  -0.29%        (p=0.000 n=18+17)
HTTPClientServer-12         78.1µs ± 2%    77.4µs ± 2%    ~           (p=0.070 n=19+19)
JSONEncode-12               15.5ms ± 1%    15.5ms ± 0%    ~           (p=0.063 n=20+18)
JSONDecode-12               56.1ms ± 0%    55.4ms ± 1%  -1.18%        (p=0.000 n=19+18)
Mandelbrot200-12            4.05ms ± 0%    4.06ms ± 0%  +0.29%        (p=0.001 n=18+18)
GoParse-12                  3.28ms ± 1%    3.21ms ± 1%  -2.30%        (p=0.000 n=20+20)
RegexpMatchEasy0_32-12      69.4ns ± 2%    69.3ns ± 1%    ~           (p=0.205 n=18+16)
RegexpMatchEasy0_1K-12       239ns ± 0%     239ns ± 0%    ~     (all samples are equal)
RegexpMatchEasy1_32-12      69.4ns ± 1%    69.4ns ± 1%    ~           (p=0.620 n=15+18)
RegexpMatchEasy1_1K-12       370ns ± 1%     369ns ± 2%    ~           (p=0.088 n=20+20)
RegexpMatchMedium_32-12      108ns ± 0%     108ns ± 0%    ~     (all samples are equal)
RegexpMatchMedium_1K-12     33.6µs ± 3%    33.5µs ± 3%    ~           (p=0.718 n=20+20)
RegexpMatchHard_32-12       1.68µs ± 1%    1.67µs ± 2%    ~           (p=0.316 n=20+20)
RegexpMatchHard_1K-12       50.5µs ± 3%    50.4µs ± 3%    ~           (p=0.659 n=20+20)
Revcomp-12                   381ms ± 1%     381ms ± 1%    ~           (p=0.916 n=19+18)
Template-12                 66.5ms ± 1%    65.8ms ± 2%  -1.08%        (p=0.000 n=20+20)
TimeParse-12                 317ns ± 0%     319ns ± 0%  +0.48%        (p=0.000 n=19+12)
TimeFormat-12                338ns ± 0%     338ns ± 0%    ~           (p=0.124 n=19+18)
[Geo mean]                  5.99µs         5.96µs       -0.54%
Change-Id: I638ffd9d9f178835bbfa499bac20bd7224f1a907
Reviewed-on: https://go-review.googlesource.com/22591
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2016-04-28 15:32:01 -04:00
										 |  |  | 			if needzero && span.needzero != 0 { | 
					
						
							| 
									
										
										
										
											2022-10-16 20:00:42 +10:30
										 |  |  | 				memclrNoHeapPointers(x, size) | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 			} | 
					
						
							| 
									
										
											  
											
												runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size
classes with allocation headers for objects that contain pointers. The
header consists of a single pointer to a type. All allocations with
headers are treated as implicitly containing one or more instances of
the type in the header.
As the name implies, headers are usually stored as the first word of an
object. There are two additional exceptions to where headers are stored
and how they're used.
Objects smaller than 512 bytes do not have headers. Instead, a heap
bitmap is reserved at the end of spans for objects of this size. A full
word of overhead is too much for these small objects. The bitmap is of
the same format of the old bitmap, minus the noMorePtrs bits which are
unnecessary. All the objects <512 bytes have a bitmap less than a
pointer-word in size, and that was the granularity at which noMorePtrs
could stop scanning early anyway.
Objects that are larger than 32 KiB (which have their own span) have
their headers stored directly in the span, to allow power-of-two-sized
allocations to not spill over into an extra page.
The full implementation is behind GOEXPERIMENT=allocheaders.
The purpose of this change is performance. First and foremost, with
headers we no longer have to unroll pointer/scalar data at allocation
time for most size classes. Small size classes still need some
unrolling, but their bitmaps are small so we can optimize that case
fairly well. Larger objects effectively have their pointer/scalar data
unrolled on-demand from type data, which is much more compactly
represented and results in less TLB pressure. Furthermore, since the
headers are usually right next to the object and where we're about to
start scanning, we get an additional temporal locality benefit in the
data cache when looking up type metadata. The pointer/scalar data is
now effectively unrolled on-demand, but it's also simpler to unroll than
before; that unrolled data is never written anywhere, and for arrays we
get the benefit of retreading the same data per element, as opposed to
looking it up from scratch for each pointer-word of bitmap. Lastly,
because we no longer have a heap bitmap that spans the entire heap,
there's a flat 1.5% memory use reduction. This is balanced slightly by
some objects possibly being bumped up a size class, but most objects are
not tightly optimized to size class sizes so there's some memory to
spare, making the header basically free in those cases.
See the follow-up CL which turns on this experiment by default for
benchmark results. (CL 538217.)
Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
											
										 
											2022-09-11 04:07:41 +00:00
										 |  |  | 			if goexperiment.AllocHeaders && hasHeader { | 
					
						
							| 
									
										
										
										
											2023-11-16 17:42:25 +00:00
										 |  |  | 				header = (**_type)(x) | 
					
						
							|  |  |  | 				x = add(x, mallocHeaderSize) | 
					
						
							| 
									
										
											  
											
												runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size
classes with allocation headers for objects that contain pointers. The
header consists of a single pointer to a type. All allocations with
headers are treated as implicitly containing one or more instances of
the type in the header.
As the name implies, headers are usually stored as the first word of an
object. There are two additional exceptions to where headers are stored
and how they're used.
Objects smaller than 512 bytes do not have headers. Instead, a heap
bitmap is reserved at the end of spans for objects of this size. A full
word of overhead is too much for these small objects. The bitmap is of
the same format of the old bitmap, minus the noMorePtrs bits which are
unnecessary. All the objects <512 bytes have a bitmap less than a
pointer-word in size, and that was the granularity at which noMorePtrs
could stop scanning early anyway.
Objects that are larger than 32 KiB (which have their own span) have
their headers stored directly in the span, to allow power-of-two-sized
allocations to not spill over into an extra page.
The full implementation is behind GOEXPERIMENT=allocheaders.
The purpose of this change is performance. First and foremost, with
headers we no longer have to unroll pointer/scalar data at allocation
time for most size classes. Small size classes still need some
unrolling, but their bitmaps are small so we can optimize that case
fairly well. Larger objects effectively have their pointer/scalar data
unrolled on-demand from type data, which is much more compactly
represented and results in less TLB pressure. Furthermore, since the
headers are usually right next to the object and where we're about to
start scanning, we get an additional temporal locality benefit in the
data cache when looking up type metadata. The pointer/scalar data is
now effectively unrolled on-demand, but it's also simpler to unroll than
before; that unrolled data is never written anywhere, and for arrays we
get the benefit of retreading the same data per element, as opposed to
looking it up from scratch for each pointer-word of bitmap. Lastly,
because we no longer have a heap bitmap that spans the entire heap,
there's a flat 1.5% memory use reduction. This is balanced slightly by
some objects possibly being bumped up a size class, but most objects are
not tightly optimized to size class sizes so there's some memory to
spare, making the header basically free in those cases.
See the follow-up CL which turns on this experiment by default for
benchmark results. (CL 538217.)
Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
											
										 
											2022-09-11 04:07:41 +00:00
										 |  |  | 				size -= mallocHeaderSize | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 		} | 
					
						
							|  |  |  | 	} else { | 
					
						
							| 
									
										
										
										
											2015-01-06 14:58:49 -05:00
										 |  |  | 		shouldhelpgc = true | 
					
						
							| 
									
										
										
										
											2020-11-17 19:54:31 -05:00
										 |  |  | 		// For large allocations, keep track of zeroed state so that | 
					
						
							|  |  |  | 		// bulk zeroing can be happen later in a preemptible context. | 
					
						
							| 
									
										
											  
											
												runtime: clean up allocation zeroing
Currently, the runtime zeroes allocations in several ways. First, small
object spans are always zeroed if they come from mheap, and their slots
are zeroed later in mallocgc if needed. Second, large object spans
(objects that have their own spans) plumb the need for zeroing down into
mheap. Thirdly, large objects that have no pointers have their zeroing
delayed until after preemption is reenabled, but before returning in
mallocgc.
All of this has two consequences:
1. Spans for small objects that come from mheap are sometimes
   unnecessarily zeroed, even if the mallocgc call that created them
   doesn't need the object slot to be zeroed.
2. This is all messy and difficult to reason about.
This CL simplifies this code, resolving both (1) and (2). First, it
recognizes that zeroing in mheap is unnecessary for small object spans;
mallocgc and its callees in mcache and mcentral, by design, are *always*
able to deal with non-zeroed spans. They must, for they deal with
recycled spans all the time. Once this fact is made clear, the only
remaining use of zeroing in mheap is for large objects.
As a result, this CL lifts mheap zeroing for large objects into
mallocgc, to parallel all the other codepaths in mallocgc. This is makes
the large object allocation code less surprising.
Next, this CL sets the flag for the delayed zeroing explicitly in the one
case where it matters, and inverts and renames the flag from isZeroed to
delayZeroing.
Finally, it adds a check to make sure that only pointer-free allocations
take the delayed zeroing codepath, as an extra safety measure.
Benchmark results: https://perf.golang.org/search?q=upload:20211028.8
Inspired by tapir.liu@gmail.com's CL 343470.
Change-Id: I7e1296adc19ce8a02c8d93a0a5082aefb2673e8f
Reviewed-on: https://go-review.googlesource.com/c/go/+/359477
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: David Chase <drchase@google.com>
											
										 
											2021-10-28 17:52:22 +00:00
										 |  |  | 		span = c.allocLarge(size, noscan) | 
					
						
							| 
									
										
										
										
											2022-11-11 18:34:18 +00:00
										 |  |  | 		span.freeindex = 1 | 
					
						
							| 
									
										
										
										
											2020-06-05 22:01:25 -04:00
										 |  |  | 		span.allocCount = 1 | 
					
						
							|  |  |  | 		size = span.elemsize | 
					
						
							| 
									
										
											  
											
												runtime: clean up allocation zeroing
Currently, the runtime zeroes allocations in several ways. First, small
object spans are always zeroed if they come from mheap, and their slots
are zeroed later in mallocgc if needed. Second, large object spans
(objects that have their own spans) plumb the need for zeroing down into
mheap. Thirdly, large objects that have no pointers have their zeroing
delayed until after preemption is reenabled, but before returning in
mallocgc.
All of this has two consequences:
1. Spans for small objects that come from mheap are sometimes
   unnecessarily zeroed, even if the mallocgc call that created them
   doesn't need the object slot to be zeroed.
2. This is all messy and difficult to reason about.
This CL simplifies this code, resolving both (1) and (2). First, it
recognizes that zeroing in mheap is unnecessary for small object spans;
mallocgc and its callees in mcache and mcentral, by design, are *always*
able to deal with non-zeroed spans. They must, for they deal with
recycled spans all the time. Once this fact is made clear, the only
remaining use of zeroing in mheap is for large objects.
As a result, this CL lifts mheap zeroing for large objects into
mallocgc, to parallel all the other codepaths in mallocgc. This is makes
the large object allocation code less surprising.
Next, this CL sets the flag for the delayed zeroing explicitly in the one
case where it matters, and inverts and renames the flag from isZeroed to
delayZeroing.
Finally, it adds a check to make sure that only pointer-free allocations
take the delayed zeroing codepath, as an extra safety measure.
Benchmark results: https://perf.golang.org/search?q=upload:20211028.8
Inspired by tapir.liu@gmail.com's CL 343470.
Change-Id: I7e1296adc19ce8a02c8d93a0a5082aefb2673e8f
Reviewed-on: https://go-review.googlesource.com/c/go/+/359477
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: David Chase <drchase@google.com>
											
										 
											2021-10-28 17:52:22 +00:00
										 |  |  | 		x = unsafe.Pointer(span.base()) | 
					
						
							|  |  |  | 		if needzero && span.needzero != 0 { | 
					
						
							|  |  |  | 			if noscan { | 
					
						
							|  |  |  | 				delayedZeroing = true | 
					
						
							|  |  |  | 			} else { | 
					
						
							|  |  |  | 				memclrNoHeapPointers(x, size) | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
											  
											
												runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size
classes with allocation headers for objects that contain pointers. The
header consists of a single pointer to a type. All allocations with
headers are treated as implicitly containing one or more instances of
the type in the header.
As the name implies, headers are usually stored as the first word of an
object. There are two additional exceptions to where headers are stored
and how they're used.
Objects smaller than 512 bytes do not have headers. Instead, a heap
bitmap is reserved at the end of spans for objects of this size. A full
word of overhead is too much for these small objects. The bitmap is of
the same format of the old bitmap, minus the noMorePtrs bits which are
unnecessary. All the objects <512 bytes have a bitmap less than a
pointer-word in size, and that was the granularity at which noMorePtrs
could stop scanning early anyway.
Objects that are larger than 32 KiB (which have their own span) have
their headers stored directly in the span, to allow power-of-two-sized
allocations to not spill over into an extra page.
The full implementation is behind GOEXPERIMENT=allocheaders.
The purpose of this change is performance. First and foremost, with
headers we no longer have to unroll pointer/scalar data at allocation
time for most size classes. Small size classes still need some
unrolling, but their bitmaps are small so we can optimize that case
fairly well. Larger objects effectively have their pointer/scalar data
unrolled on-demand from type data, which is much more compactly
represented and results in less TLB pressure. Furthermore, since the
headers are usually right next to the object and where we're about to
start scanning, we get an additional temporal locality benefit in the
data cache when looking up type metadata. The pointer/scalar data is
now effectively unrolled on-demand, but it's also simpler to unroll than
before; that unrolled data is never written anywhere, and for arrays we
get the benefit of retreading the same data per element, as opposed to
looking it up from scratch for each pointer-word of bitmap. Lastly,
because we no longer have a heap bitmap that spans the entire heap,
there's a flat 1.5% memory use reduction. This is balanced slightly by
some objects possibly being bumped up a size class, but most objects are
not tightly optimized to size class sizes so there's some memory to
spare, making the header basically free in those cases.
See the follow-up CL which turns on this experiment by default for
benchmark results. (CL 538217.)
Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
											
										 
											2022-09-11 04:07:41 +00:00
										 |  |  | 		if goexperiment.AllocHeaders && !noscan { | 
					
						
							|  |  |  | 			header = &span.largeType | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2016-06-02 11:09:20 -04:00
										 |  |  | 	if !noscan { | 
					
						
							| 
									
										
											  
											
												runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size
classes with allocation headers for objects that contain pointers. The
header consists of a single pointer to a type. All allocations with
headers are treated as implicitly containing one or more instances of
the type in the header.
As the name implies, headers are usually stored as the first word of an
object. There are two additional exceptions to where headers are stored
and how they're used.
Objects smaller than 512 bytes do not have headers. Instead, a heap
bitmap is reserved at the end of spans for objects of this size. A full
word of overhead is too much for these small objects. The bitmap is of
the same format of the old bitmap, minus the noMorePtrs bits which are
unnecessary. All the objects <512 bytes have a bitmap less than a
pointer-word in size, and that was the granularity at which noMorePtrs
could stop scanning early anyway.
Objects that are larger than 32 KiB (which have their own span) have
their headers stored directly in the span, to allow power-of-two-sized
allocations to not spill over into an extra page.
The full implementation is behind GOEXPERIMENT=allocheaders.
The purpose of this change is performance. First and foremost, with
headers we no longer have to unroll pointer/scalar data at allocation
time for most size classes. Small size classes still need some
unrolling, but their bitmaps are small so we can optimize that case
fairly well. Larger objects effectively have their pointer/scalar data
unrolled on-demand from type data, which is much more compactly
represented and results in less TLB pressure. Furthermore, since the
headers are usually right next to the object and where we're about to
start scanning, we get an additional temporal locality benefit in the
data cache when looking up type metadata. The pointer/scalar data is
now effectively unrolled on-demand, but it's also simpler to unroll than
before; that unrolled data is never written anywhere, and for arrays we
get the benefit of retreading the same data per element, as opposed to
looking it up from scratch for each pointer-word of bitmap. Lastly,
because we no longer have a heap bitmap that spans the entire heap,
there's a flat 1.5% memory use reduction. This is balanced slightly by
some objects possibly being bumped up a size class, but most objects are
not tightly optimized to size class sizes so there's some memory to
spare, making the header basically free in those cases.
See the follow-up CL which turns on this experiment by default for
benchmark results. (CL 538217.)
Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
											
										 
											2022-09-11 04:07:41 +00:00
										 |  |  | 		if goexperiment.AllocHeaders { | 
					
						
							|  |  |  | 			c.scanAlloc += heapSetType(uintptr(x), dataSize, typ, header, span) | 
					
						
							| 
									
										
										
										
											2015-05-04 16:10:49 -04:00
										 |  |  | 		} else { | 
					
						
							| 
									
										
											  
											
												runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size
classes with allocation headers for objects that contain pointers. The
header consists of a single pointer to a type. All allocations with
headers are treated as implicitly containing one or more instances of
the type in the header.
As the name implies, headers are usually stored as the first word of an
object. There are two additional exceptions to where headers are stored
and how they're used.
Objects smaller than 512 bytes do not have headers. Instead, a heap
bitmap is reserved at the end of spans for objects of this size. A full
word of overhead is too much for these small objects. The bitmap is of
the same format of the old bitmap, minus the noMorePtrs bits which are
unnecessary. All the objects <512 bytes have a bitmap less than a
pointer-word in size, and that was the granularity at which noMorePtrs
could stop scanning early anyway.
Objects that are larger than 32 KiB (which have their own span) have
their headers stored directly in the span, to allow power-of-two-sized
allocations to not spill over into an extra page.
The full implementation is behind GOEXPERIMENT=allocheaders.
The purpose of this change is performance. First and foremost, with
headers we no longer have to unroll pointer/scalar data at allocation
time for most size classes. Small size classes still need some
unrolling, but their bitmaps are small so we can optimize that case
fairly well. Larger objects effectively have their pointer/scalar data
unrolled on-demand from type data, which is much more compactly
represented and results in less TLB pressure. Furthermore, since the
headers are usually right next to the object and where we're about to
start scanning, we get an additional temporal locality benefit in the
data cache when looking up type metadata. The pointer/scalar data is
now effectively unrolled on-demand, but it's also simpler to unroll than
before; that unrolled data is never written anywhere, and for arrays we
get the benefit of retreading the same data per element, as opposed to
looking it up from scratch for each pointer-word of bitmap. Lastly,
because we no longer have a heap bitmap that spans the entire heap,
there's a flat 1.5% memory use reduction. This is balanced slightly by
some objects possibly being bumped up a size class, but most objects are
not tightly optimized to size class sizes so there's some memory to
spare, making the header basically free in those cases.
See the follow-up CL which turns on this experiment by default for
benchmark results. (CL 538217.)
Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
											
										 
											2022-09-11 04:07:41 +00:00
										 |  |  | 			var scanSize uintptr | 
					
						
							|  |  |  | 			heapBitsSetType(uintptr(x), size, dataSize, typ) | 
					
						
							|  |  |  | 			if dataSize > typ.Size_ { | 
					
						
							|  |  |  | 				// Array allocation. If there are any | 
					
						
							|  |  |  | 				// pointers, GC has to scan to the last | 
					
						
							|  |  |  | 				// element. | 
					
						
							|  |  |  | 				if typ.PtrBytes != 0 { | 
					
						
							|  |  |  | 					scanSize = dataSize - typ.Size_ + typ.PtrBytes | 
					
						
							|  |  |  | 				} | 
					
						
							|  |  |  | 			} else { | 
					
						
							|  |  |  | 				scanSize = typ.PtrBytes | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 			c.scanAlloc += scanSize | 
					
						
							| 
									
										
										
										
											2015-05-04 16:10:49 -04:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2014-08-07 13:34:30 +04:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2014-11-04 13:31:34 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-05-11 14:57:33 -04:00
										 |  |  | 	// Ensure that the stores above that initialize x to | 
					
						
							|  |  |  | 	// type-safe memory and set the heap bits occur before | 
					
						
							|  |  |  | 	// the caller can make x observable to the garbage | 
					
						
							|  |  |  | 	// collector. Otherwise, on weakly ordered machines, | 
					
						
							|  |  |  | 	// the garbage collector could follow a pointer to x, | 
					
						
							|  |  |  | 	// but see uninitialized memory or stale heap bits. | 
					
						
							|  |  |  | 	publicationBarrier() | 
					
						
							| 
									
										
										
										
											2022-11-09 10:55:54 -05:00
										 |  |  | 	// As x and the heap bits are initialized, update | 
					
						
							|  |  |  | 	// freeIndexForScan now so x is seen by the GC | 
					
						
							| 
									
										
										
										
											2023-06-13 23:01:11 +00:00
										 |  |  | 	// (including conservative scan) as an allocated object. | 
					
						
							| 
									
										
										
										
											2022-11-09 10:55:54 -05:00
										 |  |  | 	// While this pointer can't escape into user code as a | 
					
						
							|  |  |  | 	// _live_ pointer until we return, conservative scanning | 
					
						
							|  |  |  | 	// may find a dead pointer that happens to point into this | 
					
						
							|  |  |  | 	// object. Delaying this update until now ensures that | 
					
						
							|  |  |  | 	// conservative scanning considers this pointer dead until | 
					
						
							|  |  |  | 	// this point. | 
					
						
							|  |  |  | 	span.freeIndexForScan = span.freeindex | 
					
						
							| 
									
										
										
										
											2016-05-11 14:57:33 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-03-30 17:02:23 -04:00
										 |  |  | 	// Allocate black during GC. | 
					
						
							| 
									
										
										
										
											2014-11-04 13:31:34 -05:00
										 |  |  | 	// All slots hold nil so no scanning is needed. | 
					
						
							|  |  |  | 	// This may be racing with GC so do it atomically if there can be | 
					
						
							|  |  |  | 	// a race marking the bit. | 
					
						
							| 
									
										
										
										
											2016-03-30 17:02:23 -04:00
										 |  |  | 	if gcphase != _GCoff { | 
					
						
							| 
									
										
											  
											
												runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size
classes with allocation headers for objects that contain pointers. The
header consists of a single pointer to a type. All allocations with
headers are treated as implicitly containing one or more instances of
the type in the header.
As the name implies, headers are usually stored as the first word of an
object. There are two additional exceptions to where headers are stored
and how they're used.
Objects smaller than 512 bytes do not have headers. Instead, a heap
bitmap is reserved at the end of spans for objects of this size. A full
word of overhead is too much for these small objects. The bitmap is of
the same format of the old bitmap, minus the noMorePtrs bits which are
unnecessary. All the objects <512 bytes have a bitmap less than a
pointer-word in size, and that was the granularity at which noMorePtrs
could stop scanning early anyway.
Objects that are larger than 32 KiB (which have their own span) have
their headers stored directly in the span, to allow power-of-two-sized
allocations to not spill over into an extra page.
The full implementation is behind GOEXPERIMENT=allocheaders.
The purpose of this change is performance. First and foremost, with
headers we no longer have to unroll pointer/scalar data at allocation
time for most size classes. Small size classes still need some
unrolling, but their bitmaps are small so we can optimize that case
fairly well. Larger objects effectively have their pointer/scalar data
unrolled on-demand from type data, which is much more compactly
represented and results in less TLB pressure. Furthermore, since the
headers are usually right next to the object and where we're about to
start scanning, we get an additional temporal locality benefit in the
data cache when looking up type metadata. The pointer/scalar data is
now effectively unrolled on-demand, but it's also simpler to unroll than
before; that unrolled data is never written anywhere, and for arrays we
get the benefit of retreading the same data per element, as opposed to
looking it up from scratch for each pointer-word of bitmap. Lastly,
because we no longer have a heap bitmap that spans the entire heap,
there's a flat 1.5% memory use reduction. This is balanced slightly by
some objects possibly being bumped up a size class, but most objects are
not tightly optimized to size class sizes so there's some memory to
spare, making the header basically free in those cases.
See the follow-up CL which turns on this experiment by default for
benchmark results. (CL 538217.)
Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
											
										 
											2022-09-11 04:07:41 +00:00
										 |  |  | 		// Pass the full size of the allocation to the number of bytes | 
					
						
							|  |  |  | 		// marked. | 
					
						
							|  |  |  | 		// | 
					
						
							|  |  |  | 		// If !goexperiment.AllocHeaders, "size" doesn't include the | 
					
						
							|  |  |  | 		// allocation header, so use span.elemsize unconditionally. | 
					
						
							|  |  |  | 		gcmarknewobject(span, uintptr(x), span.elemsize) | 
					
						
							| 
									
										
										
										
											2014-11-04 13:31:34 -05:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 	if raceenabled { | 
					
						
							|  |  |  | 		racemalloc(x, size) | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2016-03-02 12:15:02 -05:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-10-21 11:04:42 -07:00
										 |  |  | 	if msanenabled { | 
					
						
							|  |  |  | 		msanmalloc(x, size) | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2014-08-18 16:33:39 +04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-01-05 17:52:43 +08:00
										 |  |  | 	if asanenabled { | 
					
						
							|  |  |  | 		// We should only read/write the memory with the size asked by the user. | 
					
						
							|  |  |  | 		// The rest of the allocated memory should be poisoned, so that we can report | 
					
						
							|  |  |  | 		// errors when accessing poisoned memory. | 
					
						
							|  |  |  | 		// The allocated memory is larger than required userSize, it will also include | 
					
						
							|  |  |  | 		// redzone and some other padding bytes. | 
					
						
							|  |  |  | 		rzBeg := unsafe.Add(x, userSize) | 
					
						
							|  |  |  | 		asanpoison(rzBeg, size-userSize) | 
					
						
							|  |  |  | 		asanunpoison(x, userSize) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-07-21 18:38:05 -04:00
										 |  |  | 	if rate := MemProfileRate; rate > 0 { | 
					
						
							|  |  |  | 		// Note cache c only valid while m acquired; see #47302 | 
					
						
							|  |  |  | 		if rate != 1 && size < c.nextSample { | 
					
						
							|  |  |  | 			c.nextSample -= size | 
					
						
							|  |  |  | 		} else { | 
					
						
							|  |  |  | 			profilealloc(mp, x, size) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2015-01-16 14:43:38 -05:00
										 |  |  | 	mp.mallocing = 0 | 
					
						
							|  |  |  | 	releasem(mp) | 
					
						
							| 
									
										
										
										
											2014-08-18 16:33:39 +04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-17 19:54:31 -05:00
										 |  |  | 	// Pointerfree data can be zeroed late in a context where preemption can occur. | 
					
						
							|  |  |  | 	// x will keep the memory alive. | 
					
						
							| 
									
										
											  
											
												runtime: clean up allocation zeroing
Currently, the runtime zeroes allocations in several ways. First, small
object spans are always zeroed if they come from mheap, and their slots
are zeroed later in mallocgc if needed. Second, large object spans
(objects that have their own spans) plumb the need for zeroing down into
mheap. Thirdly, large objects that have no pointers have their zeroing
delayed until after preemption is reenabled, but before returning in
mallocgc.
All of this has two consequences:
1. Spans for small objects that come from mheap are sometimes
   unnecessarily zeroed, even if the mallocgc call that created them
   doesn't need the object slot to be zeroed.
2. This is all messy and difficult to reason about.
This CL simplifies this code, resolving both (1) and (2). First, it
recognizes that zeroing in mheap is unnecessary for small object spans;
mallocgc and its callees in mcache and mcentral, by design, are *always*
able to deal with non-zeroed spans. They must, for they deal with
recycled spans all the time. Once this fact is made clear, the only
remaining use of zeroing in mheap is for large objects.
As a result, this CL lifts mheap zeroing for large objects into
mallocgc, to parallel all the other codepaths in mallocgc. This is makes
the large object allocation code less surprising.
Next, this CL sets the flag for the delayed zeroing explicitly in the one
case where it matters, and inverts and renames the flag from isZeroed to
delayZeroing.
Finally, it adds a check to make sure that only pointer-free allocations
take the delayed zeroing codepath, as an extra safety measure.
Benchmark results: https://perf.golang.org/search?q=upload:20211028.8
Inspired by tapir.liu@gmail.com's CL 343470.
Change-Id: I7e1296adc19ce8a02c8d93a0a5082aefb2673e8f
Reviewed-on: https://go-review.googlesource.com/c/go/+/359477
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: David Chase <drchase@google.com>
											
										 
											2021-10-28 17:52:22 +00:00
										 |  |  | 	if delayedZeroing { | 
					
						
							|  |  |  | 		if !noscan { | 
					
						
							|  |  |  | 			throw("delayed zeroing on data that may contain pointers") | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
											  
											
												runtime: implement experiment to replace heap bitmap with alloc headers
This change replaces the 1-bit-per-word heap bitmap for most size
classes with allocation headers for objects that contain pointers. The
header consists of a single pointer to a type. All allocations with
headers are treated as implicitly containing one or more instances of
the type in the header.
As the name implies, headers are usually stored as the first word of an
object. There are two additional exceptions to where headers are stored
and how they're used.
Objects smaller than 512 bytes do not have headers. Instead, a heap
bitmap is reserved at the end of spans for objects of this size. A full
word of overhead is too much for these small objects. The bitmap is of
the same format of the old bitmap, minus the noMorePtrs bits which are
unnecessary. All the objects <512 bytes have a bitmap less than a
pointer-word in size, and that was the granularity at which noMorePtrs
could stop scanning early anyway.
Objects that are larger than 32 KiB (which have their own span) have
their headers stored directly in the span, to allow power-of-two-sized
allocations to not spill over into an extra page.
The full implementation is behind GOEXPERIMENT=allocheaders.
The purpose of this change is performance. First and foremost, with
headers we no longer have to unroll pointer/scalar data at allocation
time for most size classes. Small size classes still need some
unrolling, but their bitmaps are small so we can optimize that case
fairly well. Larger objects effectively have their pointer/scalar data
unrolled on-demand from type data, which is much more compactly
represented and results in less TLB pressure. Furthermore, since the
headers are usually right next to the object and where we're about to
start scanning, we get an additional temporal locality benefit in the
data cache when looking up type metadata. The pointer/scalar data is
now effectively unrolled on-demand, but it's also simpler to unroll than
before; that unrolled data is never written anywhere, and for arrays we
get the benefit of retreading the same data per element, as opposed to
looking it up from scratch for each pointer-word of bitmap. Lastly,
because we no longer have a heap bitmap that spans the entire heap,
there's a flat 1.5% memory use reduction. This is balanced slightly by
some objects possibly being bumped up a size class, but most objects are
not tightly optimized to size class sizes so there's some memory to
spare, making the header basically free in those cases.
See the follow-up CL which turns on this experiment by default for
benchmark results. (CL 538217.)
Change-Id: I4c9034ee200650d06d8bdecd579d5f7c1bbf1fc5
Reviewed-on: https://go-review.googlesource.com/c/go/+/437955
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
											
										 
											2022-09-11 04:07:41 +00:00
										 |  |  | 		if goexperiment.AllocHeaders && header != nil { | 
					
						
							|  |  |  | 			throw("unexpected malloc header in delayed zeroing of large object") | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2021-07-21 18:38:05 -04:00
										 |  |  | 		memclrNoHeapPointersChunked(size, x) // This is a possible preemption point: see #47302 | 
					
						
							| 
									
										
										
										
											2020-11-17 19:54:31 -05:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: implement GODEBUG=inittrace=1 support
Setting inittrace=1 causes the runtime to emit a single line to standard error for
each package with init work, summarizing the execution time and memory allocation.
The emitted debug information for init functions can be used to find bottlenecks
or regressions in Go startup performance.
Packages with no init function work (user defined or compiler generated) are omitted.
Tracing plugin inits is not supported as they can execute concurrently. This would
make the implementation of tracing more complex while adding support for a very rare
use case. Plugin inits can be traced separately by testing a main package importing
the plugins package imports explicitly.
$ GODEBUG=inittrace=1 go test
init internal/bytealg @0.008 ms, 0 ms clock, 0 bytes, 0 allocs
init runtime @0.059 ms, 0.026 ms clock, 0 bytes, 0 allocs
init math @0.19 ms, 0.001 ms clock, 0 bytes, 0 allocs
init errors @0.22 ms, 0.004 ms clock, 0 bytes, 0 allocs
init strconv @0.24 ms, 0.002 ms clock, 32 bytes, 2 allocs
init sync @0.28 ms, 0.003 ms clock, 16 bytes, 1 allocs
init unicode @0.44 ms, 0.11 ms clock, 23328 bytes, 24 allocs
...
Inspired by stapelberg@google.com who instrumented doInit
in a prototype to measure init times with GDB.
Fixes #41378
Change-Id: Ic37c6a0cfc95488de9e737f5e346b8dbb39174e1
Reviewed-on: https://go-review.googlesource.com/c/go/+/254659
Trust: Martin Möhrmann <moehrmann@google.com>
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2020-09-14 16:55:34 +02:00
										 |  |  | 	if debug.malloc { | 
					
						
							|  |  |  | 		if debug.allocfreetrace != 0 { | 
					
						
							|  |  |  | 			tracealloc(x, size, typ) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		if inittrace.active && inittrace.id == getg().goid { | 
					
						
							| 
									
										
										
										
											2021-03-29 18:51:06 +08:00
										 |  |  | 			// Init functions are executed sequentially in a single goroutine. | 
					
						
							| 
									
										
											  
											
												runtime: implement GODEBUG=inittrace=1 support
Setting inittrace=1 causes the runtime to emit a single line to standard error for
each package with init work, summarizing the execution time and memory allocation.
The emitted debug information for init functions can be used to find bottlenecks
or regressions in Go startup performance.
Packages with no init function work (user defined or compiler generated) are omitted.
Tracing plugin inits is not supported as they can execute concurrently. This would
make the implementation of tracing more complex while adding support for a very rare
use case. Plugin inits can be traced separately by testing a main package importing
the plugins package imports explicitly.
$ GODEBUG=inittrace=1 go test
init internal/bytealg @0.008 ms, 0 ms clock, 0 bytes, 0 allocs
init runtime @0.059 ms, 0.026 ms clock, 0 bytes, 0 allocs
init math @0.19 ms, 0.001 ms clock, 0 bytes, 0 allocs
init errors @0.22 ms, 0.004 ms clock, 0 bytes, 0 allocs
init strconv @0.24 ms, 0.002 ms clock, 32 bytes, 2 allocs
init sync @0.28 ms, 0.003 ms clock, 16 bytes, 1 allocs
init unicode @0.44 ms, 0.11 ms clock, 23328 bytes, 24 allocs
...
Inspired by stapelberg@google.com who instrumented doInit
in a prototype to measure init times with GDB.
Fixes #41378
Change-Id: Ic37c6a0cfc95488de9e737f5e346b8dbb39174e1
Reviewed-on: https://go-review.googlesource.com/c/go/+/254659
Trust: Martin Möhrmann <moehrmann@google.com>
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
											
										 
											2020-09-14 16:55:34 +02:00
										 |  |  | 			inittrace.bytes += uint64(size) | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2014-08-13 01:03:32 +04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-10-04 20:56:11 -07:00
										 |  |  | 	if assistG != nil { | 
					
						
							|  |  |  | 		// Account for internal fragmentation in the assist | 
					
						
							|  |  |  | 		// debt now that we know it. | 
					
						
							|  |  |  | 		assistG.gcAssistBytes -= int64(size - dataSize) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-01-09 11:35:42 -05:00
										 |  |  | 	if shouldhelpgc { | 
					
						
							|  |  |  | 		if t := (gcTrigger{kind: gcTriggerHeap}); t.test() { | 
					
						
							| 
									
										
										
										
											2018-08-13 16:14:19 -04:00
										 |  |  | 			gcStart(t) | 
					
						
							| 
									
										
										
										
											2017-01-09 11:35:42 -05:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-04-28 23:07:38 -07:00
										 |  |  | 	if raceenabled && noscan && dataSize < maxTinySize { | 
					
						
							|  |  |  | 		// Pad tinysize allocations so they are aligned with the end | 
					
						
							|  |  |  | 		// of the tinyalloc region. This ensures that any arithmetic | 
					
						
							|  |  |  | 		// that goes off the top end of the object will be detectable | 
					
						
							|  |  |  | 		// by checkptr (issue 38872). | 
					
						
							|  |  |  | 		// Note that we disable tinyalloc when raceenabled for this to work. | 
					
						
							|  |  |  | 		// TODO: This padding is only performed when the race detector | 
					
						
							|  |  |  | 		// is enabled. It would be nice to enable it if any package | 
					
						
							|  |  |  | 		// was compiled with checkptr, but there's no easy way to | 
					
						
							|  |  |  | 		// detect that (especially at compile time). | 
					
						
							|  |  |  | 		// TODO: enable this padding for all allocations, not just | 
					
						
							|  |  |  | 		// tinyalloc ones. It's tricky because of pointer maps. | 
					
						
							|  |  |  | 		// Maybe just all noscan objects? | 
					
						
							|  |  |  | 		x = add(x, size-dataSize) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 	return x | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-08-13 16:37:09 +00:00
										 |  |  | // deductAssistCredit reduces the current G's assist credit | 
					
						
							|  |  |  | // by size bytes, and assists the GC if necessary. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // Caller must be preemptible. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // Returns the G for which the assist credit was accounted. | 
					
						
							|  |  |  | func deductAssistCredit(size uintptr) *g { | 
					
						
							|  |  |  | 	var assistG *g | 
					
						
							|  |  |  | 	if gcBlackenEnabled != 0 { | 
					
						
							|  |  |  | 		// Charge the current user G for this allocation. | 
					
						
							|  |  |  | 		assistG = getg() | 
					
						
							|  |  |  | 		if assistG.m.curg != nil { | 
					
						
							|  |  |  | 			assistG = assistG.m.curg | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		// Charge the allocation against the G. We'll account | 
					
						
							|  |  |  | 		// for internal fragmentation at the end of mallocgc. | 
					
						
							|  |  |  | 		assistG.gcAssistBytes -= int64(size) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		if assistG.gcAssistBytes < 0 { | 
					
						
							|  |  |  | 			// This G is in debt. Assist the GC to correct | 
					
						
							|  |  |  | 			// this before allocating. This must happen | 
					
						
							|  |  |  | 			// before disabling preemption. | 
					
						
							|  |  |  | 			gcAssistAlloc(assistG) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return assistG | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-17 19:54:31 -05:00
										 |  |  | // memclrNoHeapPointersChunked repeatedly calls memclrNoHeapPointers | 
					
						
							|  |  |  | // on chunks of the buffer to be zeroed, with opportunities for preemption | 
					
						
							|  |  |  | // along the way.  memclrNoHeapPointers contains no safepoints and also | 
					
						
							|  |  |  | // cannot be preemptively scheduled, so this provides a still-efficient | 
					
						
							|  |  |  | // block copy that can also be preempted on a reasonable granularity. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // Use this with care; if the data being cleared is tagged to contain | 
					
						
							|  |  |  | // pointers, this allows the GC to run before it is all cleared. | 
					
						
							|  |  |  | func memclrNoHeapPointersChunked(size uintptr, x unsafe.Pointer) { | 
					
						
							|  |  |  | 	v := uintptr(x) | 
					
						
							|  |  |  | 	// got this from benchmarking. 128k is too small, 512k is too large. | 
					
						
							|  |  |  | 	const chunkBytes = 256 * 1024 | 
					
						
							|  |  |  | 	vsize := v + size | 
					
						
							|  |  |  | 	for voff := v; voff < vsize; voff = voff + chunkBytes { | 
					
						
							|  |  |  | 		if getg().preempt { | 
					
						
							|  |  |  | 			// may hold locks, e.g., profiling | 
					
						
							|  |  |  | 			goschedguarded() | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		// clear min(avail, lump) bytes | 
					
						
							|  |  |  | 		n := vsize - voff | 
					
						
							|  |  |  | 		if n > chunkBytes { | 
					
						
							|  |  |  | 			n = chunkBytes | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		memclrNoHeapPointers(unsafe.Pointer(voff), n) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | // implementation of new builtin | 
					
						
							| 
									
										
										
										
											2016-08-26 15:41:51 -04:00
										 |  |  | // compiler (both frontend and SSA backend) knows the signature | 
					
						
							| 
									
										
										
										
											2022-11-11 19:22:35 +08:00
										 |  |  | // of this function. | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | func newobject(typ *_type) unsafe.Pointer { | 
					
						
							| 
									
										
										
										
											2023-01-20 16:41:57 -05:00
										 |  |  | 	return mallocgc(typ.Size_, typ, true) | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-12-22 13:27:53 -05:00
										 |  |  | //go:linkname reflect_unsafe_New reflect.unsafe_New | 
					
						
							|  |  |  | func reflect_unsafe_New(typ *_type) unsafe.Pointer { | 
					
						
							| 
									
										
										
										
											2023-01-20 16:41:57 -05:00
										 |  |  | 	return mallocgc(typ.Size_, typ, true) | 
					
						
							| 
									
										
										
										
											2014-12-22 13:27:53 -05:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-02-08 17:48:17 +01:00
										 |  |  | //go:linkname reflectlite_unsafe_New internal/reflectlite.unsafe_New | 
					
						
							|  |  |  | func reflectlite_unsafe_New(typ *_type) unsafe.Pointer { | 
					
						
							| 
									
										
										
										
											2023-01-20 16:41:57 -05:00
										 |  |  | 	return mallocgc(typ.Size_, typ, true) | 
					
						
							| 
									
										
										
										
											2019-02-08 17:48:17 +01:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-04-20 18:00:52 +02:00
										 |  |  | // newarray allocates an array of n elements of type typ. | 
					
						
							|  |  |  | func newarray(typ *_type, n int) unsafe.Pointer { | 
					
						
							| 
									
										
										
										
											2017-07-16 17:46:52 -10:00
										 |  |  | 	if n == 1 { | 
					
						
							| 
									
										
										
										
											2023-01-20 16:41:57 -05:00
										 |  |  | 		return mallocgc(typ.Size_, typ, true) | 
					
						
							| 
									
										
										
										
											2017-07-16 17:46:52 -10:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2023-01-20 16:41:57 -05:00
										 |  |  | 	mem, overflow := math.MulUintptr(typ.Size_, uintptr(n)) | 
					
						
							| 
									
										
										
										
											2018-10-22 20:47:54 +02:00
										 |  |  | 	if overflow || mem > maxAlloc || n < 0 { | 
					
						
							| 
									
										
										
										
											2016-03-27 17:29:53 -07:00
										 |  |  | 		panic(plainError("runtime: allocation size out of range")) | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2018-10-22 20:47:54 +02:00
										 |  |  | 	return mallocgc(mem, typ, true) | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-12-22 13:27:53 -05:00
										 |  |  | //go:linkname reflect_unsafe_NewArray reflect.unsafe_NewArray | 
					
						
							| 
									
										
										
										
											2016-04-20 18:00:52 +02:00
										 |  |  | func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer { | 
					
						
							| 
									
										
										
										
											2014-12-22 13:27:53 -05:00
										 |  |  | 	return newarray(typ, n) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | func profilealloc(mp *m, x unsafe.Pointer, size uintptr) { | 
					
						
							| 
									
										
										
										
											2021-09-11 20:53:24 +08:00
										 |  |  | 	c := getMCache(mp) | 
					
						
							| 
									
										
										
										
											2020-11-02 16:58:38 +00:00
										 |  |  | 	if c == nil { | 
					
						
							|  |  |  | 		throw("profilealloc called without a P or outside bootstrapping") | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	c.nextSample = nextSample() | 
					
						
							| 
									
										
										
										
											2014-09-01 18:51:12 -04:00
										 |  |  | 	mProf_Malloc(x, size) | 
					
						
							| 
									
										
										
										
											2014-07-30 09:01:52 -07:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-09-24 18:13:26 +02:00
										 |  |  | // nextSample returns the next sampling point for heap profiling. The goal is | 
					
						
							|  |  |  | // to sample allocations on average every MemProfileRate bytes, but with a | 
					
						
							|  |  |  | // completely random distribution over the allocation timeline; this | 
					
						
							|  |  |  | // corresponds to a Poisson process with parameter MemProfileRate. In Poisson | 
					
						
							|  |  |  | // processes, the distance between two samples follows the exponential | 
					
						
							|  |  |  | // distribution (exp(MemProfileRate)), so the best return value is a random | 
					
						
							|  |  |  | // number taken from an exponential distribution whose mean is MemProfileRate. | 
					
						
							| 
									
										
										
										
											2019-07-29 12:51:19 -07:00
										 |  |  | func nextSample() uintptr { | 
					
						
							| 
									
										
										
										
											2020-06-03 11:03:22 -07:00
										 |  |  | 	if MemProfileRate == 1 { | 
					
						
							|  |  |  | 		// Callers assign our return value to | 
					
						
							|  |  |  | 		// mcache.next_sample, but next_sample is not used | 
					
						
							|  |  |  | 		// when the rate is 1. So avoid the math below and | 
					
						
							|  |  |  | 		// just return something. | 
					
						
							|  |  |  | 		return 0 | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2015-10-28 06:44:26 +01:00
										 |  |  | 	if GOOS == "plan9" { | 
					
						
							|  |  |  | 		// Plan 9 doesn't support floating point in note handler. | 
					
						
							| 
									
										
										
										
											2022-07-20 13:18:06 -04:00
										 |  |  | 		if gp := getg(); gp == gp.m.gsignal { | 
					
						
							| 
									
										
										
										
											2015-10-28 06:44:26 +01:00
										 |  |  | 			return nextSampleNoFP() | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-07-29 12:51:19 -07:00
										 |  |  | 	return uintptr(fastexprand(MemProfileRate)) | 
					
						
							| 
									
										
										
										
											2017-09-24 18:13:26 +02:00
										 |  |  | } | 
					
						
							| 
									
										
										
										
											2015-09-14 14:03:45 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-09-24 18:13:26 +02:00
										 |  |  | // fastexprand returns a random number from an exponential distribution with | 
					
						
							|  |  |  | // the specified mean. | 
					
						
							|  |  |  | func fastexprand(mean int) int32 { | 
					
						
							|  |  |  | 	// Avoid overflow. Maximum possible step is | 
					
						
							|  |  |  | 	// -ln(1/(1<<randomBitCount)) * mean, approximately 20 * mean. | 
					
						
							| 
									
										
										
										
											2015-09-14 14:03:45 -07:00
										 |  |  | 	switch { | 
					
						
							| 
									
										
										
										
											2017-09-24 18:13:26 +02:00
										 |  |  | 	case mean > 0x7000000: | 
					
						
							|  |  |  | 		mean = 0x7000000 | 
					
						
							|  |  |  | 	case mean == 0: | 
					
						
							| 
									
										
										
										
											2015-09-14 14:03:45 -07:00
										 |  |  | 		return 0 | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-09-24 18:13:26 +02:00
										 |  |  | 	// Take a random sample of the exponential distribution exp(-mean*x). | 
					
						
							|  |  |  | 	// The probability distribution function is mean*exp(-mean*x), so the CDF is | 
					
						
							|  |  |  | 	// p = 1 - exp(-mean*x), so | 
					
						
							|  |  |  | 	// q = 1 - p == exp(-mean*x) | 
					
						
							|  |  |  | 	// log_e(q) = -mean*x | 
					
						
							|  |  |  | 	// -log_e(q)/mean = x | 
					
						
							|  |  |  | 	// x = -log_e(q) * mean | 
					
						
							|  |  |  | 	// x = log_2(q) * (-log_e(2)) * mean    ; Using log_2 for efficiency | 
					
						
							| 
									
										
										
										
											2015-09-14 14:03:45 -07:00
										 |  |  | 	const randomBitCount = 26 | 
					
						
							| 
									
										
										
										
											2021-09-30 22:46:09 +08:00
										 |  |  | 	q := fastrandn(1<<randomBitCount) + 1 | 
					
						
							| 
									
										
										
										
											2015-09-14 14:03:45 -07:00
										 |  |  | 	qlog := fastlog2(float64(q)) - randomBitCount | 
					
						
							|  |  |  | 	if qlog > 0 { | 
					
						
							|  |  |  | 		qlog = 0 | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	const minusLog2 = -0.6931471805599453 // -ln(2) | 
					
						
							| 
									
										
										
										
											2017-09-24 18:13:26 +02:00
										 |  |  | 	return int32(qlog*(minusLog2*float64(mean))) + 1 | 
					
						
							| 
									
										
										
										
											2015-09-14 14:03:45 -07:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-10-28 06:44:26 +01:00
										 |  |  | // nextSampleNoFP is similar to nextSample, but uses older, | 
					
						
							|  |  |  | // simpler code to avoid floating point. | 
					
						
							| 
									
										
										
										
											2019-07-29 12:51:19 -07:00
										 |  |  | func nextSampleNoFP() uintptr { | 
					
						
							| 
									
										
										
										
											2015-10-28 06:44:26 +01:00
										 |  |  | 	// Set first allocation sample size. | 
					
						
							|  |  |  | 	rate := MemProfileRate | 
					
						
							|  |  |  | 	if rate > 0x3fffffff { // make 2*rate not overflow | 
					
						
							|  |  |  | 		rate = 0x3fffffff | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	if rate != 0 { | 
					
						
							| 
									
										
										
										
											2021-09-30 22:46:09 +08:00
										 |  |  | 		return uintptr(fastrandn(uint32(2 * rate))) | 
					
						
							| 
									
										
										
										
											2015-10-28 06:44:26 +01:00
										 |  |  | 	} | 
					
						
							|  |  |  | 	return 0 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-03-08 20:56:15 -04:00
										 |  |  | type persistentAlloc struct { | 
					
						
							| 
									
										
										
										
											2017-10-22 18:10:08 -04:00
										 |  |  | 	base *notInHeap | 
					
						
							| 
									
										
										
										
											2015-01-14 14:13:55 -05:00
										 |  |  | 	off  uintptr | 
					
						
							| 
									
										
										
										
											2014-09-04 00:54:06 -04:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-03-08 20:56:15 -04:00
										 |  |  | var globalAlloc struct { | 
					
						
							|  |  |  | 	mutex | 
					
						
							|  |  |  | 	persistentAlloc | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-12-21 16:06:54 -08:00
										 |  |  | // persistentChunkSize is the number of bytes we allocate when we grow | 
					
						
							|  |  |  | // a persistentAlloc. | 
					
						
							|  |  |  | const persistentChunkSize = 256 << 10 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // persistentChunks is a list of all the persistent chunks we have | 
					
						
							|  |  |  | // allocated. The list is maintained through the first word in the | 
					
						
							|  |  |  | // persistent chunk. This is updated atomically. | 
					
						
							|  |  |  | var persistentChunks *notInHeap | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-09-04 00:54:06 -04:00
										 |  |  | // Wrapper around sysAlloc that can allocate small chunks. | 
					
						
							|  |  |  | // There is no associated free operation. | 
					
						
							|  |  |  | // Intended for things like function/type/debug-related persistent data. | 
					
						
							|  |  |  | // If align is 0, uses default align (currently 8). | 
					
						
							| 
									
										
											  
											
												runtime: make fixalloc zero allocations on reuse
Currently fixalloc does not zero memory it reuses. This is dangerous
with the hybrid barrier if the type may contain heap pointers, since
it may cause us to observe a dead heap pointer on reuse. It's also
error-prone since it's the only allocator that doesn't zero on
allocation (mallocgc of course zeroes, but so do persistentalloc and
sysAlloc). It's also largely pointless: for mcache, the caller
immediately memclrs the allocation; and the two specials types are
tiny so there's no real cost to zeroing them.
Change fixalloc to zero allocations by default.
The only type we don't zero by default is mspan. This actually
requires that the spsn's sweepgen survive across freeing and
reallocating a span. If we were to zero it, the following race would
be possible:
1. The current sweepgen is 2. Span s is on the unswept list.
2. Direct sweeping sweeps span s, finds it's all free, and releases s
   to the fixalloc.
3. Thread 1 allocates s from fixalloc. Suppose this zeros s, including
   s.sweepgen.
4. Thread 1 calls s.init, which sets s.state to _MSpanDead.
5. On thread 2, background sweeping comes across span s in allspans
   and cas's s.sweepgen from 0 (sg-2) to 1 (sg-1). Now it thinks it
   owns it for sweeping. 6. Thread 1 continues initializing s.
   Everything breaks.
I would like to fix this because it's obviously confusing, but it's a
subtle enough problem that I'm leaving it alone for now. The solution
may be to skip sweepgen 0, but then we have to think about wrap-around
much more carefully.
Updates #17503.
Change-Id: Ie08691feed3abbb06a31381b94beb0a2e36a0613
Reviewed-on: https://go-review.googlesource.com/31368
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2016-09-25 17:12:43 -04:00
										 |  |  | // The returned memory will be zeroed. | 
					
						
							| 
									
										
											  
											
												runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
											
										 
											2022-03-15 02:48:18 +00:00
										 |  |  | // sysStat must be non-nil. | 
					
						
							| 
									
										
										
										
											2016-10-11 22:58:21 -04:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2022-08-07 17:43:57 +07:00
										 |  |  | // Consider marking persistentalloc'd types not in heap by embedding | 
					
						
							|  |  |  | // runtime/internal/sys.NotInHeap. | 
					
						
							| 
									
										
										
										
											2020-07-29 20:25:05 +00:00
										 |  |  | func persistentalloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer { | 
					
						
							| 
									
										
										
										
											2017-10-22 18:10:08 -04:00
										 |  |  | 	var p *notInHeap | 
					
						
							| 
									
										
										
										
											2015-06-07 21:45:39 -04:00
										 |  |  | 	systemstack(func() { | 
					
						
							|  |  |  | 		p = persistentalloc1(size, align, sysStat) | 
					
						
							|  |  |  | 	}) | 
					
						
							| 
									
										
										
										
											2017-10-22 18:10:08 -04:00
										 |  |  | 	return unsafe.Pointer(p) | 
					
						
							| 
									
										
										
										
											2015-06-07 21:45:39 -04:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // Must run on system stack because stack growth can (re)invoke it. | 
					
						
							|  |  |  | // See issue 9174. | 
					
						
							| 
									
										
										
										
											2022-01-30 20:13:43 -05:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2015-06-07 21:45:39 -04:00
										 |  |  | //go:systemstack | 
					
						
							| 
									
										
										
										
											2020-07-29 20:25:05 +00:00
										 |  |  | func persistentalloc1(size, align uintptr, sysStat *sysMemStat) *notInHeap { | 
					
						
							| 
									
										
										
										
											2014-09-04 00:54:06 -04:00
										 |  |  | 	const ( | 
					
						
							|  |  |  | 		maxBlock = 64 << 10 // VM reservation granularity is 64K on windows | 
					
						
							|  |  |  | 	) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-01-14 14:13:55 -05:00
										 |  |  | 	if size == 0 { | 
					
						
							|  |  |  | 		throw("persistentalloc: size == 0") | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2014-09-04 00:54:06 -04:00
										 |  |  | 	if align != 0 { | 
					
						
							|  |  |  | 		if align&(align-1) != 0 { | 
					
						
							| 
									
										
										
										
											2014-12-27 20:58:00 -08:00
										 |  |  | 			throw("persistentalloc: align is not a power of 2") | 
					
						
							| 
									
										
										
										
											2014-09-04 00:54:06 -04:00
										 |  |  | 		} | 
					
						
							|  |  |  | 		if align > _PageSize { | 
					
						
							| 
									
										
										
										
											2014-12-27 20:58:00 -08:00
										 |  |  | 			throw("persistentalloc: align is too large") | 
					
						
							| 
									
										
										
										
											2014-09-04 00:54:06 -04:00
										 |  |  | 		} | 
					
						
							|  |  |  | 	} else { | 
					
						
							|  |  |  | 		align = 8 | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if size >= maxBlock { | 
					
						
							| 
									
										
										
										
											2017-10-22 18:10:08 -04:00
										 |  |  | 		return (*notInHeap)(sysAlloc(size, sysStat)) | 
					
						
							| 
									
										
										
										
											2014-09-04 00:54:06 -04:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-03-08 20:56:15 -04:00
										 |  |  | 	mp := acquirem() | 
					
						
							|  |  |  | 	var persistent *persistentAlloc | 
					
						
							| 
									
										
										
										
											2015-04-17 00:21:30 -04:00
										 |  |  | 	if mp != nil && mp.p != 0 { | 
					
						
							|  |  |  | 		persistent = &mp.p.ptr().palloc | 
					
						
							| 
									
										
										
										
											2015-03-08 20:56:15 -04:00
										 |  |  | 	} else { | 
					
						
							|  |  |  | 		lock(&globalAlloc.mutex) | 
					
						
							|  |  |  | 		persistent = &globalAlloc.persistentAlloc | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2019-06-28 16:44:07 +00:00
										 |  |  | 	persistent.off = alignUp(persistent.off, align) | 
					
						
							| 
									
										
										
										
											2018-12-21 16:06:54 -08:00
										 |  |  | 	if persistent.off+size > persistentChunkSize || persistent.base == nil { | 
					
						
							|  |  |  | 		persistent.base = (*notInHeap)(sysAlloc(persistentChunkSize, &memstats.other_sys)) | 
					
						
							| 
									
										
										
										
											2015-01-14 14:13:55 -05:00
										 |  |  | 		if persistent.base == nil { | 
					
						
							| 
									
										
										
										
											2015-03-08 20:56:15 -04:00
										 |  |  | 			if persistent == &globalAlloc.persistentAlloc { | 
					
						
							|  |  |  | 				unlock(&globalAlloc.mutex) | 
					
						
							|  |  |  | 			} | 
					
						
							| 
									
										
										
										
											2014-12-27 20:58:00 -08:00
										 |  |  | 			throw("runtime: cannot allocate memory") | 
					
						
							| 
									
										
										
										
											2014-09-04 00:54:06 -04:00
										 |  |  | 		} | 
					
						
							| 
									
										
										
										
											2018-12-21 16:06:54 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 		// Add the new chunk to the persistentChunks list. | 
					
						
							|  |  |  | 		for { | 
					
						
							|  |  |  | 			chunks := uintptr(unsafe.Pointer(persistentChunks)) | 
					
						
							|  |  |  | 			*(*uintptr)(unsafe.Pointer(persistent.base)) = chunks | 
					
						
							|  |  |  | 			if atomic.Casuintptr((*uintptr)(unsafe.Pointer(&persistentChunks)), chunks, uintptr(unsafe.Pointer(persistent.base))) { | 
					
						
							|  |  |  | 				break | 
					
						
							|  |  |  | 			} | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2021-06-16 23:05:44 +00:00
										 |  |  | 		persistent.off = alignUp(goarch.PtrSize, align) | 
					
						
							| 
									
										
										
										
											2014-09-04 00:54:06 -04:00
										 |  |  | 	} | 
					
						
							| 
									
										
										
										
											2017-10-22 18:10:08 -04:00
										 |  |  | 	p := persistent.base.add(persistent.off) | 
					
						
							| 
									
										
										
										
											2015-01-14 14:13:55 -05:00
										 |  |  | 	persistent.off += size | 
					
						
							| 
									
										
										
										
											2015-03-08 20:56:15 -04:00
										 |  |  | 	releasem(mp) | 
					
						
							|  |  |  | 	if persistent == &globalAlloc.persistentAlloc { | 
					
						
							|  |  |  | 		unlock(&globalAlloc.mutex) | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2014-09-04 00:54:06 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-04-16 14:32:18 -07:00
										 |  |  | 	if sysStat != &memstats.other_sys { | 
					
						
							| 
									
										
										
										
											2020-07-29 20:25:05 +00:00
										 |  |  | 		sysStat.add(int64(size)) | 
					
						
							|  |  |  | 		memstats.other_sys.add(-int64(size)) | 
					
						
							| 
									
										
										
										
											2014-09-04 00:54:06 -04:00
										 |  |  | 	} | 
					
						
							|  |  |  | 	return p | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2017-10-22 18:10:08 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2018-12-21 16:06:54 -08:00
										 |  |  | // inPersistentAlloc reports whether p points to memory allocated by | 
					
						
							|  |  |  | // persistentalloc. This must be nosplit because it is called by the | 
					
						
							|  |  |  | // cgo checker code, which is called by the write barrier code. | 
					
						
							| 
									
										
										
										
											2022-01-30 20:13:43 -05:00
										 |  |  | // | 
					
						
							| 
									
										
										
										
											2018-12-21 16:06:54 -08:00
										 |  |  | //go:nosplit | 
					
						
							|  |  |  | func inPersistentAlloc(p uintptr) bool { | 
					
						
							|  |  |  | 	chunk := atomic.Loaduintptr((*uintptr)(unsafe.Pointer(&persistentChunks))) | 
					
						
							|  |  |  | 	for chunk != 0 { | 
					
						
							|  |  |  | 		if p >= chunk && p < chunk+persistentChunkSize { | 
					
						
							|  |  |  | 			return true | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		chunk = *(*uintptr)(unsafe.Pointer(chunk)) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return false | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | // linearAlloc is a simple linear allocator that pre-reserves a region | 
					
						
							| 
									
										
										
										
											2020-11-16 21:57:32 +00:00
										 |  |  | // of memory and then optionally maps that region into the Ready state | 
					
						
							|  |  |  | // as needed. | 
					
						
							|  |  |  | // | 
					
						
							|  |  |  | // The caller is responsible for locking. | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | type linearAlloc struct { | 
					
						
							|  |  |  | 	next   uintptr // next free byte | 
					
						
							|  |  |  | 	mapped uintptr // one byte past end of mapped space | 
					
						
							|  |  |  | 	end    uintptr // end of reserved space | 
					
						
							| 
									
										
										
										
											2020-11-16 21:57:32 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	mapMemory bool // transition memory from Reserved to Ready if true | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-16 21:57:32 +00:00
										 |  |  | func (l *linearAlloc) init(base, size uintptr, mapMemory bool) { | 
					
						
							| 
									
										
										
										
											2020-04-30 19:35:12 +00:00
										 |  |  | 	if base+size < base { | 
					
						
							|  |  |  | 		// Chop off the last byte. The runtime isn't prepared | 
					
						
							|  |  |  | 		// to deal with situations where the bounds could overflow. | 
					
						
							|  |  |  | 		// Leave that memory reserved, though, so we don't map it | 
					
						
							|  |  |  | 		// later. | 
					
						
							|  |  |  | 		size -= 1 | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 	l.next, l.mapped = base, base | 
					
						
							|  |  |  | 	l.end = base + size | 
					
						
							| 
									
										
										
										
											2020-11-16 21:57:32 +00:00
										 |  |  | 	l.mapMemory = mapMemory | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-07-29 20:25:05 +00:00
										 |  |  | func (l *linearAlloc) alloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer { | 
					
						
							| 
									
										
										
										
											2019-06-28 16:44:07 +00:00
										 |  |  | 	p := alignUp(l.next, align) | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 	if p+size > l.end { | 
					
						
							|  |  |  | 		return nil | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	l.next = p + size | 
					
						
							| 
									
										
										
										
											2019-06-28 16:44:07 +00:00
										 |  |  | 	if pEnd := alignUp(l.next-1, physPageSize); pEnd > l.mapped { | 
					
						
							| 
									
										
										
										
											2020-11-16 21:57:32 +00:00
										 |  |  | 		if l.mapMemory { | 
					
						
							|  |  |  | 			// Transition from Reserved to Prepared to Ready. | 
					
						
							| 
									
										
											  
											
												runtime: track how much memory is mapped in the Ready state
This change adds a field to memstats called mappedReady that tracks how
much memory is in the Ready state at any given time. In essence, it's
the total memory usage by the Go runtime (with one exception which is
documented). Essentially, all memory mapped read/write that has either
been paged in or will soon.
To make tracking this not involve the many different stats that track
mapped memory, we track this statistic at a very low level. The downside
of tracking this statistic at such a low level is that it managed to
catch lots of situations where the runtime wasn't fully accounting for
memory. This change rectifies these situations by always accounting for
memory that's mapped in some way (i.e. always passing a sysMemStat to a
mem.go function), with *two* exceptions.
Rectifying these situations means also having the memory mapped during
testing being accounted for, so that tests (i.e. ReadMemStats) that
ultimately check mappedReady continue to work correctly without special
exceptions. We choose to simply account for this memory in other_sys.
Let's talk about the exceptions. The first is the arenas array for
finding heap arena metadata from an address is mapped as read/write in
one large chunk. It's tens of MiB in size. On systems with demand
paging, we assume that the whole thing isn't paged in at once (after
all, it maps to the whole address space, and it's exceedingly difficult
with today's technology to even broach having as much physical memory as
the total address space). On systems where we have to commit memory
manually, we use a two-level structure.
Now, the reason why this is an exception is because we have no mechanism
to track what memory is paged in, and we can't just account for the
entire thing, because that would *look* like an enormous overhead.
Furthermore, this structure is on a few really, really critical paths in
the runtime, so doing more explicit tracking isn't really an option. So,
we explicitly don't and call sysAllocOS to map this memory.
The second exception is that we call sysFree with no accounting to clean
up address space reservations, or otherwise to throw out mappings we
don't care about. In this case, also drop down to a lower level and call
sysFreeOS to explicitly avoid accounting.
The third exception is debuglog allocations. That is purely a debugging
facility and ideally we want it to have as small an impact on the
runtime as possible. If we include it in mappedReady calculations, it
could cause GC pacing shifts in future CLs, especailly if one increases
the debuglog buffer sizes as a one-off.
As of this CL, these are the only three places in the runtime that would
pass nil for a stat to any of the functions in mem.go. As a result, this
CL makes sysMemStats mandatory to facilitate better accounting in the
future. It's now much easier to grep and find out where accounting is
explicitly elided, because one doesn't have to follow the trail of
sysMemStat nil pointer values, and can just look at the function name.
For #48409.
Change-Id: I274eb467fc2603881717482214fddc47c9eaf218
Reviewed-on: https://go-review.googlesource.com/c/go/+/393402
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
											
										 
											2022-03-15 02:48:18 +00:00
										 |  |  | 			n := pEnd - l.mapped | 
					
						
							|  |  |  | 			sysMap(unsafe.Pointer(l.mapped), n, sysStat) | 
					
						
							|  |  |  | 			sysUsed(unsafe.Pointer(l.mapped), n, n) | 
					
						
							| 
									
										
										
										
											2020-11-16 21:57:32 +00:00
										 |  |  | 		} | 
					
						
							| 
									
										
											  
											
												runtime: use sparse mappings for the heap
This replaces the contiguous heap arena mapping with a potentially
sparse mapping that can support heap mappings anywhere in the address
space.
This has several advantages over the current approach:
* There is no longer any limit on the size of the Go heap. (Currently
  it's limited to 512GB.) Hence, this fixes #10460.
* It eliminates many failures modes of heap initialization and
  growing. In particular it eliminates any possibility of panicking
  with an address space conflict. This can happen for many reasons and
  even causes a low but steady rate of TSAN test failures because of
  conflicts with the TSAN runtime. See #16936 and #11993.
* It eliminates the notion of "non-reserved" heap, which was added
  because creating huge address space reservations (particularly on
  64-bit) led to huge process VSIZE. This was at best confusing and at
  worst conflicted badly with ulimit -v. However, the non-reserved
  heap logic is complicated, can race with other mappings in non-pure
  Go binaries (e.g., #18976), and requires that the entire heap be
  either reserved or non-reserved. We currently maintain the latter
  property, but it's quite difficult to convince yourself of that, and
  hence difficult to keep correct. This logic is still present, but
  will be removed in the next CL.
* It fixes problems on 32-bit where skipping over parts of the address
  space leads to mapping huge (and never-to-be-used) metadata
  structures. See #19831.
This also completely rewrites and significantly simplifies
mheap.sysAlloc, which has been a source of many bugs. E.g., #21044,
 #20259, #18651, and #13143 (and maybe #23222).
This change also makes it possible to allocate individual objects
larger than 512GB. As a result, a few tests that expected huge
allocations to fail needed to be changed to make even larger
allocations. However, at the moment attempting to allocate a humongous
object may cause the program to freeze for several minutes on Linux as
we fall back to probing every page with addrspace_free. That logic
(and this failure mode) will be removed in the next CL.
Fixes #10460.
Fixes #22204 (since it rewrites the code involved).
This slightly slows down compilebench and the x/benchmarks garbage
benchmark.
name       old time/op     new time/op     delta
Template       184ms ± 1%      185ms ± 1%    ~     (p=0.065 n=10+9)
Unicode       86.9ms ± 3%     86.3ms ± 1%    ~     (p=0.631 n=10+10)
GoTypes        599ms ± 0%      602ms ± 0%  +0.56%  (p=0.000 n=10+9)
Compiler       2.87s ± 1%      2.89s ± 1%  +0.51%  (p=0.002 n=9+10)
SSA            7.29s ± 1%      7.25s ± 1%    ~     (p=0.182 n=10+9)
Flate          118ms ± 2%      118ms ± 1%    ~     (p=0.113 n=9+9)
GoParser       147ms ± 1%      148ms ± 1%  +1.07%  (p=0.003 n=9+10)
Reflect        401ms ± 1%      404ms ± 1%  +0.71%  (p=0.003 n=10+9)
Tar            175ms ± 1%      175ms ± 1%    ~     (p=0.604 n=9+10)
XML            209ms ± 1%      210ms ± 1%    ~     (p=0.052 n=10+10)
(https://perf.golang.org/search?q=upload:20171231.4)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.23ms ± 1%  2.25ms ± 1%  +0.84%  (p=0.000 n=19+19)
(https://perf.golang.org/search?q=upload:20171231.3)
Relative to the start of the sparse heap changes (starting at and
including "runtime: fix various contiguous bitmap assumptions"),
overall slowdown is roughly 1% on GC-intensive benchmarks:
name        old time/op     new time/op     delta
Template        183ms ± 1%      185ms ± 1%  +1.32%  (p=0.000 n=9+9)
Unicode        84.9ms ± 2%     86.3ms ± 1%  +1.65%  (p=0.000 n=9+10)
GoTypes         595ms ± 1%      602ms ± 0%  +1.19%  (p=0.000 n=9+9)
Compiler        2.86s ± 0%      2.89s ± 1%  +0.91%  (p=0.000 n=9+10)
SSA             7.19s ± 0%      7.25s ± 1%  +0.75%  (p=0.000 n=8+9)
Flate           117ms ± 1%      118ms ± 1%  +1.10%  (p=0.000 n=10+9)
GoParser        146ms ± 2%      148ms ± 1%  +1.48%  (p=0.002 n=10+10)
Reflect         398ms ± 1%      404ms ± 1%  +1.51%  (p=0.000 n=10+9)
Tar             173ms ± 1%      175ms ± 1%  +1.17%  (p=0.000 n=10+10)
XML             208ms ± 1%      210ms ± 1%  +0.62%  (p=0.011 n=10+10)
[Geo mean]      369ms           373ms       +1.17%
(https://perf.golang.org/search?q=upload:20180101.2)
name                       old time/op  new time/op  delta
Garbage/benchmem-MB=64-12  2.22ms ± 1%  2.25ms ± 1%  +1.51%  (p=0.000 n=20+19)
(https://perf.golang.org/search?q=upload:20180101.3)
Change-Id: I5daf4cfec24b252e5a57001f0a6c03f22479d0f0
Reviewed-on: https://go-review.googlesource.com/85887
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
											
										 
											2017-12-19 22:05:23 -08:00
										 |  |  | 		l.mapped = pEnd | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return unsafe.Pointer(p) | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2017-10-22 18:10:08 -04:00
										 |  |  | // notInHeap is off-heap memory allocated by a lower-level allocator | 
					
						
							|  |  |  | // like sysAlloc or persistentAlloc. | 
					
						
							|  |  |  | // | 
					
						
							| 
									
										
										
										
											2022-08-07 17:43:57 +07:00
										 |  |  | // In general, it's better to use real types which embed | 
					
						
							|  |  |  | // runtime/internal/sys.NotInHeap, but this serves as a generic type | 
					
						
							|  |  |  | // for situations where that isn't possible (like in the allocators). | 
					
						
							| 
									
										
										
										
											2017-10-22 18:10:08 -04:00
										 |  |  | // | 
					
						
							|  |  |  | // TODO: Use this as the return type of sysAlloc, persistentAlloc, etc? | 
					
						
							| 
									
										
										
										
											2022-08-07 17:43:57 +07:00
										 |  |  | type notInHeap struct{ _ sys.NotInHeap } | 
					
						
							| 
									
										
										
										
											2017-10-22 18:10:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | func (p *notInHeap) add(bytes uintptr) *notInHeap { | 
					
						
							|  |  |  | 	return (*notInHeap)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + bytes)) | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2021-01-05 17:52:43 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | // computeRZlog computes the size of the redzone. | 
					
						
							|  |  |  | // Refer to the implementation of the compiler-rt. | 
					
						
							|  |  |  | func computeRZlog(userSize uintptr) uintptr { | 
					
						
							|  |  |  | 	switch { | 
					
						
							|  |  |  | 	case userSize <= (64 - 16): | 
					
						
							|  |  |  | 		return 16 << 0 | 
					
						
							|  |  |  | 	case userSize <= (128 - 32): | 
					
						
							|  |  |  | 		return 16 << 1 | 
					
						
							|  |  |  | 	case userSize <= (512 - 64): | 
					
						
							|  |  |  | 		return 16 << 2 | 
					
						
							|  |  |  | 	case userSize <= (4096 - 128): | 
					
						
							|  |  |  | 		return 16 << 3 | 
					
						
							|  |  |  | 	case userSize <= (1<<14)-256: | 
					
						
							|  |  |  | 		return 16 << 4 | 
					
						
							|  |  |  | 	case userSize <= (1<<15)-512: | 
					
						
							|  |  |  | 		return 16 << 5 | 
					
						
							|  |  |  | 	case userSize <= (1<<16)-1024: | 
					
						
							|  |  |  | 		return 16 << 6 | 
					
						
							|  |  |  | 	default: | 
					
						
							|  |  |  | 		return 16 << 7 | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } |