2014-07-30 09:01:52 -07:00
|
|
|
// Copyright 2014 The Go Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
2016-09-23 11:47:24 -04:00
|
|
|
// Memory allocator.
|
|
|
|
//
|
|
|
|
// This was originally based on tcmalloc, but has diverged quite a bit.
|
2015-02-19 13:38:46 -05:00
|
|
|
// http://goog-perftools.sourceforge.net/doc/tcmalloc.html
|
|
|
|
|
|
|
|
// The main allocator works in runs of pages.
|
|
|
|
// Small allocation sizes (up to and including 32 kB) are
|
2016-09-23 11:47:24 -04:00
|
|
|
// rounded to one of about 70 size classes, each of which
|
|
|
|
// has its own free set of objects of exactly that size.
|
2015-02-19 13:38:46 -05:00
|
|
|
// Any free page of memory can be split into a set of objects
|
2016-09-23 11:47:24 -04:00
|
|
|
// of one size class, which are then managed using a free bitmap.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
|
|
|
// The allocator's data structures are:
|
|
|
|
//
|
2016-09-23 11:47:24 -04:00
|
|
|
// fixalloc: a free-list allocator for fixed-size off-heap objects,
|
2015-02-19 13:38:46 -05:00
|
|
|
// used to manage storage used by the allocator.
|
2016-09-23 11:47:24 -04:00
|
|
|
// mheap: the malloc heap, managed at page (8192-byte) granularity.
|
|
|
|
// mspan: a run of pages managed by the mheap.
|
|
|
|
// mcentral: collects all spans of a given size class.
|
|
|
|
// mcache: a per-P cache of mspans with free space.
|
|
|
|
// mstats: allocation statistics.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
|
|
|
// Allocating a small object proceeds up a hierarchy of caches:
|
|
|
|
//
|
|
|
|
// 1. Round the size up to one of the small size classes
|
2016-09-23 11:47:24 -04:00
|
|
|
// and look in the corresponding mspan in this P's mcache.
|
|
|
|
// Scan the mspan's free bitmap to find a free slot.
|
|
|
|
// If there is a free slot, allocate it.
|
2015-02-19 13:38:46 -05:00
|
|
|
// This can all be done without acquiring a lock.
|
|
|
|
//
|
2016-09-23 11:47:24 -04:00
|
|
|
// 2. If the mspan has no free slots, obtain a new mspan
|
|
|
|
// from the mcentral's list of mspans of the required size
|
|
|
|
// class that have free space.
|
|
|
|
// Obtaining a whole span amortizes the cost of locking
|
|
|
|
// the mcentral.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
2016-09-23 11:47:24 -04:00
|
|
|
// 3. If the mcentral's mspan list is empty, obtain a run
|
|
|
|
// of pages from the mheap to use for the mspan.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
2016-09-23 11:47:24 -04:00
|
|
|
// 4. If the mheap is empty or has no page runs large enough,
|
2015-02-19 13:38:46 -05:00
|
|
|
// allocate a new group of pages (at least 1MB) from the
|
2016-09-23 11:47:24 -04:00
|
|
|
// operating system. Allocating a large run of pages
|
2015-02-19 13:38:46 -05:00
|
|
|
// amortizes the cost of talking to the operating system.
|
|
|
|
//
|
2016-09-23 11:47:24 -04:00
|
|
|
// Sweeping an mspan and freeing objects on it proceeds up a similar
|
|
|
|
// hierarchy:
|
|
|
|
//
|
|
|
|
// 1. If the mspan is being swept in response to allocation, it
|
|
|
|
// is returned to the mcache to satisfy the allocation.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
2016-09-23 11:47:24 -04:00
|
|
|
// 2. Otherwise, if the mspan still has allocated objects in it,
|
|
|
|
// it is placed on the mcentral free list for the mspan's size
|
|
|
|
// class.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
2016-09-23 11:47:24 -04:00
|
|
|
// 3. Otherwise, if all objects in the mspan are free, the mspan
|
|
|
|
// is now "idle", so it is returned to the mheap and no longer
|
|
|
|
// has a size class.
|
|
|
|
// This may coalesce it with adjacent idle mspans.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
2016-09-23 11:47:24 -04:00
|
|
|
// 4. If an mspan remains idle for long enough, return its pages
|
|
|
|
// to the operating system.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
2016-09-23 11:47:24 -04:00
|
|
|
// Allocating and freeing a large object uses the mheap
|
|
|
|
// directly, bypassing the mcache and mcentral.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
2016-09-23 11:47:24 -04:00
|
|
|
// Free object slots in an mspan are zeroed only if mspan.needzero is
|
|
|
|
// false. If needzero is true, objects are zeroed as they are
|
|
|
|
// allocated. There are various benefits to delaying zeroing this way:
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
2016-09-23 11:47:24 -04:00
|
|
|
// 1. Stack frame allocation can avoid zeroing altogether.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
2016-09-23 11:47:24 -04:00
|
|
|
// 2. It exhibits better temporal locality, since the program is
|
|
|
|
// probably about to write to the memory.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
2016-09-23 11:47:24 -04:00
|
|
|
// 3. We don't zero pages that never get reused.
|
2015-02-19 13:38:46 -05:00
|
|
|
|
2014-07-30 09:01:52 -07:00
|
|
|
package runtime
|
|
|
|
|
2015-11-11 12:39:30 -05:00
|
|
|
import (
|
|
|
|
"runtime/internal/sys"
|
|
|
|
"unsafe"
|
|
|
|
)
|
2014-07-30 09:01:52 -07:00
|
|
|
|
|
|
|
const (
|
2014-08-07 13:34:30 +04:00
|
|
|
debugMalloc = false
|
|
|
|
|
2014-09-16 10:22:15 -04:00
|
|
|
maxTinySize = _TinySize
|
|
|
|
tinySizeClass = _TinySizeClass
|
|
|
|
maxSmallSize = _MaxSmallSize
|
2014-07-30 09:01:52 -07:00
|
|
|
|
2014-09-16 10:22:15 -04:00
|
|
|
pageShift = _PageShift
|
|
|
|
pageSize = _PageSize
|
|
|
|
pageMask = _PageMask
|
2016-02-04 11:41:48 -05:00
|
|
|
// By construction, single page spans of the smallest object class
|
|
|
|
// have the most objects per span.
|
|
|
|
maxObjsPerSpan = pageSize / 8
|
2014-08-07 13:34:30 +04:00
|
|
|
|
2014-09-16 10:22:15 -04:00
|
|
|
mSpanInUse = _MSpanInUse
|
2014-08-28 13:23:10 -07:00
|
|
|
|
2014-11-11 17:05:02 -05:00
|
|
|
concurrentSweep = _ConcurrentSweep
|
2014-07-30 09:01:52 -07:00
|
|
|
|
2016-10-26 21:25:56 -07:00
|
|
|
_PageSize = 1 << _PageShift
|
|
|
|
_PageMask = _PageSize - 1
|
2015-02-19 13:38:46 -05:00
|
|
|
|
|
|
|
// _64bit = 1 on 64-bit systems, 0 on 32-bit systems
|
|
|
|
_64bit = 1 << (^uintptr(0) >> 63) / 2
|
|
|
|
|
|
|
|
// Tiny allocator parameters, see "Tiny allocator" comment in malloc.go.
|
|
|
|
_TinySize = 16
|
|
|
|
_TinySizeClass = 2
|
|
|
|
|
|
|
|
_FixAllocChunk = 16 << 10 // Chunk size for FixAlloc
|
|
|
|
_MaxMHeapList = 1 << (20 - _PageShift) // Maximum page length for fixed-size list in MHeap.
|
|
|
|
_HeapAllocChunk = 1 << 20 // Chunk size for heap growth
|
|
|
|
|
|
|
|
// Per-P, per order stack segment cache size.
|
|
|
|
_StackCacheSize = 32 * 1024
|
|
|
|
|
2016-03-01 23:21:55 +00:00
|
|
|
// Number of orders that get caching. Order 0 is FixedStack
|
2015-02-19 13:38:46 -05:00
|
|
|
// and each successive order is twice as large.
|
2016-03-01 23:21:55 +00:00
|
|
|
// We want to cache 2KB, 4KB, 8KB, and 16KB stacks. Larger stacks
|
2015-02-19 13:38:46 -05:00
|
|
|
// will be allocated directly.
|
|
|
|
// Since FixedStack is different on different systems, we
|
|
|
|
// must vary NumStackOrders to keep the same maximum cached size.
|
|
|
|
// OS | FixedStack | NumStackOrders
|
|
|
|
// -----------------+------------+---------------
|
|
|
|
// linux/darwin/bsd | 2KB | 4
|
|
|
|
// windows/32 | 4KB | 3
|
|
|
|
// windows/64 | 8KB | 2
|
|
|
|
// plan9 | 4KB | 3
|
2015-11-11 12:39:30 -05:00
|
|
|
_NumStackOrders = 4 - sys.PtrSize/4*sys.GoosWindows - 1*sys.GoosPlan9
|
2015-02-19 13:38:46 -05:00
|
|
|
|
|
|
|
// Number of bits in page to span calculations (8k pages).
|
|
|
|
// On Windows 64-bit we limit the arena to 32GB or 35 bits.
|
|
|
|
// Windows counts memory used by page table into committed memory
|
|
|
|
// of the process, so we can't reserve too much memory.
|
2015-07-10 17:17:11 -06:00
|
|
|
// See https://golang.org/issue/5402 and https://golang.org/issue/5236.
|
2015-06-08 00:14:08 -04:00
|
|
|
// On other 64-bit platforms, we limit the arena to 512GB, or 39 bits.
|
2015-02-19 13:38:46 -05:00
|
|
|
// On 32-bit, we don't bother limiting anything, so we use the full 32-bit address.
|
2016-10-18 23:51:01 +02:00
|
|
|
// The only exception is mips32 which only has access to low 2GB of virtual memory.
|
2015-04-10 22:14:43 -04:00
|
|
|
// On Darwin/arm64, we cannot reserve more than ~5GB of virtual memory,
|
|
|
|
// but as most devices have less than 4GB of physical memory anyway, we
|
|
|
|
// try to be conservative here, and only ask for a 2GB heap.
|
2016-10-18 23:51:01 +02:00
|
|
|
_MHeapMap_TotalBits = (_64bit*sys.GoosWindows)*35 + (_64bit*(1-sys.GoosWindows)*(1-sys.GoosDarwin*sys.GoarchArm64))*39 + sys.GoosDarwin*sys.GoarchArm64*31 + (1-_64bit)*(32-(sys.GoarchMips+sys.GoarchMipsle))
|
2015-02-19 13:38:46 -05:00
|
|
|
_MHeapMap_Bits = _MHeapMap_TotalBits - _PageShift
|
|
|
|
|
2017-01-13 15:32:53 -05:00
|
|
|
// _MaxMem is the maximum heap arena size minus 1.
|
|
|
|
//
|
|
|
|
// On 32-bit, this is also the maximum heap pointer value,
|
|
|
|
// since the arena starts at address 0.
|
|
|
|
_MaxMem = 1<<_MHeapMap_TotalBits - 1
|
2015-02-19 13:38:46 -05:00
|
|
|
|
|
|
|
// Max number of threads to run garbage collection.
|
|
|
|
// 2, 3, and 4 are all plausible maximums depending
|
2016-03-01 23:21:55 +00:00
|
|
|
// on the hardware details of the machine. The garbage
|
2015-02-19 13:38:46 -05:00
|
|
|
// collector scales well to 32 cpus.
|
|
|
|
_MaxGcproc = 32
|
|
|
|
|
2017-01-06 09:44:41 -05:00
|
|
|
// minLegalPointer is the smallest possible legal pointer.
|
|
|
|
// This is the smallest possible architectural page size,
|
|
|
|
// since we assume that the first page is never mapped.
|
|
|
|
//
|
|
|
|
// This should agree with minZeroPage in the compiler.
|
|
|
|
minLegalPointer uintptr = 4096
|
2016-10-26 21:25:56 -07:00
|
|
|
)
|
2015-02-19 13:38:46 -05:00
|
|
|
|
2016-07-18 21:40:02 -04:00
|
|
|
// physPageSize is the size in bytes of the OS's physical pages.
|
|
|
|
// Mapping and unmapping operations must be done at multiples of
|
|
|
|
// physPageSize.
|
|
|
|
//
|
|
|
|
// This must be set by the OS init code (typically in osinit) before
|
|
|
|
// mallocinit.
|
|
|
|
var physPageSize uintptr // zero until the OS init code sets it; mallocinit throws if it is still 0
|
|
|
|
|
2015-02-19 13:38:46 -05:00
|
|
|
// OS-defined helpers:
|
|
|
|
//
|
|
|
|
// sysAlloc obtains a large chunk of zeroed memory from the
|
|
|
|
// operating system, typically on the order of a hundred kilobytes
|
|
|
|
// or a megabyte.
|
|
|
|
// NOTE: sysAlloc returns OS-aligned memory, but the heap allocator
|
|
|
|
// may use larger alignment, so the caller must be careful to realign the
|
|
|
|
// memory obtained by sysAlloc.
|
|
|
|
//
|
|
|
|
// SysUnused notifies the operating system that the contents
|
|
|
|
// of the memory region are no longer needed and can be reused
|
|
|
|
// for other purposes.
|
|
|
|
// SysUsed notifies the operating system that the contents
|
|
|
|
// of the memory region are needed again.
|
|
|
|
//
|
|
|
|
// SysFree returns it unconditionally; this is only used if
|
|
|
|
// an out-of-memory error has been detected midway through
|
2016-03-01 23:21:55 +00:00
|
|
|
// an allocation. It is okay if SysFree is a no-op.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
|
|
|
// SysReserve reserves address space without allocating memory.
|
|
|
|
// If the pointer passed to it is non-nil, the caller wants the
|
|
|
|
// reservation there, but SysReserve can still choose another
|
2016-03-01 23:21:55 +00:00
|
|
|
// location if that one is unavailable. On some systems and in some
|
2015-02-19 13:38:46 -05:00
|
|
|
// cases SysReserve will simply check that the address space is
|
2016-03-01 23:21:55 +00:00
|
|
|
// available and not actually reserve it. If SysReserve returns
|
2015-02-19 13:38:46 -05:00
|
|
|
// non-nil, it sets *reserved to true if the address space is
|
|
|
|
// reserved, false if it has merely been checked.
|
|
|
|
// NOTE: SysReserve returns OS-aligned memory, but the heap allocator
|
|
|
|
// may use larger alignment, so the caller must be careful to realign the
|
|
|
|
// memory obtained by SysReserve.
|
|
|
|
//
|
|
|
|
// SysMap maps previously reserved address space for use.
|
|
|
|
// The reserved argument is true if the address space was really
|
|
|
|
// reserved, not merely checked.
|
|
|
|
//
|
|
|
|
// SysFault marks an (already sysAlloc'd) region to fault
|
2016-03-01 23:21:55 +00:00
|
|
|
// if accessed. Used only for debugging the runtime.
|
2015-02-19 13:38:46 -05:00
|
|
|
|
|
|
|
func mallocinit() {
|
|
|
|
if class_to_size[_TinySizeClass] != _TinySize {
|
|
|
|
throw("bad TinySizeClass")
|
|
|
|
}
|
|
|
|
|
2016-10-26 21:25:56 -07:00
|
|
|
testdefersizes()
|
|
|
|
|
|
|
|
// Copy class sizes out for statistics table.
|
|
|
|
for i := range class_to_size {
|
|
|
|
memstats.by_size[i].size = uint32(class_to_size[i])
|
|
|
|
}
|
|
|
|
|
2016-07-18 21:40:02 -04:00
|
|
|
// Check physPageSize.
|
|
|
|
if physPageSize == 0 {
|
|
|
|
// The OS init code failed to fetch the physical page size.
|
|
|
|
throw("failed to get system page size")
|
|
|
|
}
|
2016-07-18 12:24:02 -04:00
|
|
|
if physPageSize < minPhysPageSize {
|
|
|
|
print("system page size (", physPageSize, ") is smaller than minimum page size (", minPhysPageSize, ")\n")
|
|
|
|
throw("bad system page size")
|
2016-07-18 21:40:02 -04:00
|
|
|
}
|
2016-07-18 12:24:02 -04:00
|
|
|
if physPageSize&(physPageSize-1) != 0 {
|
|
|
|
print("system page size (", physPageSize, ") must be a power of 2\n")
|
|
|
|
throw("bad system page size")
|
2016-07-18 21:40:02 -04:00
|
|
|
}
|
|
|
|
|
2017-01-13 14:19:52 -05:00
|
|
|
// The auxiliary regions start at p and are laid out in the
|
|
|
|
// following order: spans, bitmap, arena.
|
|
|
|
var p, pSize uintptr
|
2015-02-19 13:38:46 -05:00
|
|
|
var reserved bool
|
|
|
|
|
2017-01-13 14:19:52 -05:00
|
|
|
// The spans array holds one *mspan per _PageSize of arena.
|
|
|
|
var spansSize uintptr = (_MaxMem + 1) / _PageSize * sys.PtrSize
|
|
|
|
spansSize = round(spansSize, _PageSize)
|
|
|
|
// The bitmap holds 2 bits per word of arena.
|
|
|
|
var bitmapSize uintptr = (_MaxMem + 1) / (sys.PtrSize * 8 / 2)
|
|
|
|
bitmapSize = round(bitmapSize, _PageSize)
|
2015-02-19 13:38:46 -05:00
|
|
|
|
|
|
|
// Set up the allocation arena, a contiguous area of memory where
|
2017-01-13 14:19:52 -05:00
|
|
|
// allocated data will be found.
|
|
|
|
if sys.PtrSize == 8 {
|
2015-02-19 13:38:46 -05:00
|
|
|
// On a 64-bit machine, allocate from a single contiguous reservation.
|
2015-06-08 00:14:08 -04:00
|
|
|
// 512 GB (MaxMem) should be big enough for now.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
|
|
|
// The code will work with the reservation at any address, but ask
|
|
|
|
// SysReserve to use 0x0000XXc000000000 if possible (XX=00...7f).
|
2015-06-08 00:14:08 -04:00
|
|
|
// Allocating a 512 GB region takes away 39 bits, and the amd64
|
|
|
|
// doesn't let us choose the top 17 bits, so that leaves the 8 bits
|
2016-03-01 23:21:55 +00:00
|
|
|
// in the middle of 0x00c0 for us to choose. Choosing 0x00c0 means
|
2015-02-19 13:38:46 -05:00
|
|
|
// that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x00df.
|
|
|
|
// In little-endian, that's c0 00, c1 00, ..., df 00. None of those are valid
|
|
|
|
// UTF-8 sequences, and they are otherwise as far away from
|
2016-03-01 23:21:55 +00:00
|
|
|
// ff (likely a common byte) as possible. If that fails, we try other 0xXXc0
|
|
|
|
// addresses. An earlier attempt to use 0x11f8 caused out of memory errors
|
2015-02-19 13:38:46 -05:00
|
|
|
// on OS X during thread allocations. 0x00c0 causes conflicts with
|
|
|
|
// AddressSanitizer which reserves all memory up to 0x0100.
|
|
|
|
// These choices are both for debuggability and to reduce the
|
2015-06-08 00:14:08 -04:00
|
|
|
// odds of a conservative garbage collector (as is still used in gccgo)
|
|
|
|
// not collecting memory because some non-pointer block of memory
|
|
|
|
// had a bit pattern that matched a memory address.
|
2015-02-19 13:38:46 -05:00
|
|
|
//
|
2015-06-08 00:14:08 -04:00
|
|
|
// Actually we reserve about 528 GB (because the bitmap alone adds 16 GB)
|
2015-02-19 13:38:46 -05:00
|
|
|
// but it hardly matters: e0 00 is not valid UTF-8 either.
|
|
|
|
//
|
|
|
|
// If this fails we fall back to the 32 bit memory mechanism.
|
2015-03-08 14:20:20 +01:00
|
|
|
//
|
|
|
|
// However, on arm64, we ignore all this advice above and slam the
|
|
|
|
// allocation at 0x40 << 32 because when using 4k pages with 3-level
|
|
|
|
// translation buffers, the user address space is limited to 39 bits.
|
2015-04-10 22:14:43 -04:00
|
|
|
// On darwin/arm64, the address space is even smaller.
|
2015-02-19 13:38:46 -05:00
|
|
|
arenaSize := round(_MaxMem, _PageSize)
|
2017-01-13 14:19:52 -05:00
|
|
|
pSize = bitmapSize + spansSize + arenaSize + _PageSize
|
2015-02-19 13:38:46 -05:00
|
|
|
for i := 0; i <= 0x7f; i++ {
|
2015-04-10 22:14:43 -04:00
|
|
|
switch {
|
|
|
|
case GOARCH == "arm64" && GOOS == "darwin":
|
|
|
|
p = uintptr(i)<<40 | uintptrMask&(0x0013<<28)
|
|
|
|
case GOARCH == "arm64":
|
2015-03-08 14:20:20 +01:00
|
|
|
p = uintptr(i)<<40 | uintptrMask&(0x0040<<32)
|
2015-04-10 22:14:43 -04:00
|
|
|
default:
|
2015-03-08 14:20:20 +01:00
|
|
|
p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
|
|
|
|
}
|
2015-02-19 13:38:46 -05:00
|
|
|
p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
|
|
|
|
if p != 0 {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if p == 0 {
|
|
|
|
// On a 32-bit machine, we can't typically get away
|
|
|
|
// with a giant virtual address space reservation.
|
|
|
|
// Instead we map the memory information bitmap
|
|
|
|
// immediately after the data segment, large enough
|
runtime: use entire address space on 32 bit
In issue #13992, Russ mentioned that the heap bitmap footprint was
halved but that the bitmap size calculation hadn't been updated. This
presents the opportunity to either halve the bitmap size or double
the addressable virtual space. This CL doubles the addressable virtual
space. On 32 bit this can be tweaked further to allow the bitmap to
cover the entire 4GB virtual address space, removing a failure mode
if the kernel hands out memory with a too low address.
First, fix the calculation and double _MaxArena32 to cover 4GB virtual
memory space with the same bitmap size (256 MB).
Then, allow the fallback mode for the initial memory reservation
on 32 bit (or 64 bit with too little available virtual memory) to not
include space for the arena. mheap.sysAlloc will automatically reserve
additional space when the existing arena is full.
Finally, set arena_start to 0 in 32 bit mode, so that any address is
acceptable for subsequent (additional) reservations.
Before, the bitmap was always located just before arena_start, so
fix the two places relying on that assumption: Point the otherwise unused
mheap.bitmap to one byte after the end of the bitmap, and use it for
bitmap addressing instead of arena_start.
With arena_start set to 0 on 32 bit, the cgoInRange check is no longer a
sufficient check for Go pointers. Introduce and call inHeapOrStack to
check whether a pointer is to the Go heap or stack.
While we're here, remove sysReserveHigh which seems to be unused.
Fixes #13992
Change-Id: I592b513148a50b9d3967b5c5d94b86b3ec39acc2
Reviewed-on: https://go-review.googlesource.com/20471
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-03-09 10:00:12 +01:00
|
|
|
// to handle the entire 4GB address space (256 MB),
|
2015-02-19 13:38:46 -05:00
|
|
|
// along with a reservation for an initial arena.
|
|
|
|
// When that gets used up, we'll start asking the kernel
|
runtime: use entire address space on 32 bit
In issue #13992, Russ mentioned that the heap bitmap footprint was
halved but that the bitmap size calculation hadn't been updated. This
presents the opportunity to either halve the bitmap size or double
the addressable virtual space. This CL doubles the addressable virtual
space. On 32 bit this can be tweaked further to allow the bitmap to
cover the entire 4GB virtual address space, removing a failure mode
if the kernel hands out memory with a too low address.
First, fix the calculation and double _MaxArena32 to cover 4GB virtual
memory space with the same bitmap size (256 MB).
Then, allow the fallback mode for the initial memory reservation
on 32 bit (or 64 bit with too little available virtual memory) to not
include space for the arena. mheap.sysAlloc will automatically reserve
additional space when the existing arena is full.
Finally, set arena_start to 0 in 32 bit mode, so that any address is
acceptable for subsequent (additional) reservations.
Before, the bitmap was always located just before arena_start, so
fix the two places relying on that assumption: Point the otherwise unused
mheap.bitmap to one byte after the end of the bitmap, and use it for
bitmap addressing instead of arena_start.
With arena_start set to 0 on 32 bit, the cgoInRange check is no longer a
sufficient check for Go pointers. Introduce and call inHeapOrStack to
check whether a pointer is to the Go heap or stack.
While we're here, remove sysReserveHigh which seems to be unused.
Fixes #13992
Change-Id: I592b513148a50b9d3967b5c5d94b86b3ec39acc2
Reviewed-on: https://go-review.googlesource.com/20471
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-03-09 10:00:12 +01:00
|
|
|
// for any memory anywhere.
|
2015-02-19 13:38:46 -05:00
|
|
|
|
2017-04-06 14:32:37 -04:00
|
|
|
// We want to start the arena low, but if we're linked
|
|
|
|
// against C code, it's possible global constructors
|
|
|
|
// have called malloc and adjusted the process' brk.
|
|
|
|
// Query the brk so we can avoid trying to map the
|
|
|
|
// arena over it (which will cause the kernel to put
|
|
|
|
// the arena somewhere else, likely at a high
|
|
|
|
// address).
|
|
|
|
procBrk := sbrk0()
|
|
|
|
|
2015-02-19 13:38:46 -05:00
|
|
|
// If we fail to allocate, try again with a smaller arena.
|
|
|
|
// This is necessary on Android L where we share a process
|
|
|
|
// with ART, which reserves virtual memory aggressively.
|
runtime: use entire address space on 32 bit
In issue #13992, Russ mentioned that the heap bitmap footprint was
halved but that the bitmap size calculation hadn't been updated. This
presents the opportunity to either halve the bitmap size or double
the addressable virtual space. This CL doubles the addressable virtual
space. On 32 bit this can be tweaked further to allow the bitmap to
cover the entire 4GB virtual address space, removing a failure mode
if the kernel hands out memory with a too low address.
First, fix the calculation and double _MaxArena32 to cover 4GB virtual
memory space with the same bitmap size (256 MB).
Then, allow the fallback mode for the initial memory reservation
on 32 bit (or 64 bit with too little available virtual memory) to not
include space for the arena. mheap.sysAlloc will automatically reserve
additional space when the existing arena is full.
Finally, set arena_start to 0 in 32 bit mode, so that any address is
acceptable for subsequent (additional) reservations.
Before, the bitmap was always located just before arena_start, so
fix the two places relying on that assumption: Point the otherwise unused
mheap.bitmap to one byte after the end of the bitmap, and use it for
bitmap addressing instead of arena_start.
With arena_start set to 0 on 32 bit, the cgoInRange check is no longer a
sufficient check for Go pointers. Introduce and call inHeapOrStack to
check whether a pointer is to the Go heap or stack.
While we're here, remove sysReserveHigh which seems to be unused.
Fixes #13992
Change-Id: I592b513148a50b9d3967b5c5d94b86b3ec39acc2
Reviewed-on: https://go-review.googlesource.com/20471
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-03-09 10:00:12 +01:00
|
|
|
// In the worst case, fall back to a 0-sized initial arena,
|
|
|
|
// in the hope that subsequent reservations will succeed.
|
2015-02-19 13:38:46 -05:00
|
|
|
arenaSizes := []uintptr{
|
|
|
|
512 << 20,
|
|
|
|
256 << 20,
|
2015-02-24 11:11:56 -05:00
|
|
|
128 << 20,
|
runtime: use entire address space on 32 bit
In issue #13992, Russ mentioned that the heap bitmap footprint was
halved but that the bitmap size calculation hadn't been updated. This
presents the opportunity to either halve the bitmap size or double
the addressable virtual space. This CL doubles the addressable virtual
space. On 32 bit this can be tweaked further to allow the bitmap to
cover the entire 4GB virtual address space, removing a failure mode
if the kernel hands out memory with a too low address.
First, fix the calculation and double _MaxArena32 to cover 4GB virtual
memory space with the same bitmap size (256 MB).
Then, allow the fallback mode for the initial memory reservation
on 32 bit (or 64 bit with too little available virtual memory) to not
include space for the arena. mheap.sysAlloc will automatically reserve
additional space when the existing arena is full.
Finally, set arena_start to 0 in 32 bit mode, so that any address is
acceptable for subsequent (additional) reservations.
Before, the bitmap was always located just before arena_start, so
fix the two places relying on that assumption: Point the otherwise unused
mheap.bitmap to one byte after the end of the bitmap, and use it for
bitmap addressing instead of arena_start.
With arena_start set to 0 on 32 bit, the cgoInRange check is no longer a
sufficient check for Go pointers. Introduce and call inHeapOrStack to
check whether a pointer is to the Go heap or stack.
While we're here, remove sysReserveHigh which seems to be unused.
Fixes #13992
Change-Id: I592b513148a50b9d3967b5c5d94b86b3ec39acc2
Reviewed-on: https://go-review.googlesource.com/20471
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-03-09 10:00:12 +01:00
|
|
|
0,
|
2015-02-19 13:38:46 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, arenaSize := range arenaSizes {
|
|
|
|
// SysReserve treats the address we ask for, end, as a hint,
|
2016-03-01 23:21:55 +00:00
|
|
|
// not as an absolute requirement. If we ask for the end
|
2015-02-19 13:38:46 -05:00
|
|
|
// of the data segment but the operating system requires
|
|
|
|
// a little more space before we can start allocating, it will
|
2016-03-01 23:21:55 +00:00
|
|
|
// give out a slightly higher pointer. Except QEMU, which
|
2015-02-19 13:38:46 -05:00
|
|
|
// is buggy, as usual: it won't adjust the pointer upward.
|
|
|
|
// So adjust it upward a little bit ourselves: 1/4 MB to get
|
|
|
|
// away from the running binary image and then round up
|
|
|
|
// to a MB boundary.
|
2015-04-07 12:55:02 +12:00
|
|
|
p = round(firstmoduledata.end+(1<<18), 1<<20)
|
2015-02-19 13:38:46 -05:00
|
|
|
pSize = bitmapSize + spansSize + arenaSize + _PageSize
|
2017-04-06 14:32:37 -04:00
|
|
|
if p <= procBrk && procBrk < p+pSize {
|
|
|
|
// Move the start above the brk,
|
|
|
|
// leaving some room for future brk
|
|
|
|
// expansion.
|
|
|
|
p = round(procBrk+(1<<20), 1<<20)
|
|
|
|
}
|
2015-02-19 13:38:46 -05:00
|
|
|
p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
|
|
|
|
if p != 0 {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if p == 0 {
|
|
|
|
throw("runtime: cannot reserve arena virtual address space")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// PageSize can be larger than OS definition of page size,
|
|
|
|
// so SysReserve can give us a PageSize-unaligned pointer.
|
|
|
|
// To overcome this we ask for PageSize more and round up the pointer.
|
|
|
|
p1 := round(p, _PageSize)
|
2017-01-13 14:19:52 -05:00
|
|
|
pSize -= p1 - p
|
2015-02-19 13:38:46 -05:00
|
|
|
|
2016-10-04 16:03:00 -04:00
|
|
|
spansStart := p1
|
2017-01-13 14:19:52 -05:00
|
|
|
p1 += spansSize
|
|
|
|
mheap_.bitmap = p1 + bitmapSize
|
|
|
|
p1 += bitmapSize
|
runtime: use entire address space on 32 bit
In issue #13992, Russ mentioned that the heap bitmap footprint was
halved but that the bitmap size calculation hadn't been updated. This
presents the opportunity to either halve the bitmap size or double
the addressable virtual space. This CL doubles the addressable virtual
space. On 32 bit this can be tweaked further to allow the bitmap to
cover the entire 4GB virtual address space, removing a failure mode
if the kernel hands out memory with a too low address.
First, fix the calculation and double _MaxArena32 to cover 4GB virtual
memory space with the same bitmap size (256 MB).
Then, allow the fallback mode for the initial memory reservation
on 32 bit (or 64 bit with too little available virtual memory) to not
include space for the arena. mheap.sysAlloc will automatically reserve
additional space when the existing arena is full.
Finally, set arena_start to 0 in 32 bit mode, so that any address is
acceptable for subsequent (additional) reservations.
Before, the bitmap was always located just before arena_start, so
fix the two places relying on that assumption: Point the otherwise unused
mheap.bitmap to one byte after the end of the bitmap, and use it for
bitmap addressing instead of arena_start.
With arena_start set to 0 on 32 bit, the cgoInRange check is no longer a
sufficient check for Go pointers. Introduce and call inHeapOrStack to
check whether a pointer is to the Go heap or stack.
While we're here, remove sysReserveHigh which seems to be unused.
Fixes #13992
Change-Id: I592b513148a50b9d3967b5c5d94b86b3ec39acc2
Reviewed-on: https://go-review.googlesource.com/20471
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-03-09 10:00:12 +01:00
|
|
|
if sys.PtrSize == 4 {
|
|
|
|
// Set arena_start such that we can accept memory
|
|
|
|
// reservations located anywhere in the 4GB virtual space.
|
|
|
|
mheap_.arena_start = 0
|
|
|
|
} else {
|
2017-01-13 14:19:52 -05:00
|
|
|
mheap_.arena_start = p1
|
runtime: use entire address space on 32 bit
In issue #13992, Russ mentioned that the heap bitmap footprint was
halved but that the bitmap size calculation hadn't been updated. This
presents the opportunity to either halve the bitmap size or double
the addressable virtual space. This CL doubles the addressable virtual
space. On 32 bit this can be tweaked further to allow the bitmap to
cover the entire 4GB virtual address space, removing a failure mode
if the kernel hands out memory with a too low address.
First, fix the calculation and double _MaxArena32 to cover 4GB virtual
memory space with the same bitmap size (256 MB).
Then, allow the fallback mode for the initial memory reservation
on 32 bit (or 64 bit with too little available virtual memory) to not
include space for the arena. mheap.sysAlloc will automatically reserve
additional space when the existing arena is full.
Finally, set arena_start to 0 in 32 bit mode, so that any address is
acceptable for subsequent (additional) reservations.
Before, the bitmap was always located just before arena_start, so
fix the two places relying on that assumption: Point the otherwise unused
mheap.bitmap to one byte after the end of the bitmap, and use it for
bitmap addressing instead of arena_start.
With arena_start set to 0 on 32 bit, the cgoInRange check is no longer a
sufficient check for Go pointers. Introduce and call inHeapOrStack to
check whether a pointer is to the Go heap or stack.
While we're here, remove sysReserveHigh which seems to be unused.
Fixes #13992
Change-Id: I592b513148a50b9d3967b5c5d94b86b3ec39acc2
Reviewed-on: https://go-review.googlesource.com/20471
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-03-09 10:00:12 +01:00
|
|
|
}
|
2015-02-19 13:38:46 -05:00
|
|
|
mheap_.arena_end = p + pSize
|
2017-01-13 14:19:52 -05:00
|
|
|
mheap_.arena_used = p1
|
2015-02-19 13:38:46 -05:00
|
|
|
mheap_.arena_reserved = reserved
|
|
|
|
|
|
|
|
if mheap_.arena_start&(_PageSize-1) != 0 {
|
|
|
|
println("bad pagesize", hex(p), hex(p1), hex(spansSize), hex(bitmapSize), hex(_PageSize), "start", hex(mheap_.arena_start))
|
|
|
|
throw("misrounded allocation in mallocinit")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Initialize the rest of the allocator.
|
2016-10-04 16:03:00 -04:00
|
|
|
mheap_.init(spansStart, spansSize)
|
2015-02-19 13:38:46 -05:00
|
|
|
_g_ := getg()
|
|
|
|
_g_.m.mcache = allocmcache()
|
|
|
|
}
|
|
|
|
|
2016-04-28 11:19:53 -04:00
|
|
|
// sysAlloc allocates the next n bytes from the heap arena. The
|
|
|
|
// returned pointer is always _PageSize aligned and between
|
|
|
|
// h.arena_start and h.arena_end. sysAlloc returns nil on failure.
|
|
|
|
// There is no corresponding free function.
|
2015-11-11 16:13:51 -08:00
|
|
|
func (h *mheap) sysAlloc(n uintptr) unsafe.Pointer {
|
2015-10-26 17:53:22 -07:00
|
|
|
if n > h.arena_end-h.arena_used {
|
runtime: fix confusion between _MaxMem and _MaxArena32
Currently both _MaxMem and _MaxArena32 represent the maximum arena
size on 32-bit hosts (except on MIPS32 where _MaxMem is confusingly
smaller than _MaxArena32).
Clean up sysAlloc so that it always uses _MaxMem, which is the maximum
arena size on both 32- and 64-bit architectures and is the arena size
we allocate auxiliary structures for. This lets us simplify and unify
some code paths and eliminate _MaxArena32.
Fixes #18651. mheap.sysAlloc currently assumes that if the arena is
small, we must be on a 32-bit machine and can therefore grow the arena
to _MaxArena32. This breaks down on darwin/arm64, where _MaxMem is
only 2 GB. As a result, on darwin/arm64, we only reserve spans and
bitmap space for a 2 GB heap, and if the application tries to allocate
beyond that, sysAlloc takes the 32-bit path, tries to grow the arena
beyond 2 GB, and panics when it tries to grow the spans array
allocation past its reserved size. This has probably been a problem
for several releases now, but was only noticed recently because
mapSpans didn't check the bounds on the span reservation until
recently. Most likely it corrupted the bitmap before. By using _MaxMem
consistently, we avoid thinking that we can grow the arena larger than
we have auxiliary structures for.
Change-Id: Ifef28cb746a3ead4b31c1d7348495c2242fef520
Reviewed-on: https://go-review.googlesource.com/35253
Reviewed-by: David Crawshaw <crawshaw@golang.org>
Reviewed-by: Elias Naur <elias.naur@gmail.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2017-01-13 13:23:41 -05:00
|
|
|
// If we haven't grown the arena to _MaxMem yet, try
|
|
|
|
// to reserve some more address space.
|
2015-02-19 13:38:46 -05:00
|
|
|
p_size := round(n+_PageSize, 256<<20)
|
2015-11-16 14:37:59 -05:00
|
|
|
new_end := h.arena_end + p_size // Careful: can overflow
|
runtime: fix confusion between _MaxMem and _MaxArena32
Currently both _MaxMem and _MaxArena32 represent the maximum arena
size on 32-bit hosts (except on MIPS32 where _MaxMem is confusingly
smaller than _MaxArena32).
Clean up sysAlloc so that it always uses _MaxMem, which is the maximum
arena size on both 32- and 64-bit architectures and is the arena size
we allocate auxiliary structures for. This lets us simplify and unify
some code paths and eliminate _MaxArena32.
Fixes #18651. mheap.sysAlloc currently assumes that if the arena is
small, we must be on a 32-bit machine and can therefore grow the arena
to _MaxArena32. This breaks down on darwin/arm64, where _MaxMem is
only 2 GB. As a result, on darwin/arm64, we only reserve spans and
bitmap space for a 2 GB heap, and if the application tries to allocate
beyond that, sysAlloc takes the 32-bit path, tries to grow the arena
beyond 2 GB, and panics when it tries to grow the spans array
allocation past its reserved size. This has probably been a problem
for several releases now, but was only noticed recently because
mapSpans didn't check the bounds on the span reservation until
recently. Most likely it corrupted the bitmap before. By using _MaxMem
consistently, we avoid thinking that we can grow the arena larger than
we have auxiliary structures for.
Change-Id: Ifef28cb746a3ead4b31c1d7348495c2242fef520
Reviewed-on: https://go-review.googlesource.com/35253
Reviewed-by: David Crawshaw <crawshaw@golang.org>
Reviewed-by: Elias Naur <elias.naur@gmail.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2017-01-13 13:23:41 -05:00
|
|
|
if h.arena_end <= new_end && new_end-h.arena_start-1 <= _MaxMem {
|
2015-02-19 13:38:46 -05:00
|
|
|
// TODO: It would be bad if part of the arena
|
|
|
|
// is reserved and part is not.
|
|
|
|
var reserved bool
|
2015-10-15 14:33:50 -07:00
|
|
|
p := uintptr(sysReserve(unsafe.Pointer(h.arena_end), p_size, &reserved))
|
2015-09-27 03:56:05 +10:00
|
|
|
if p == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
2015-02-19 13:38:46 -05:00
|
|
|
if p == h.arena_end {
|
2017-04-07 13:49:51 -04:00
|
|
|
// The new reservation is contiguous
|
|
|
|
// with the old reservation.
|
2015-02-19 13:38:46 -05:00
|
|
|
h.arena_end = new_end
|
|
|
|
h.arena_reserved = reserved
|
runtime: fix confusion between _MaxMem and _MaxArena32
Currently both _MaxMem and _MaxArena32 represent the maximum arena
size on 32-bit hosts (except on MIPS32 where _MaxMem is confusingly
smaller than _MaxArena32).
Clean up sysAlloc so that it always uses _MaxMem, which is the maximum
arena size on both 32- and 64-bit architectures and is the arena size
we allocate auxiliary structures for. This lets us simplify and unify
some code paths and eliminate _MaxArena32.
Fixes #18651. mheap.sysAlloc currently assumes that if the arena is
small, we must be on a 32-bit machine and can therefore grow the arena
to _MaxArena32. This breaks down on darwin/arm64, where _MaxMem is
only 2 GB. As a result, on darwin/arm64, we only reserve spans and
bitmap space for a 2 GB heap, and if the application tries to allocate
beyond that, sysAlloc takes the 32-bit path, tries to grow the arena
beyond 2 GB, and panics when it tries to grow the spans array
allocation past its reserved size. This has probably been a problem
for several releases now, but was only noticed recently because
mapSpans didn't check the bounds on the span reservation until
recently. Most likely it corrupted the bitmap before. By using _MaxMem
consistently, we avoid thinking that we can grow the arena larger than
we have auxiliary structures for.
Change-Id: Ifef28cb746a3ead4b31c1d7348495c2242fef520
Reviewed-on: https://go-review.googlesource.com/35253
Reviewed-by: David Crawshaw <crawshaw@golang.org>
Reviewed-by: Elias Naur <elias.naur@gmail.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2017-01-13 13:23:41 -05:00
|
|
|
} else if h.arena_start <= p && p+p_size-h.arena_start-1 <= _MaxMem {
|
2017-04-07 13:49:51 -04:00
|
|
|
// We were able to reserve more memory
|
|
|
|
// within the arena space, but it's
|
|
|
|
// not contiguous with our previous
|
|
|
|
// reservation. Skip over the unused
|
|
|
|
// address space.
|
|
|
|
//
|
2015-02-19 13:38:46 -05:00
|
|
|
// Keep everything page-aligned.
|
|
|
|
// Our pages are bigger than hardware pages.
|
|
|
|
h.arena_end = p + p_size
|
2016-02-29 15:01:00 -08:00
|
|
|
used := p + (-p & (_PageSize - 1))
|
2017-04-07 13:49:51 -04:00
|
|
|
h.setArenaUsed(used, false)
|
2015-02-19 13:38:46 -05:00
|
|
|
h.arena_reserved = reserved
|
|
|
|
} else {
|
2015-11-15 23:09:16 -05:00
|
|
|
// We haven't added this allocation to
|
|
|
|
// the stats, so subtract it from a
|
|
|
|
// fake stat (but avoid underflow).
|
|
|
|
stat := uint64(p_size)
|
2015-10-15 14:33:50 -07:00
|
|
|
sysFree(unsafe.Pointer(p), p_size, &stat)
|
2015-02-19 13:38:46 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-26 17:53:22 -07:00
|
|
|
if n <= h.arena_end-h.arena_used {
|
2015-02-19 13:38:46 -05:00
|
|
|
// Keep taking from our reservation.
|
|
|
|
p := h.arena_used
|
2015-10-15 14:33:50 -07:00
|
|
|
sysMap(unsafe.Pointer(p), n, h.arena_reserved, &memstats.heap_sys)
|
2017-04-07 13:49:51 -04:00
|
|
|
h.setArenaUsed(p+n, true)
|
2015-02-19 13:38:46 -05:00
|
|
|
|
2016-02-29 15:01:00 -08:00
|
|
|
if p&(_PageSize-1) != 0 {
|
2015-02-19 13:38:46 -05:00
|
|
|
throw("misrounded allocation in MHeap_SysAlloc")
|
|
|
|
}
|
2015-10-15 14:33:50 -07:00
|
|
|
return unsafe.Pointer(p)
|
2015-02-19 13:38:46 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
// If using 64-bit, our reservation is all we have.
|
runtime: fix confusion between _MaxMem and _MaxArena32
Currently both _MaxMem and _MaxArena32 represent the maximum arena
size on 32-bit hosts (except on MIPS32 where _MaxMem is confusingly
smaller than _MaxArena32).
Clean up sysAlloc so that it always uses _MaxMem, which is the maximum
arena size on both 32- and 64-bit architectures and is the arena size
we allocate auxiliary structures for. This lets us simplify and unify
some code paths and eliminate _MaxArena32.
Fixes #18651. mheap.sysAlloc currently assumes that if the arena is
small, we must be on a 32-bit machine and can therefore grow the arena
to _MaxArena32. This breaks down on darwin/arm64, where _MaxMem is
only 2 GB. As a result, on darwin/arm64, we only reserve spans and
bitmap space for a 2 GB heap, and if the application tries to allocate
beyond that, sysAlloc takes the 32-bit path, tries to grow the arena
beyond 2 GB, and panics when it tries to grow the spans array
allocation past its reserved size. This has probably been a problem
for several releases now, but was only noticed recently because
mapSpans didn't check the bounds on the span reservation until
recently. Most likely it corrupted the bitmap before. By using _MaxMem
consistently, we avoid thinking that we can grow the arena larger than
we have auxiliary structures for.
Change-Id: Ifef28cb746a3ead4b31c1d7348495c2242fef520
Reviewed-on: https://go-review.googlesource.com/35253
Reviewed-by: David Crawshaw <crawshaw@golang.org>
Reviewed-by: Elias Naur <elias.naur@gmail.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2017-01-13 13:23:41 -05:00
|
|
|
if sys.PtrSize != 4 {
|
2015-02-19 13:38:46 -05:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// On 32-bit, once the reservation is gone we can
|
runtime: use entire address space on 32 bit
In issue #13992, Russ mentioned that the heap bitmap footprint was
halved but that the bitmap size calculation hadn't been updated. This
presents the opportunity to either halve the bitmap size or double
the addressable virtual space. This CL doubles the addressable virtual
space. On 32 bit this can be tweaked further to allow the bitmap to
cover the entire 4GB virtual address space, removing a failure mode
if the kernel hands out memory with a too low address.
First, fix the calculation and double _MaxArena32 to cover 4GB virtual
memory space with the same bitmap size (256 MB).
Then, allow the fallback mode for the initial memory reservation
on 32 bit (or 64 bit with too little available virtual memory) to not
include space for the arena. mheap.sysAlloc will automatically reserve
additional space when the existing arena is full.
Finally, set arena_start to 0 in 32 bit mode, so that any address is
acceptable for subsequent (additional) reservations.
Before, the bitmap was always located just before arena_start, so
fix the two places relying on that assumption: Point the otherwise unused
mheap.bitmap to one byte after the end of the bitmap, and use it for
bitmap addressing instead of arena_start.
With arena_start set to 0 on 32 bit, the cgoInRange check is no longer a
sufficient check for Go pointers. Introduce and call inHeapOrStack to
check whether a pointer is to the Go heap or stack.
While we're here, remove sysReserveHigh which seems to be unused.
Fixes #13992
Change-Id: I592b513148a50b9d3967b5c5d94b86b3ec39acc2
Reviewed-on: https://go-review.googlesource.com/20471
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-03-09 10:00:12 +01:00
|
|
|
// try to get memory at a location chosen by the OS.
|
2015-02-19 13:38:46 -05:00
|
|
|
p_size := round(n, _PageSize) + _PageSize
|
|
|
|
p := uintptr(sysAlloc(p_size, &memstats.heap_sys))
|
|
|
|
if p == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
runtime: fix confusion between _MaxMem and _MaxArena32
Currently both _MaxMem and _MaxArena32 represent the maximum arena
size on 32-bit hosts (except on MIPS32 where _MaxMem is confusingly
smaller than _MaxArena32).
Clean up sysAlloc so that it always uses _MaxMem, which is the maximum
arena size on both 32- and 64-bit architectures and is the arena size
we allocate auxiliary structures for. This lets us simplify and unify
some code paths and eliminate _MaxArena32.
Fixes #18651. mheap.sysAlloc currently assumes that if the arena is
small, we must be on a 32-bit machine and can therefore grow the arena
to _MaxArena32. This breaks down on darwin/arm64, where _MaxMem is
only 2 GB. As a result, on darwin/arm64, we only reserve spans and
bitmap space for a 2 GB heap, and if the application tries to allocate
beyond that, sysAlloc takes the 32-bit path, tries to grow the arena
beyond 2 GB, and panics when it tries to grow the spans array
allocation past its reserved size. This has probably been a problem
for several releases now, but was only noticed recently because
mapSpans didn't check the bounds on the span reservation until
recently. Most likely it corrupted the bitmap before. By using _MaxMem
consistently, we avoid thinking that we can grow the arena larger than
we have auxiliary structures for.
Change-Id: Ifef28cb746a3ead4b31c1d7348495c2242fef520
Reviewed-on: https://go-review.googlesource.com/35253
Reviewed-by: David Crawshaw <crawshaw@golang.org>
Reviewed-by: Elias Naur <elias.naur@gmail.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2017-01-13 13:23:41 -05:00
|
|
|
if p < h.arena_start || p+p_size-h.arena_start > _MaxMem {
|
|
|
|
// This shouldn't be possible because _MaxMem is the
|
|
|
|
// whole address space on 32-bit.
|
|
|
|
top := uint64(h.arena_start) + _MaxMem
|
2016-01-26 22:13:01 -05:00
|
|
|
print("runtime: memory allocated by OS (", hex(p), ") not in usable range [", hex(h.arena_start), ",", hex(top), ")\n")
|
2015-10-15 14:33:50 -07:00
|
|
|
sysFree(unsafe.Pointer(p), p_size, &memstats.heap_sys)
|
2015-02-19 13:38:46 -05:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
p_end := p + p_size
|
|
|
|
p += -p & (_PageSize - 1)
|
2016-02-29 15:01:00 -08:00
|
|
|
if p+n > h.arena_used {
|
2017-04-07 13:49:51 -04:00
|
|
|
h.setArenaUsed(p+n, true)
|
2015-02-19 13:38:46 -05:00
|
|
|
if p_end > h.arena_end {
|
|
|
|
h.arena_end = p_end
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-02-29 15:01:00 -08:00
|
|
|
if p&(_PageSize-1) != 0 {
|
2015-02-19 13:38:46 -05:00
|
|
|
throw("misrounded allocation in MHeap_SysAlloc")
|
|
|
|
}
|
2015-10-15 14:33:50 -07:00
|
|
|
return unsafe.Pointer(p)
|
2015-02-19 13:38:46 -05:00
|
|
|
}
|
|
|
|
|
2014-09-16 10:22:15 -04:00
|
|
|
// zerobase is the base address for all 0-byte allocations.
// mallocgc returns a pointer to it when asked for an object of size 0.
|
|
|
|
var zerobase uintptr
|
2014-07-30 09:01:52 -07:00
|
|
|
|
2016-03-02 12:15:02 -05:00
|
|
|
// nextFreeFast returns the next free object if one is quickly available.
|
|
|
|
// Otherwise it returns 0.
|
[dev.garbage] runtime: reintroduce no-zeroing optimization
Currently we always zero objects when we allocate them. We used to
have an optimization that would not zero objects that had not been
allocated since the whole span was last zeroed (either by getting it
from the system or by getting it from the heap, which does a bulk
zero), but this depended on the sweeper clobbering the first two words
of each object. Hence, we lost this optimization when the bitmap
sweeper went away.
Re-introduce this optimization using a different mechanism. Each span
already keeps a flag indicating that it just came from the OS or was
just bulk zeroed by the mheap. We can simply use this flag to know
when we don't need to zero an object. This is slightly less efficient
than the old optimization: if a span gets allocated and partially
used, then GC happens and the span gets returned to the mcentral, then
the span gets re-acquired, the old optimization knew that it only had
to re-zero the objects that had been reclaimed, whereas this
optimization will re-zero everything. However, in this case, you're
already paying for the garbage collection, and you've only wasted one
zeroing of the span, so in practice there seems to be little
difference. (If we did want to revive the full optimization, each span
could keep track of a frontier beyond which all free slots are zeroed.
I prototyped this and it didn't obvious do any better than the much
simpler approach in this commit.)
This significantly improves BinaryTree17, which is allocation-heavy
(and runs first, so most pages are already zeroed), and slightly
improves everything else.
name old time/op new time/op delta
XBenchGarbage-12 2.15ms ± 1% 2.14ms ± 1% -0.80% (p=0.000 n=17+17)
name old time/op new time/op delta
BinaryTree17-12 2.71s ± 1% 2.56s ± 1% -5.73% (p=0.000 n=18+19)
DivconstI64-12 1.70ns ± 1% 1.70ns ± 1% ~ (p=0.562 n=18+18)
DivconstU64-12 1.74ns ± 2% 1.74ns ± 1% ~ (p=0.394 n=20+20)
DivconstI32-12 1.74ns ± 0% 1.74ns ± 0% ~ (all samples are equal)
DivconstU32-12 1.66ns ± 1% 1.66ns ± 0% ~ (p=0.516 n=15+16)
DivconstI16-12 1.84ns ± 0% 1.84ns ± 0% ~ (all samples are equal)
DivconstU16-12 1.82ns ± 0% 1.82ns ± 0% ~ (all samples are equal)
DivconstI8-12 1.79ns ± 0% 1.79ns ± 0% ~ (all samples are equal)
DivconstU8-12 1.60ns ± 0% 1.60ns ± 1% ~ (p=0.603 n=17+19)
Fannkuch11-12 2.11s ± 1% 2.11s ± 0% ~ (p=0.333 n=16+19)
FmtFprintfEmpty-12 45.1ns ± 4% 45.4ns ± 5% ~ (p=0.111 n=20+20)
FmtFprintfString-12 134ns ± 0% 129ns ± 0% -3.45% (p=0.000 n=18+16)
FmtFprintfInt-12 131ns ± 1% 129ns ± 1% -1.54% (p=0.000 n=16+18)
FmtFprintfIntInt-12 205ns ± 2% 203ns ± 0% -0.56% (p=0.014 n=20+18)
FmtFprintfPrefixedInt-12 200ns ± 2% 197ns ± 1% -1.48% (p=0.000 n=20+18)
FmtFprintfFloat-12 256ns ± 1% 256ns ± 0% -0.21% (p=0.008 n=18+20)
FmtManyArgs-12 805ns ± 0% 804ns ± 0% -0.19% (p=0.001 n=18+18)
GobDecode-12 7.21ms ± 1% 7.14ms ± 1% -0.92% (p=0.000 n=19+20)
GobEncode-12 5.88ms ± 1% 5.88ms ± 1% ~ (p=0.641 n=18+19)
Gzip-12 218ms ± 1% 218ms ± 1% ~ (p=0.271 n=19+18)
Gunzip-12 37.1ms ± 0% 36.9ms ± 0% -0.29% (p=0.000 n=18+17)
HTTPClientServer-12 78.1µs ± 2% 77.4µs ± 2% ~ (p=0.070 n=19+19)
JSONEncode-12 15.5ms ± 1% 15.5ms ± 0% ~ (p=0.063 n=20+18)
JSONDecode-12 56.1ms ± 0% 55.4ms ± 1% -1.18% (p=0.000 n=19+18)
Mandelbrot200-12 4.05ms ± 0% 4.06ms ± 0% +0.29% (p=0.001 n=18+18)
GoParse-12 3.28ms ± 1% 3.21ms ± 1% -2.30% (p=0.000 n=20+20)
RegexpMatchEasy0_32-12 69.4ns ± 2% 69.3ns ± 1% ~ (p=0.205 n=18+16)
RegexpMatchEasy0_1K-12 239ns ± 0% 239ns ± 0% ~ (all samples are equal)
RegexpMatchEasy1_32-12 69.4ns ± 1% 69.4ns ± 1% ~ (p=0.620 n=15+18)
RegexpMatchEasy1_1K-12 370ns ± 1% 369ns ± 2% ~ (p=0.088 n=20+20)
RegexpMatchMedium_32-12 108ns ± 0% 108ns ± 0% ~ (all samples are equal)
RegexpMatchMedium_1K-12 33.6µs ± 3% 33.5µs ± 3% ~ (p=0.718 n=20+20)
RegexpMatchHard_32-12 1.68µs ± 1% 1.67µs ± 2% ~ (p=0.316 n=20+20)
RegexpMatchHard_1K-12 50.5µs ± 3% 50.4µs ± 3% ~ (p=0.659 n=20+20)
Revcomp-12 381ms ± 1% 381ms ± 1% ~ (p=0.916 n=19+18)
Template-12 66.5ms ± 1% 65.8ms ± 2% -1.08% (p=0.000 n=20+20)
TimeParse-12 317ns ± 0% 319ns ± 0% +0.48% (p=0.000 n=19+12)
TimeFormat-12 338ns ± 0% 338ns ± 0% ~ (p=0.124 n=19+18)
[Geo mean] 5.99µs 5.96µs -0.54%
Change-Id: I638ffd9d9f178835bbfa499bac20bd7224f1a907
Reviewed-on: https://go-review.googlesource.com/22591
Reviewed-by: Rick Hudson <rlh@golang.org>
2016-04-28 15:32:01 -04:00
|
|
|
func nextFreeFast(s *mspan) gclinkptr {
|
2016-03-31 10:45:36 -04:00
|
|
|
theBit := sys.Ctz64(s.allocCache) // Is there a free object in the allocCache?
|
|
|
|
if theBit < 64 {
|
|
|
|
result := s.freeindex + uintptr(theBit)
|
2016-03-02 12:15:02 -05:00
|
|
|
if result < s.nelems {
|
2016-03-31 10:45:36 -04:00
|
|
|
freeidx := result + 1
|
2016-03-02 12:15:02 -05:00
|
|
|
if freeidx%64 == 0 && freeidx != s.nelems {
|
2016-04-29 12:09:36 -04:00
|
|
|
return 0
|
2016-03-02 12:15:02 -05:00
|
|
|
}
|
2017-03-14 13:25:12 -07:00
|
|
|
s.allocCache >>= uint(theBit + 1)
|
2016-03-02 12:15:02 -05:00
|
|
|
s.freeindex = freeidx
|
|
|
|
v := gclinkptr(result*s.elemsize + s.base())
|
|
|
|
s.allocCount++
|
|
|
|
return v
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
2016-02-08 12:36:23 -05:00
|
|
|
// nextFree returns the next free object from the cached span if one is available.
|
|
|
|
// Otherwise it refills the cache with a span with an available object and
|
|
|
|
// returns that object along with a flag indicating that this was a heavy
|
|
|
|
// weight allocation. If it is a heavy weight allocation the caller must
|
|
|
|
// determine whether a new GC cycle needs to be started or if the GC is active
|
|
|
|
// whether this goroutine needs to assist the GC.
|
2016-06-27 12:23:39 +02:00
|
|
|
func (c *mcache) nextFree(sizeclass uint8) (v gclinkptr, s *mspan, shouldhelpgc bool) {
|
[dev.garbage] runtime: reintroduce no-zeroing optimization
Currently we always zero objects when we allocate them. We used to
have an optimization that would not zero objects that had not been
allocated since the whole span was last zeroed (either by getting it
from the system or by getting it from the heap, which does a bulk
zero), but this depended on the sweeper clobbering the first two words
of each object. Hence, we lost this optimization when the bitmap
sweeper went away.
Re-introduce this optimization using a different mechanism. Each span
already keeps a flag indicating that it just came from the OS or was
just bulk zeroed by the mheap. We can simply use this flag to know
when we don't need to zero an object. This is slightly less efficient
than the old optimization: if a span gets allocated and partially
used, then GC happens and the span gets returned to the mcentral, then
the span gets re-acquired, the old optimization knew that it only had
to re-zero the objects that had been reclaimed, whereas this
optimization will re-zero everything. However, in this case, you're
already paying for the garbage collection, and you've only wasted one
zeroing of the span, so in practice there seems to be little
difference. (If we did want to revive the full optimization, each span
could keep track of a frontier beyond which all free slots are zeroed.
I prototyped this and it didn't obvious do any better than the much
simpler approach in this commit.)
This significantly improves BinaryTree17, which is allocation-heavy
(and runs first, so most pages are already zeroed), and slightly
improves everything else.
name old time/op new time/op delta
XBenchGarbage-12 2.15ms ± 1% 2.14ms ± 1% -0.80% (p=0.000 n=17+17)
name old time/op new time/op delta
BinaryTree17-12 2.71s ± 1% 2.56s ± 1% -5.73% (p=0.000 n=18+19)
DivconstI64-12 1.70ns ± 1% 1.70ns ± 1% ~ (p=0.562 n=18+18)
DivconstU64-12 1.74ns ± 2% 1.74ns ± 1% ~ (p=0.394 n=20+20)
DivconstI32-12 1.74ns ± 0% 1.74ns ± 0% ~ (all samples are equal)
DivconstU32-12 1.66ns ± 1% 1.66ns ± 0% ~ (p=0.516 n=15+16)
DivconstI16-12 1.84ns ± 0% 1.84ns ± 0% ~ (all samples are equal)
DivconstU16-12 1.82ns ± 0% 1.82ns ± 0% ~ (all samples are equal)
DivconstI8-12 1.79ns ± 0% 1.79ns ± 0% ~ (all samples are equal)
DivconstU8-12 1.60ns ± 0% 1.60ns ± 1% ~ (p=0.603 n=17+19)
Fannkuch11-12 2.11s ± 1% 2.11s ± 0% ~ (p=0.333 n=16+19)
FmtFprintfEmpty-12 45.1ns ± 4% 45.4ns ± 5% ~ (p=0.111 n=20+20)
FmtFprintfString-12 134ns ± 0% 129ns ± 0% -3.45% (p=0.000 n=18+16)
FmtFprintfInt-12 131ns ± 1% 129ns ± 1% -1.54% (p=0.000 n=16+18)
FmtFprintfIntInt-12 205ns ± 2% 203ns ± 0% -0.56% (p=0.014 n=20+18)
FmtFprintfPrefixedInt-12 200ns ± 2% 197ns ± 1% -1.48% (p=0.000 n=20+18)
FmtFprintfFloat-12 256ns ± 1% 256ns ± 0% -0.21% (p=0.008 n=18+20)
FmtManyArgs-12 805ns ± 0% 804ns ± 0% -0.19% (p=0.001 n=18+18)
GobDecode-12 7.21ms ± 1% 7.14ms ± 1% -0.92% (p=0.000 n=19+20)
GobEncode-12 5.88ms ± 1% 5.88ms ± 1% ~ (p=0.641 n=18+19)
Gzip-12 218ms ± 1% 218ms ± 1% ~ (p=0.271 n=19+18)
Gunzip-12 37.1ms ± 0% 36.9ms ± 0% -0.29% (p=0.000 n=18+17)
HTTPClientServer-12 78.1µs ± 2% 77.4µs ± 2% ~ (p=0.070 n=19+19)
JSONEncode-12 15.5ms ± 1% 15.5ms ± 0% ~ (p=0.063 n=20+18)
JSONDecode-12 56.1ms ± 0% 55.4ms ± 1% -1.18% (p=0.000 n=19+18)
Mandelbrot200-12 4.05ms ± 0% 4.06ms ± 0% +0.29% (p=0.001 n=18+18)
GoParse-12 3.28ms ± 1% 3.21ms ± 1% -2.30% (p=0.000 n=20+20)
RegexpMatchEasy0_32-12 69.4ns ± 2% 69.3ns ± 1% ~ (p=0.205 n=18+16)
RegexpMatchEasy0_1K-12 239ns ± 0% 239ns ± 0% ~ (all samples are equal)
RegexpMatchEasy1_32-12 69.4ns ± 1% 69.4ns ± 1% ~ (p=0.620 n=15+18)
RegexpMatchEasy1_1K-12 370ns ± 1% 369ns ± 2% ~ (p=0.088 n=20+20)
RegexpMatchMedium_32-12 108ns ± 0% 108ns ± 0% ~ (all samples are equal)
RegexpMatchMedium_1K-12 33.6µs ± 3% 33.5µs ± 3% ~ (p=0.718 n=20+20)
RegexpMatchHard_32-12 1.68µs ± 1% 1.67µs ± 2% ~ (p=0.316 n=20+20)
RegexpMatchHard_1K-12 50.5µs ± 3% 50.4µs ± 3% ~ (p=0.659 n=20+20)
Revcomp-12 381ms ± 1% 381ms ± 1% ~ (p=0.916 n=19+18)
Template-12 66.5ms ± 1% 65.8ms ± 2% -1.08% (p=0.000 n=20+20)
TimeParse-12 317ns ± 0% 319ns ± 0% +0.48% (p=0.000 n=19+12)
TimeFormat-12 338ns ± 0% 338ns ± 0% ~ (p=0.124 n=19+18)
[Geo mean] 5.99µs 5.96µs -0.54%
Change-Id: I638ffd9d9f178835bbfa499bac20bd7224f1a907
Reviewed-on: https://go-review.googlesource.com/22591
Reviewed-by: Rick Hudson <rlh@golang.org>
2016-04-28 15:32:01 -04:00
|
|
|
s = c.alloc[sizeclass]
|
2016-02-11 13:57:58 -05:00
|
|
|
shouldhelpgc = false
|
2016-02-24 14:36:30 -05:00
|
|
|
freeIndex := s.nextFreeIndex()
|
2016-02-11 13:57:58 -05:00
|
|
|
if freeIndex == s.nelems {
|
|
|
|
// The span is full.
|
2016-03-02 12:15:02 -05:00
|
|
|
if uintptr(s.allocCount) != s.nelems {
|
2016-02-24 14:36:30 -05:00
|
|
|
println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
|
2016-03-02 12:15:02 -05:00
|
|
|
throw("s.allocCount != s.nelems && freeIndex == s.nelems")
|
2016-02-11 13:57:58 -05:00
|
|
|
}
|
2016-02-08 12:36:23 -05:00
|
|
|
systemstack(func() {
|
|
|
|
c.refill(int32(sizeclass))
|
|
|
|
})
|
|
|
|
shouldhelpgc = true
|
|
|
|
s = c.alloc[sizeclass]
|
2016-02-24 14:36:30 -05:00
|
|
|
|
|
|
|
freeIndex = s.nextFreeIndex()
|
2016-02-11 13:57:58 -05:00
|
|
|
}
|
2016-02-24 14:36:30 -05:00
|
|
|
|
2016-02-11 13:57:58 -05:00
|
|
|
if freeIndex >= s.nelems {
|
|
|
|
throw("freeIndex is not valid")
|
2016-02-08 12:36:23 -05:00
|
|
|
}
|
2016-02-11 13:57:58 -05:00
|
|
|
|
|
|
|
v = gclinkptr(freeIndex*s.elemsize + s.base())
|
2016-02-16 17:16:43 -05:00
|
|
|
s.allocCount++
|
|
|
|
if uintptr(s.allocCount) > s.nelems {
|
2016-02-24 14:36:30 -05:00
|
|
|
println("s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
|
2016-02-16 17:16:43 -05:00
|
|
|
throw("s.allocCount > s.nelems")
|
2016-02-11 13:57:58 -05:00
|
|
|
}
|
2016-02-08 12:36:23 -05:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2014-08-05 17:03:06 +04:00
|
|
|
// Allocate an object of size bytes.
|
|
|
|
// Small objects are allocated from the per-P cache's free lists.
|
2014-07-30 09:01:52 -07:00
|
|
|
// Large objects (> 32 kB) are allocated straight from the heap.
|
2016-04-19 19:35:10 -07:00
|
|
|
func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
|
2015-03-05 17:33:08 -05:00
|
|
|
if gcphase == _GCmarktermination {
|
|
|
|
throw("mallocgc called with gcphase == _GCmarktermination")
|
|
|
|
}
|
2015-03-08 20:56:15 -04:00
|
|
|
|
2014-07-30 09:01:52 -07:00
|
|
|
if size == 0 {
|
2014-09-16 10:22:15 -04:00
|
|
|
return unsafe.Pointer(&zerobase)
|
2014-07-30 09:01:52 -07:00
|
|
|
}
|
|
|
|
|
2015-03-08 20:56:15 -04:00
|
|
|
if debug.sbrk != 0 {
|
|
|
|
align := uintptr(16)
|
|
|
|
if typ != nil {
|
|
|
|
align = uintptr(typ.align)
|
|
|
|
}
|
|
|
|
return persistentalloc(size, align, &memstats.other_sys)
|
|
|
|
}
|
|
|
|
|
2015-10-04 20:56:11 -07:00
|
|
|
// assistG is the G to charge for this allocation, or nil if
|
|
|
|
// GC is not currently active.
|
|
|
|
var assistG *g
|
|
|
|
if gcBlackenEnabled != 0 {
|
|
|
|
// Charge the current user G for this allocation.
|
|
|
|
assistG = getg()
|
|
|
|
if assistG.m.curg != nil {
|
|
|
|
assistG = assistG.m.curg
|
|
|
|
}
|
|
|
|
// Charge the allocation against the G. We'll account
|
|
|
|
// for internal fragmentation at the end of mallocgc.
|
|
|
|
assistG.gcAssistBytes -= int64(size)
|
|
|
|
|
|
|
|
if assistG.gcAssistBytes < 0 {
|
|
|
|
// This G is in debt. Assist the GC to correct
|
|
|
|
// this before allocating. This must happen
|
|
|
|
// before disabling preemption.
|
|
|
|
gcAssistAlloc(assistG)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-16 14:43:38 -05:00
|
|
|
// Set mp.mallocing to keep from being preempted by GC.
|
|
|
|
mp := acquirem()
|
|
|
|
if mp.mallocing != 0 {
|
|
|
|
throw("malloc deadlock")
|
2014-08-18 16:33:39 +04:00
|
|
|
}
|
2015-05-12 10:01:37 -07:00
|
|
|
if mp.gsignal == getg() {
|
|
|
|
throw("malloc during signal")
|
|
|
|
}
|
2015-01-16 14:43:38 -05:00
|
|
|
mp.mallocing = 1
|
2014-08-18 16:33:39 +04:00
|
|
|
|
2015-03-08 20:56:15 -04:00
|
|
|
shouldhelpgc := false
|
|
|
|
dataSize := size
|
2014-08-18 16:33:39 +04:00
|
|
|
c := gomcache()
|
2014-07-30 09:01:52 -07:00
|
|
|
var x unsafe.Pointer
|
2016-04-19 19:35:10 -07:00
|
|
|
noscan := typ == nil || typ.kind&kindNoPointers != 0
|
2014-07-30 09:01:52 -07:00
|
|
|
if size <= maxSmallSize {
|
2016-04-19 19:35:10 -07:00
|
|
|
if noscan && size < maxTinySize {
|
2014-07-30 09:01:52 -07:00
|
|
|
// Tiny allocator.
|
|
|
|
//
|
|
|
|
// Tiny allocator combines several tiny allocation requests
|
|
|
|
// into a single memory block. The resulting memory block
|
|
|
|
// is freed when all subobjects are unreachable. The subobjects
|
2016-04-19 19:35:10 -07:00
|
|
|
// must be noscan (don't have pointers), this ensures that
|
2014-07-30 09:01:52 -07:00
|
|
|
// the amount of potentially wasted memory is bounded.
|
|
|
|
//
|
|
|
|
// Size of the memory block used for combining (maxTinySize) is tunable.
|
|
|
|
// Current setting is 16 bytes, which relates to 2x worst case memory
|
|
|
|
// wastage (when all but one subobjects are unreachable).
|
|
|
|
// 8 bytes would result in no wastage at all, but provides less
|
|
|
|
// opportunities for combining.
|
|
|
|
// 32 bytes provides more opportunities for combining,
|
|
|
|
// but can lead to 4x worst case wastage.
|
|
|
|
// The best case winning is 8x regardless of block size.
|
|
|
|
//
|
|
|
|
// Objects obtained from tiny allocator must not be freed explicitly.
|
|
|
|
// So when an object will be freed explicitly, we ensure that
|
|
|
|
// its size >= maxTinySize.
|
|
|
|
//
|
|
|
|
// SetFinalizer has a special case for objects potentially coming
|
|
|
|
// from tiny allocator, in such case it allows to set finalizers
|
|
|
|
// for an inner byte of a memory block.
|
|
|
|
//
|
|
|
|
// The main targets of tiny allocator are small strings and
|
|
|
|
// standalone escaping variables. On a json benchmark
|
|
|
|
// the allocator reduces number of allocations by ~12% and
|
|
|
|
// reduces heap size by ~20%.
|
2015-01-14 14:13:55 -05:00
|
|
|
off := c.tinyoffset
|
|
|
|
// Align tiny pointer for required (conservative) alignment.
|
|
|
|
if size&7 == 0 {
|
|
|
|
off = round(off, 8)
|
|
|
|
} else if size&3 == 0 {
|
|
|
|
off = round(off, 4)
|
|
|
|
} else if size&1 == 0 {
|
|
|
|
off = round(off, 2)
|
|
|
|
}
|
2015-11-16 15:31:50 -05:00
|
|
|
if off+size <= maxTinySize && c.tiny != 0 {
|
2015-01-14 14:13:55 -05:00
|
|
|
// The object fits into existing tiny block.
|
2015-11-16 15:31:50 -05:00
|
|
|
x = unsafe.Pointer(c.tiny + off)
|
2015-01-14 14:13:55 -05:00
|
|
|
c.tinyoffset = off + size
|
|
|
|
c.local_tinyallocs++
|
2015-01-16 14:43:38 -05:00
|
|
|
mp.mallocing = 0
|
|
|
|
releasem(mp)
|
2015-01-14 14:13:55 -05:00
|
|
|
return x
|
2014-07-30 09:01:52 -07:00
|
|
|
}
|
|
|
|
// Allocate a new maxTinySize block.
|
[dev.garbage] runtime: reintroduce no-zeroing optimization
Currently we always zero objects when we allocate them. We used to
have an optimization that would not zero objects that had not been
allocated since the whole span was last zeroed (either by getting it
from the system or by getting it from the heap, which does a bulk
zero), but this depended on the sweeper clobbering the first two words
of each object. Hence, we lost this optimization when the bitmap
sweeper went away.
Re-introduce this optimization using a different mechanism. Each span
already keeps a flag indicating that it just came from the OS or was
just bulk zeroed by the mheap. We can simply use this flag to know
when we don't need to zero an object. This is slightly less efficient
than the old optimization: if a span gets allocated and partially
used, then GC happens and the span gets returned to the mcentral, then
the span gets re-acquired, the old optimization knew that it only had
to re-zero the objects that had been reclaimed, whereas this
optimization will re-zero everything. However, in this case, you're
already paying for the garbage collection, and you've only wasted one
zeroing of the span, so in practice there seems to be little
difference. (If we did want to revive the full optimization, each span
could keep track of a frontier beyond which all free slots are zeroed.
I prototyped this and it didn't obviously do any better than the much
simpler approach in this commit.)
This significantly improves BinaryTree17, which is allocation-heavy
(and runs first, so most pages are already zeroed), and slightly
improves everything else.
name old time/op new time/op delta
XBenchGarbage-12 2.15ms ± 1% 2.14ms ± 1% -0.80% (p=0.000 n=17+17)
name old time/op new time/op delta
BinaryTree17-12 2.71s ± 1% 2.56s ± 1% -5.73% (p=0.000 n=18+19)
DivconstI64-12 1.70ns ± 1% 1.70ns ± 1% ~ (p=0.562 n=18+18)
DivconstU64-12 1.74ns ± 2% 1.74ns ± 1% ~ (p=0.394 n=20+20)
DivconstI32-12 1.74ns ± 0% 1.74ns ± 0% ~ (all samples are equal)
DivconstU32-12 1.66ns ± 1% 1.66ns ± 0% ~ (p=0.516 n=15+16)
DivconstI16-12 1.84ns ± 0% 1.84ns ± 0% ~ (all samples are equal)
DivconstU16-12 1.82ns ± 0% 1.82ns ± 0% ~ (all samples are equal)
DivconstI8-12 1.79ns ± 0% 1.79ns ± 0% ~ (all samples are equal)
DivconstU8-12 1.60ns ± 0% 1.60ns ± 1% ~ (p=0.603 n=17+19)
Fannkuch11-12 2.11s ± 1% 2.11s ± 0% ~ (p=0.333 n=16+19)
FmtFprintfEmpty-12 45.1ns ± 4% 45.4ns ± 5% ~ (p=0.111 n=20+20)
FmtFprintfString-12 134ns ± 0% 129ns ± 0% -3.45% (p=0.000 n=18+16)
FmtFprintfInt-12 131ns ± 1% 129ns ± 1% -1.54% (p=0.000 n=16+18)
FmtFprintfIntInt-12 205ns ± 2% 203ns ± 0% -0.56% (p=0.014 n=20+18)
FmtFprintfPrefixedInt-12 200ns ± 2% 197ns ± 1% -1.48% (p=0.000 n=20+18)
FmtFprintfFloat-12 256ns ± 1% 256ns ± 0% -0.21% (p=0.008 n=18+20)
FmtManyArgs-12 805ns ± 0% 804ns ± 0% -0.19% (p=0.001 n=18+18)
GobDecode-12 7.21ms ± 1% 7.14ms ± 1% -0.92% (p=0.000 n=19+20)
GobEncode-12 5.88ms ± 1% 5.88ms ± 1% ~ (p=0.641 n=18+19)
Gzip-12 218ms ± 1% 218ms ± 1% ~ (p=0.271 n=19+18)
Gunzip-12 37.1ms ± 0% 36.9ms ± 0% -0.29% (p=0.000 n=18+17)
HTTPClientServer-12 78.1µs ± 2% 77.4µs ± 2% ~ (p=0.070 n=19+19)
JSONEncode-12 15.5ms ± 1% 15.5ms ± 0% ~ (p=0.063 n=20+18)
JSONDecode-12 56.1ms ± 0% 55.4ms ± 1% -1.18% (p=0.000 n=19+18)
Mandelbrot200-12 4.05ms ± 0% 4.06ms ± 0% +0.29% (p=0.001 n=18+18)
GoParse-12 3.28ms ± 1% 3.21ms ± 1% -2.30% (p=0.000 n=20+20)
RegexpMatchEasy0_32-12 69.4ns ± 2% 69.3ns ± 1% ~ (p=0.205 n=18+16)
RegexpMatchEasy0_1K-12 239ns ± 0% 239ns ± 0% ~ (all samples are equal)
RegexpMatchEasy1_32-12 69.4ns ± 1% 69.4ns ± 1% ~ (p=0.620 n=15+18)
RegexpMatchEasy1_1K-12 370ns ± 1% 369ns ± 2% ~ (p=0.088 n=20+20)
RegexpMatchMedium_32-12 108ns ± 0% 108ns ± 0% ~ (all samples are equal)
RegexpMatchMedium_1K-12 33.6µs ± 3% 33.5µs ± 3% ~ (p=0.718 n=20+20)
RegexpMatchHard_32-12 1.68µs ± 1% 1.67µs ± 2% ~ (p=0.316 n=20+20)
RegexpMatchHard_1K-12 50.5µs ± 3% 50.4µs ± 3% ~ (p=0.659 n=20+20)
Revcomp-12 381ms ± 1% 381ms ± 1% ~ (p=0.916 n=19+18)
Template-12 66.5ms ± 1% 65.8ms ± 2% -1.08% (p=0.000 n=20+20)
TimeParse-12 317ns ± 0% 319ns ± 0% +0.48% (p=0.000 n=19+12)
TimeFormat-12 338ns ± 0% 338ns ± 0% ~ (p=0.124 n=19+18)
[Geo mean] 5.99µs 5.96µs -0.54%
Change-Id: I638ffd9d9f178835bbfa499bac20bd7224f1a907
Reviewed-on: https://go-review.googlesource.com/22591
Reviewed-by: Rick Hudson <rlh@golang.org>
2016-04-28 15:32:01 -04:00
|
|
|
span := c.alloc[tinySizeClass]
|
|
|
|
v := nextFreeFast(span)
|
2016-03-02 12:15:02 -05:00
|
|
|
if v == 0 {
|
[dev.garbage] runtime: reintroduce no-zeroing optimization
Currently we always zero objects when we allocate them. We used to
have an optimization that would not zero objects that had not been
allocated since the whole span was last zeroed (either by getting it
from the system or by getting it from the heap, which does a bulk
zero), but this depended on the sweeper clobbering the first two words
of each object. Hence, we lost this optimization when the bitmap
sweeper went away.
Re-introduce this optimization using a different mechanism. Each span
already keeps a flag indicating that it just came from the OS or was
just bulk zeroed by the mheap. We can simply use this flag to know
when we don't need to zero an object. This is slightly less efficient
than the old optimization: if a span gets allocated and partially
used, then GC happens and the span gets returned to the mcentral, then
the span gets re-acquired, the old optimization knew that it only had
to re-zero the objects that had been reclaimed, whereas this
optimization will re-zero everything. However, in this case, you're
already paying for the garbage collection, and you've only wasted one
zeroing of the span, so in practice there seems to be little
difference. (If we did want to revive the full optimization, each span
could keep track of a frontier beyond which all free slots are zeroed.
I prototyped this and it didn't obviously do any better than the much
simpler approach in this commit.)
This significantly improves BinaryTree17, which is allocation-heavy
(and runs first, so most pages are already zeroed), and slightly
improves everything else.
name old time/op new time/op delta
XBenchGarbage-12 2.15ms ± 1% 2.14ms ± 1% -0.80% (p=0.000 n=17+17)
name old time/op new time/op delta
BinaryTree17-12 2.71s ± 1% 2.56s ± 1% -5.73% (p=0.000 n=18+19)
DivconstI64-12 1.70ns ± 1% 1.70ns ± 1% ~ (p=0.562 n=18+18)
DivconstU64-12 1.74ns ± 2% 1.74ns ± 1% ~ (p=0.394 n=20+20)
DivconstI32-12 1.74ns ± 0% 1.74ns ± 0% ~ (all samples are equal)
DivconstU32-12 1.66ns ± 1% 1.66ns ± 0% ~ (p=0.516 n=15+16)
DivconstI16-12 1.84ns ± 0% 1.84ns ± 0% ~ (all samples are equal)
DivconstU16-12 1.82ns ± 0% 1.82ns ± 0% ~ (all samples are equal)
DivconstI8-12 1.79ns ± 0% 1.79ns ± 0% ~ (all samples are equal)
DivconstU8-12 1.60ns ± 0% 1.60ns ± 1% ~ (p=0.603 n=17+19)
Fannkuch11-12 2.11s ± 1% 2.11s ± 0% ~ (p=0.333 n=16+19)
FmtFprintfEmpty-12 45.1ns ± 4% 45.4ns ± 5% ~ (p=0.111 n=20+20)
FmtFprintfString-12 134ns ± 0% 129ns ± 0% -3.45% (p=0.000 n=18+16)
FmtFprintfInt-12 131ns ± 1% 129ns ± 1% -1.54% (p=0.000 n=16+18)
FmtFprintfIntInt-12 205ns ± 2% 203ns ± 0% -0.56% (p=0.014 n=20+18)
FmtFprintfPrefixedInt-12 200ns ± 2% 197ns ± 1% -1.48% (p=0.000 n=20+18)
FmtFprintfFloat-12 256ns ± 1% 256ns ± 0% -0.21% (p=0.008 n=18+20)
FmtManyArgs-12 805ns ± 0% 804ns ± 0% -0.19% (p=0.001 n=18+18)
GobDecode-12 7.21ms ± 1% 7.14ms ± 1% -0.92% (p=0.000 n=19+20)
GobEncode-12 5.88ms ± 1% 5.88ms ± 1% ~ (p=0.641 n=18+19)
Gzip-12 218ms ± 1% 218ms ± 1% ~ (p=0.271 n=19+18)
Gunzip-12 37.1ms ± 0% 36.9ms ± 0% -0.29% (p=0.000 n=18+17)
HTTPClientServer-12 78.1µs ± 2% 77.4µs ± 2% ~ (p=0.070 n=19+19)
JSONEncode-12 15.5ms ± 1% 15.5ms ± 0% ~ (p=0.063 n=20+18)
JSONDecode-12 56.1ms ± 0% 55.4ms ± 1% -1.18% (p=0.000 n=19+18)
Mandelbrot200-12 4.05ms ± 0% 4.06ms ± 0% +0.29% (p=0.001 n=18+18)
GoParse-12 3.28ms ± 1% 3.21ms ± 1% -2.30% (p=0.000 n=20+20)
RegexpMatchEasy0_32-12 69.4ns ± 2% 69.3ns ± 1% ~ (p=0.205 n=18+16)
RegexpMatchEasy0_1K-12 239ns ± 0% 239ns ± 0% ~ (all samples are equal)
RegexpMatchEasy1_32-12 69.4ns ± 1% 69.4ns ± 1% ~ (p=0.620 n=15+18)
RegexpMatchEasy1_1K-12 370ns ± 1% 369ns ± 2% ~ (p=0.088 n=20+20)
RegexpMatchMedium_32-12 108ns ± 0% 108ns ± 0% ~ (all samples are equal)
RegexpMatchMedium_1K-12 33.6µs ± 3% 33.5µs ± 3% ~ (p=0.718 n=20+20)
RegexpMatchHard_32-12 1.68µs ± 1% 1.67µs ± 2% ~ (p=0.316 n=20+20)
RegexpMatchHard_1K-12 50.5µs ± 3% 50.4µs ± 3% ~ (p=0.659 n=20+20)
Revcomp-12 381ms ± 1% 381ms ± 1% ~ (p=0.916 n=19+18)
Template-12 66.5ms ± 1% 65.8ms ± 2% -1.08% (p=0.000 n=20+20)
TimeParse-12 317ns ± 0% 319ns ± 0% +0.48% (p=0.000 n=19+12)
TimeFormat-12 338ns ± 0% 338ns ± 0% ~ (p=0.124 n=19+18)
[Geo mean] 5.99µs 5.96µs -0.54%
Change-Id: I638ffd9d9f178835bbfa499bac20bd7224f1a907
Reviewed-on: https://go-review.googlesource.com/22591
Reviewed-by: Rick Hudson <rlh@golang.org>
2016-04-28 15:32:01 -04:00
|
|
|
v, _, shouldhelpgc = c.nextFree(tinySizeClass)
|
2016-03-02 12:15:02 -05:00
|
|
|
}
|
2014-07-30 09:01:52 -07:00
|
|
|
x = unsafe.Pointer(v)
|
|
|
|
(*[2]uint64)(x)[0] = 0
|
|
|
|
(*[2]uint64)(x)[1] = 0
|
|
|
|
// See if we need to replace the existing tiny block with the new one
|
|
|
|
// based on amount of remaining free space.
|
2015-11-16 15:31:50 -05:00
|
|
|
if size < c.tinyoffset || c.tiny == 0 {
|
|
|
|
c.tiny = uintptr(x)
|
2015-01-14 14:13:55 -05:00
|
|
|
c.tinyoffset = size
|
2014-07-30 09:01:52 -07:00
|
|
|
}
|
|
|
|
size = maxTinySize
|
|
|
|
} else {
|
2016-06-27 12:23:39 +02:00
|
|
|
var sizeclass uint8
|
|
|
|
if size <= smallSizeMax-8 {
|
|
|
|
sizeclass = size_to_class8[(size+smallSizeDiv-1)/smallSizeDiv]
|
2014-07-30 09:01:52 -07:00
|
|
|
} else {
|
2016-06-27 12:23:39 +02:00
|
|
|
sizeclass = size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv]
|
2014-07-30 09:01:52 -07:00
|
|
|
}
|
|
|
|
size = uintptr(class_to_size[sizeclass])
|
[dev.garbage] runtime: reintroduce no-zeroing optimization
Currently we always zero objects when we allocate them. We used to
have an optimization that would not zero objects that had not been
allocated since the whole span was last zeroed (either by getting it
from the system or by getting it from the heap, which does a bulk
zero), but this depended on the sweeper clobbering the first two words
of each object. Hence, we lost this optimization when the bitmap
sweeper went away.
Re-introduce this optimization using a different mechanism. Each span
already keeps a flag indicating that it just came from the OS or was
just bulk zeroed by the mheap. We can simply use this flag to know
when we don't need to zero an object. This is slightly less efficient
than the old optimization: if a span gets allocated and partially
used, then GC happens and the span gets returned to the mcentral, then
the span gets re-acquired, the old optimization knew that it only had
to re-zero the objects that had been reclaimed, whereas this
optimization will re-zero everything. However, in this case, you're
already paying for the garbage collection, and you've only wasted one
zeroing of the span, so in practice there seems to be little
difference. (If we did want to revive the full optimization, each span
could keep track of a frontier beyond which all free slots are zeroed.
I prototyped this and it didn't obviously do any better than the much
simpler approach in this commit.)
This significantly improves BinaryTree17, which is allocation-heavy
(and runs first, so most pages are already zeroed), and slightly
improves everything else.
name old time/op new time/op delta
XBenchGarbage-12 2.15ms ± 1% 2.14ms ± 1% -0.80% (p=0.000 n=17+17)
name old time/op new time/op delta
BinaryTree17-12 2.71s ± 1% 2.56s ± 1% -5.73% (p=0.000 n=18+19)
DivconstI64-12 1.70ns ± 1% 1.70ns ± 1% ~ (p=0.562 n=18+18)
DivconstU64-12 1.74ns ± 2% 1.74ns ± 1% ~ (p=0.394 n=20+20)
DivconstI32-12 1.74ns ± 0% 1.74ns ± 0% ~ (all samples are equal)
DivconstU32-12 1.66ns ± 1% 1.66ns ± 0% ~ (p=0.516 n=15+16)
DivconstI16-12 1.84ns ± 0% 1.84ns ± 0% ~ (all samples are equal)
DivconstU16-12 1.82ns ± 0% 1.82ns ± 0% ~ (all samples are equal)
DivconstI8-12 1.79ns ± 0% 1.79ns ± 0% ~ (all samples are equal)
DivconstU8-12 1.60ns ± 0% 1.60ns ± 1% ~ (p=0.603 n=17+19)
Fannkuch11-12 2.11s ± 1% 2.11s ± 0% ~ (p=0.333 n=16+19)
FmtFprintfEmpty-12 45.1ns ± 4% 45.4ns ± 5% ~ (p=0.111 n=20+20)
FmtFprintfString-12 134ns ± 0% 129ns ± 0% -3.45% (p=0.000 n=18+16)
FmtFprintfInt-12 131ns ± 1% 129ns ± 1% -1.54% (p=0.000 n=16+18)
FmtFprintfIntInt-12 205ns ± 2% 203ns ± 0% -0.56% (p=0.014 n=20+18)
FmtFprintfPrefixedInt-12 200ns ± 2% 197ns ± 1% -1.48% (p=0.000 n=20+18)
FmtFprintfFloat-12 256ns ± 1% 256ns ± 0% -0.21% (p=0.008 n=18+20)
FmtManyArgs-12 805ns ± 0% 804ns ± 0% -0.19% (p=0.001 n=18+18)
GobDecode-12 7.21ms ± 1% 7.14ms ± 1% -0.92% (p=0.000 n=19+20)
GobEncode-12 5.88ms ± 1% 5.88ms ± 1% ~ (p=0.641 n=18+19)
Gzip-12 218ms ± 1% 218ms ± 1% ~ (p=0.271 n=19+18)
Gunzip-12 37.1ms ± 0% 36.9ms ± 0% -0.29% (p=0.000 n=18+17)
HTTPClientServer-12 78.1µs ± 2% 77.4µs ± 2% ~ (p=0.070 n=19+19)
JSONEncode-12 15.5ms ± 1% 15.5ms ± 0% ~ (p=0.063 n=20+18)
JSONDecode-12 56.1ms ± 0% 55.4ms ± 1% -1.18% (p=0.000 n=19+18)
Mandelbrot200-12 4.05ms ± 0% 4.06ms ± 0% +0.29% (p=0.001 n=18+18)
GoParse-12 3.28ms ± 1% 3.21ms ± 1% -2.30% (p=0.000 n=20+20)
RegexpMatchEasy0_32-12 69.4ns ± 2% 69.3ns ± 1% ~ (p=0.205 n=18+16)
RegexpMatchEasy0_1K-12 239ns ± 0% 239ns ± 0% ~ (all samples are equal)
RegexpMatchEasy1_32-12 69.4ns ± 1% 69.4ns ± 1% ~ (p=0.620 n=15+18)
RegexpMatchEasy1_1K-12 370ns ± 1% 369ns ± 2% ~ (p=0.088 n=20+20)
RegexpMatchMedium_32-12 108ns ± 0% 108ns ± 0% ~ (all samples are equal)
RegexpMatchMedium_1K-12 33.6µs ± 3% 33.5µs ± 3% ~ (p=0.718 n=20+20)
RegexpMatchHard_32-12 1.68µs ± 1% 1.67µs ± 2% ~ (p=0.316 n=20+20)
RegexpMatchHard_1K-12 50.5µs ± 3% 50.4µs ± 3% ~ (p=0.659 n=20+20)
Revcomp-12 381ms ± 1% 381ms ± 1% ~ (p=0.916 n=19+18)
Template-12 66.5ms ± 1% 65.8ms ± 2% -1.08% (p=0.000 n=20+20)
TimeParse-12 317ns ± 0% 319ns ± 0% +0.48% (p=0.000 n=19+12)
TimeFormat-12 338ns ± 0% 338ns ± 0% ~ (p=0.124 n=19+18)
[Geo mean] 5.99µs 5.96µs -0.54%
Change-Id: I638ffd9d9f178835bbfa499bac20bd7224f1a907
Reviewed-on: https://go-review.googlesource.com/22591
Reviewed-by: Rick Hudson <rlh@golang.org>
2016-04-28 15:32:01 -04:00
|
|
|
span := c.alloc[sizeclass]
|
|
|
|
v := nextFreeFast(span)
|
2016-03-02 12:15:02 -05:00
|
|
|
if v == 0 {
|
[dev.garbage] runtime: reintroduce no-zeroing optimization
Currently we always zero objects when we allocate them. We used to
have an optimization that would not zero objects that had not been
allocated since the whole span was last zeroed (either by getting it
from the system or by getting it from the heap, which does a bulk
zero), but this depended on the sweeper clobbering the first two words
of each object. Hence, we lost this optimization when the bitmap
sweeper went away.
Re-introduce this optimization using a different mechanism. Each span
already keeps a flag indicating that it just came from the OS or was
just bulk zeroed by the mheap. We can simply use this flag to know
when we don't need to zero an object. This is slightly less efficient
than the old optimization: if a span gets allocated and partially
used, then GC happens and the span gets returned to the mcentral, then
the span gets re-acquired, the old optimization knew that it only had
to re-zero the objects that had been reclaimed, whereas this
optimization will re-zero everything. However, in this case, you're
already paying for the garbage collection, and you've only wasted one
zeroing of the span, so in practice there seems to be little
difference. (If we did want to revive the full optimization, each span
could keep track of a frontier beyond which all free slots are zeroed.
I prototyped this and it didn't obviously do any better than the much
simpler approach in this commit.)
This significantly improves BinaryTree17, which is allocation-heavy
(and runs first, so most pages are already zeroed), and slightly
improves everything else.
name old time/op new time/op delta
XBenchGarbage-12 2.15ms ± 1% 2.14ms ± 1% -0.80% (p=0.000 n=17+17)
name old time/op new time/op delta
BinaryTree17-12 2.71s ± 1% 2.56s ± 1% -5.73% (p=0.000 n=18+19)
DivconstI64-12 1.70ns ± 1% 1.70ns ± 1% ~ (p=0.562 n=18+18)
DivconstU64-12 1.74ns ± 2% 1.74ns ± 1% ~ (p=0.394 n=20+20)
DivconstI32-12 1.74ns ± 0% 1.74ns ± 0% ~ (all samples are equal)
DivconstU32-12 1.66ns ± 1% 1.66ns ± 0% ~ (p=0.516 n=15+16)
DivconstI16-12 1.84ns ± 0% 1.84ns ± 0% ~ (all samples are equal)
DivconstU16-12 1.82ns ± 0% 1.82ns ± 0% ~ (all samples are equal)
DivconstI8-12 1.79ns ± 0% 1.79ns ± 0% ~ (all samples are equal)
DivconstU8-12 1.60ns ± 0% 1.60ns ± 1% ~ (p=0.603 n=17+19)
Fannkuch11-12 2.11s ± 1% 2.11s ± 0% ~ (p=0.333 n=16+19)
FmtFprintfEmpty-12 45.1ns ± 4% 45.4ns ± 5% ~ (p=0.111 n=20+20)
FmtFprintfString-12 134ns ± 0% 129ns ± 0% -3.45% (p=0.000 n=18+16)
FmtFprintfInt-12 131ns ± 1% 129ns ± 1% -1.54% (p=0.000 n=16+18)
FmtFprintfIntInt-12 205ns ± 2% 203ns ± 0% -0.56% (p=0.014 n=20+18)
FmtFprintfPrefixedInt-12 200ns ± 2% 197ns ± 1% -1.48% (p=0.000 n=20+18)
FmtFprintfFloat-12 256ns ± 1% 256ns ± 0% -0.21% (p=0.008 n=18+20)
FmtManyArgs-12 805ns ± 0% 804ns ± 0% -0.19% (p=0.001 n=18+18)
GobDecode-12 7.21ms ± 1% 7.14ms ± 1% -0.92% (p=0.000 n=19+20)
GobEncode-12 5.88ms ± 1% 5.88ms ± 1% ~ (p=0.641 n=18+19)
Gzip-12 218ms ± 1% 218ms ± 1% ~ (p=0.271 n=19+18)
Gunzip-12 37.1ms ± 0% 36.9ms ± 0% -0.29% (p=0.000 n=18+17)
HTTPClientServer-12 78.1µs ± 2% 77.4µs ± 2% ~ (p=0.070 n=19+19)
JSONEncode-12 15.5ms ± 1% 15.5ms ± 0% ~ (p=0.063 n=20+18)
JSONDecode-12 56.1ms ± 0% 55.4ms ± 1% -1.18% (p=0.000 n=19+18)
Mandelbrot200-12 4.05ms ± 0% 4.06ms ± 0% +0.29% (p=0.001 n=18+18)
GoParse-12 3.28ms ± 1% 3.21ms ± 1% -2.30% (p=0.000 n=20+20)
RegexpMatchEasy0_32-12 69.4ns ± 2% 69.3ns ± 1% ~ (p=0.205 n=18+16)
RegexpMatchEasy0_1K-12 239ns ± 0% 239ns ± 0% ~ (all samples are equal)
RegexpMatchEasy1_32-12 69.4ns ± 1% 69.4ns ± 1% ~ (p=0.620 n=15+18)
RegexpMatchEasy1_1K-12 370ns ± 1% 369ns ± 2% ~ (p=0.088 n=20+20)
RegexpMatchMedium_32-12 108ns ± 0% 108ns ± 0% ~ (all samples are equal)
RegexpMatchMedium_1K-12 33.6µs ± 3% 33.5µs ± 3% ~ (p=0.718 n=20+20)
RegexpMatchHard_32-12 1.68µs ± 1% 1.67µs ± 2% ~ (p=0.316 n=20+20)
RegexpMatchHard_1K-12 50.5µs ± 3% 50.4µs ± 3% ~ (p=0.659 n=20+20)
Revcomp-12 381ms ± 1% 381ms ± 1% ~ (p=0.916 n=19+18)
Template-12 66.5ms ± 1% 65.8ms ± 2% -1.08% (p=0.000 n=20+20)
TimeParse-12 317ns ± 0% 319ns ± 0% +0.48% (p=0.000 n=19+12)
TimeFormat-12 338ns ± 0% 338ns ± 0% ~ (p=0.124 n=19+18)
[Geo mean] 5.99µs 5.96µs -0.54%
Change-Id: I638ffd9d9f178835bbfa499bac20bd7224f1a907
Reviewed-on: https://go-review.googlesource.com/22591
Reviewed-by: Rick Hudson <rlh@golang.org>
2016-04-28 15:32:01 -04:00
|
|
|
v, span, shouldhelpgc = c.nextFree(sizeclass)
|
2016-03-02 12:15:02 -05:00
|
|
|
}
|
2014-07-30 09:01:52 -07:00
|
|
|
x = unsafe.Pointer(v)
|
[dev.garbage] runtime: reintroduce no-zeroing optimization
Currently we always zero objects when we allocate them. We used to
have an optimization that would not zero objects that had not been
allocated since the whole span was last zeroed (either by getting it
from the system or by getting it from the heap, which does a bulk
zero), but this depended on the sweeper clobbering the first two words
of each object. Hence, we lost this optimization when the bitmap
sweeper went away.
Re-introduce this optimization using a different mechanism. Each span
already keeps a flag indicating that it just came from the OS or was
just bulk zeroed by the mheap. We can simply use this flag to know
when we don't need to zero an object. This is slightly less efficient
than the old optimization: if a span gets allocated and partially
used, then GC happens and the span gets returned to the mcentral, then
the span gets re-acquired, the old optimization knew that it only had
to re-zero the objects that had been reclaimed, whereas this
optimization will re-zero everything. However, in this case, you're
already paying for the garbage collection, and you've only wasted one
zeroing of the span, so in practice there seems to be little
difference. (If we did want to revive the full optimization, each span
could keep track of a frontier beyond which all free slots are zeroed.
I prototyped this and it didn't obviously do any better than the much
simpler approach in this commit.)
This significantly improves BinaryTree17, which is allocation-heavy
(and runs first, so most pages are already zeroed), and slightly
improves everything else.
name old time/op new time/op delta
XBenchGarbage-12 2.15ms ± 1% 2.14ms ± 1% -0.80% (p=0.000 n=17+17)
name old time/op new time/op delta
BinaryTree17-12 2.71s ± 1% 2.56s ± 1% -5.73% (p=0.000 n=18+19)
DivconstI64-12 1.70ns ± 1% 1.70ns ± 1% ~ (p=0.562 n=18+18)
DivconstU64-12 1.74ns ± 2% 1.74ns ± 1% ~ (p=0.394 n=20+20)
DivconstI32-12 1.74ns ± 0% 1.74ns ± 0% ~ (all samples are equal)
DivconstU32-12 1.66ns ± 1% 1.66ns ± 0% ~ (p=0.516 n=15+16)
DivconstI16-12 1.84ns ± 0% 1.84ns ± 0% ~ (all samples are equal)
DivconstU16-12 1.82ns ± 0% 1.82ns ± 0% ~ (all samples are equal)
DivconstI8-12 1.79ns ± 0% 1.79ns ± 0% ~ (all samples are equal)
DivconstU8-12 1.60ns ± 0% 1.60ns ± 1% ~ (p=0.603 n=17+19)
Fannkuch11-12 2.11s ± 1% 2.11s ± 0% ~ (p=0.333 n=16+19)
FmtFprintfEmpty-12 45.1ns ± 4% 45.4ns ± 5% ~ (p=0.111 n=20+20)
FmtFprintfString-12 134ns ± 0% 129ns ± 0% -3.45% (p=0.000 n=18+16)
FmtFprintfInt-12 131ns ± 1% 129ns ± 1% -1.54% (p=0.000 n=16+18)
FmtFprintfIntInt-12 205ns ± 2% 203ns ± 0% -0.56% (p=0.014 n=20+18)
FmtFprintfPrefixedInt-12 200ns ± 2% 197ns ± 1% -1.48% (p=0.000 n=20+18)
FmtFprintfFloat-12 256ns ± 1% 256ns ± 0% -0.21% (p=0.008 n=18+20)
FmtManyArgs-12 805ns ± 0% 804ns ± 0% -0.19% (p=0.001 n=18+18)
GobDecode-12 7.21ms ± 1% 7.14ms ± 1% -0.92% (p=0.000 n=19+20)
GobEncode-12 5.88ms ± 1% 5.88ms ± 1% ~ (p=0.641 n=18+19)
Gzip-12 218ms ± 1% 218ms ± 1% ~ (p=0.271 n=19+18)
Gunzip-12 37.1ms ± 0% 36.9ms ± 0% -0.29% (p=0.000 n=18+17)
HTTPClientServer-12 78.1µs ± 2% 77.4µs ± 2% ~ (p=0.070 n=19+19)
JSONEncode-12 15.5ms ± 1% 15.5ms ± 0% ~ (p=0.063 n=20+18)
JSONDecode-12 56.1ms ± 0% 55.4ms ± 1% -1.18% (p=0.000 n=19+18)
Mandelbrot200-12 4.05ms ± 0% 4.06ms ± 0% +0.29% (p=0.001 n=18+18)
GoParse-12 3.28ms ± 1% 3.21ms ± 1% -2.30% (p=0.000 n=20+20)
RegexpMatchEasy0_32-12 69.4ns ± 2% 69.3ns ± 1% ~ (p=0.205 n=18+16)
RegexpMatchEasy0_1K-12 239ns ± 0% 239ns ± 0% ~ (all samples are equal)
RegexpMatchEasy1_32-12 69.4ns ± 1% 69.4ns ± 1% ~ (p=0.620 n=15+18)
RegexpMatchEasy1_1K-12 370ns ± 1% 369ns ± 2% ~ (p=0.088 n=20+20)
RegexpMatchMedium_32-12 108ns ± 0% 108ns ± 0% ~ (all samples are equal)
RegexpMatchMedium_1K-12 33.6µs ± 3% 33.5µs ± 3% ~ (p=0.718 n=20+20)
RegexpMatchHard_32-12 1.68µs ± 1% 1.67µs ± 2% ~ (p=0.316 n=20+20)
RegexpMatchHard_1K-12 50.5µs ± 3% 50.4µs ± 3% ~ (p=0.659 n=20+20)
Revcomp-12 381ms ± 1% 381ms ± 1% ~ (p=0.916 n=19+18)
Template-12 66.5ms ± 1% 65.8ms ± 2% -1.08% (p=0.000 n=20+20)
TimeParse-12 317ns ± 0% 319ns ± 0% +0.48% (p=0.000 n=19+12)
TimeFormat-12 338ns ± 0% 338ns ± 0% ~ (p=0.124 n=19+18)
[Geo mean] 5.99µs 5.96µs -0.54%
Change-Id: I638ffd9d9f178835bbfa499bac20bd7224f1a907
Reviewed-on: https://go-review.googlesource.com/22591
Reviewed-by: Rick Hudson <rlh@golang.org>
2016-04-28 15:32:01 -04:00
|
|
|
if needzero && span.needzero != 0 {
|
2016-10-17 18:41:56 -04:00
|
|
|
memclrNoHeapPointers(unsafe.Pointer(v), size)
|
2014-07-30 09:01:52 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
2014-11-11 17:05:02 -05:00
|
|
|
var s *mspan
|
2015-01-06 14:58:49 -05:00
|
|
|
shouldhelpgc = true
|
[dev.cc] runtime: delete scalararg, ptrarg; rename onM to systemstack
Scalararg and ptrarg are not "signal safe".
Go code filling them out can be interrupted by a signal,
and then the signal handler runs, and if it also ends up
in Go code that uses scalararg or ptrarg, now the old
values have been smashed.
For the pieces of code that do need to run in a signal handler,
we introduced onM_signalok, which is really just onM
except that the _signalok is meant to convey that the caller
asserts that scalarg and ptrarg will be restored to their old
values after the call (instead of the usual behavior, zeroing them).
Scalararg and ptrarg are also untyped and therefore error-prone.
Go code can always pass a closure instead of using scalararg
and ptrarg; they were only really necessary for C code.
And there's no more C code.
For all these reasons, delete scalararg and ptrarg, converting
the few remaining references to use closures.
Once those are gone, there is no need for a distinction between
onM and onM_signalok, so replace both with a single function
equivalent to the current onM_signalok (that is, it can be called
on any of the curg, g0, and gsignal stacks).
The name onM and the phrase 'm stack' are misnomers,
because on most systems an M has two system stacks:
the main thread stack and the signal handling stack.
Correct the misnomer by naming the replacement function systemstack.
Fix a few references to "M stack" in code.
The main motivation for this change is to eliminate scalararg/ptrarg.
Rick and I have already seen them cause problems because
the calling sequence m.ptrarg[0] = p is a heap pointer assignment,
so it gets a write barrier. The write barrier also uses onM, so it has
all the same problems as if it were being invoked by a signal handler.
We worked around this by saving and restoring the old values
and by calling onM_signalok, but there's no point in keeping this nice
home for bugs around any longer.
This CL also changes funcline to return the file name as a result
instead of filling in a passed-in *string. (The *string signature is
left over from when the code was written in and called from C.)
That's arguably an unrelated change, except that once I had done
the ptrarg/scalararg/onM cleanup I started getting false positives
about the *string argument escaping (not allowed in package runtime).
The compiler is wrong, but the easiest fix is to write the code like
Go code instead of like C code. I am a bit worried that the compiler
is wrong because of some use of uninitialized memory in the escape
analysis. If that's the reason, it will go away when we convert the
compiler to Go. (And if not, we'll debug it the next time.)
LGTM=khr
R=r, khr
CC=austin, golang-codereviews, iant, rlh
https://golang.org/cl/174950043
2014-11-12 14:54:31 -05:00
|
|
|
systemstack(func() {
|
2016-04-19 19:35:10 -07:00
|
|
|
s = largeAlloc(size, needzero)
|
2014-11-11 17:05:02 -05:00
|
|
|
})
|
2016-02-11 13:57:58 -05:00
|
|
|
s.freeindex = 1
|
2016-04-29 09:44:53 -04:00
|
|
|
s.allocCount = 1
|
2016-03-14 12:02:02 -04:00
|
|
|
x = unsafe.Pointer(s.base())
|
2016-02-29 15:01:00 -08:00
|
|
|
size = s.elemsize
|
2014-07-30 09:01:52 -07:00
|
|
|
}
|
|
|
|
|
2016-04-16 18:27:38 -04:00
|
|
|
var scanSize uintptr
|
2016-04-19 19:35:10 -07:00
|
|
|
if noscan {
|
runtime: reclaim scan/dead bit in first word
With the switch to separate mark bitmaps, the scan/dead bit for the
first word of each object is now unused. Reclaim this bit and use it
as a scan/dead bit, just like words three and on. The second word is
still used for checkmark.
This dramatically simplifies heapBitsSetTypeNoScan and hasPointers,
since they no longer need different cases for 1, 2, and 3+ word
objects. They can instead just manipulate the heap bitmap for the
first word and be done with it.
In order to enable this, we change heapBitsSetType and runGCProg to
always set the scan/dead bit to scan for the first word on every code
path. Since these functions only apply to types that have pointers,
there's no need to do this conditionally: it's *always* necessary to
set the scan bit in the first word.
We also change every place that scans an object and checks if there
are more pointers. Rather than only checking morePointers if the word
is >= 2, we now check morePointers if word != 1 (since that's the
checkmark word).
Looking forward, we should probably reclaim the checkmark bit, too,
but that's going to be quite a bit more work.
Tested by setting doubleCheck in heapBitsSetType and running all.bash
on both linux/amd64 and linux/386, and by running GOGC=10 all.bash.
This particularly improves the FmtFprintf* go1 benchmarks, since they
do a large amount of noscan allocation.
name old time/op new time/op delta
BinaryTree17-12 2.34s ± 1% 2.38s ± 1% +1.70% (p=0.000 n=17+19)
Fannkuch11-12 2.09s ± 0% 2.09s ± 1% ~ (p=0.276 n=17+16)
FmtFprintfEmpty-12 44.9ns ± 2% 44.8ns ± 2% ~ (p=0.340 n=19+18)
FmtFprintfString-12 127ns ± 0% 125ns ± 0% -1.57% (p=0.000 n=16+15)
FmtFprintfInt-12 128ns ± 0% 122ns ± 1% -4.45% (p=0.000 n=15+20)
FmtFprintfIntInt-12 207ns ± 1% 193ns ± 0% -6.55% (p=0.000 n=19+14)
FmtFprintfPrefixedInt-12 197ns ± 1% 191ns ± 0% -2.93% (p=0.000 n=17+18)
FmtFprintfFloat-12 263ns ± 0% 248ns ± 1% -5.88% (p=0.000 n=15+19)
FmtManyArgs-12 794ns ± 0% 779ns ± 1% -1.90% (p=0.000 n=18+18)
GobDecode-12 7.14ms ± 2% 7.11ms ± 1% ~ (p=0.072 n=20+20)
GobEncode-12 5.85ms ± 1% 5.82ms ± 1% -0.49% (p=0.000 n=20+20)
Gzip-12 218ms ± 1% 215ms ± 1% -1.22% (p=0.000 n=19+19)
Gunzip-12 36.8ms ± 0% 36.7ms ± 0% -0.18% (p=0.006 n=18+20)
HTTPClientServer-12 77.1µs ± 4% 77.1µs ± 3% ~ (p=0.945 n=19+20)
JSONEncode-12 15.6ms ± 1% 15.9ms ± 1% +1.68% (p=0.000 n=18+20)
JSONDecode-12 55.2ms ± 1% 53.6ms ± 1% -2.93% (p=0.000 n=17+19)
Mandelbrot200-12 4.05ms ± 1% 4.05ms ± 0% ~ (p=0.306 n=17+17)
GoParse-12 3.14ms ± 1% 3.10ms ± 1% -1.31% (p=0.000 n=19+18)
RegexpMatchEasy0_32-12 69.3ns ± 1% 70.0ns ± 0% +0.89% (p=0.000 n=19+17)
RegexpMatchEasy0_1K-12 237ns ± 1% 236ns ± 0% -0.62% (p=0.000 n=19+16)
RegexpMatchEasy1_32-12 69.5ns ± 1% 70.3ns ± 1% +1.14% (p=0.000 n=18+17)
RegexpMatchEasy1_1K-12 377ns ± 1% 366ns ± 1% -3.03% (p=0.000 n=15+19)
RegexpMatchMedium_32-12 107ns ± 1% 107ns ± 2% ~ (p=0.318 n=20+19)
RegexpMatchMedium_1K-12 33.8µs ± 3% 33.5µs ± 1% -1.04% (p=0.001 n=20+19)
RegexpMatchHard_32-12 1.68µs ± 1% 1.73µs ± 0% +2.50% (p=0.000 n=20+18)
RegexpMatchHard_1K-12 50.8µs ± 1% 52.0µs ± 1% +2.50% (p=0.000 n=19+18)
Revcomp-12 381ms ± 1% 385ms ± 1% +1.00% (p=0.000 n=17+18)
Template-12 64.9ms ± 3% 62.6ms ± 1% -3.55% (p=0.000 n=19+18)
TimeParse-12 324ns ± 0% 328ns ± 1% +1.25% (p=0.000 n=18+18)
TimeFormat-12 345ns ± 0% 334ns ± 0% -3.31% (p=0.000 n=15+17)
[Geo mean] 52.1µs 51.5µs -1.00%
Change-Id: I13e74da3193a7f80794c654f944d1f0d60817049
Reviewed-on: https://go-review.googlesource.com/22632
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-04-29 14:51:48 -04:00
|
|
|
heapBitsSetTypeNoScan(uintptr(x))
|
2015-01-16 14:43:38 -05:00
|
|
|
} else {
|
|
|
|
// If allocating a defer+arg block, now that we've picked a malloc size
|
|
|
|
// large enough to hold everything, cut the "asked for" size down to
|
|
|
|
// just the defer header, so that the GC bitmap will record the arg block
|
|
|
|
// as containing nothing at all (as if it were unused space at the end of
|
|
|
|
// a malloc block caused by size rounding).
|
|
|
|
// The defer arg areas are scanned as part of scanstack.
|
|
|
|
if typ == deferType {
|
|
|
|
dataSize = unsafe.Sizeof(_defer{})
|
2014-08-07 13:34:30 +04:00
|
|
|
}
|
2015-01-16 14:43:38 -05:00
|
|
|
heapBitsSetType(uintptr(x), size, dataSize, typ)
|
2015-05-04 16:10:49 -04:00
|
|
|
if dataSize > typ.size {
|
|
|
|
// Array allocation. If there are any
|
|
|
|
// pointers, GC has to scan to the last
|
|
|
|
// element.
|
|
|
|
if typ.ptrdata != 0 {
|
2016-04-16 18:27:38 -04:00
|
|
|
scanSize = dataSize - typ.size + typ.ptrdata
|
2015-05-04 16:10:49 -04:00
|
|
|
}
|
|
|
|
} else {
|
2016-04-16 18:27:38 -04:00
|
|
|
scanSize = typ.ptrdata
|
2015-05-04 16:10:49 -04:00
|
|
|
}
|
2016-04-16 18:27:38 -04:00
|
|
|
c.local_scan += scanSize
|
2014-08-07 13:34:30 +04:00
|
|
|
}
|
2014-11-04 13:31:34 -05:00
|
|
|
|
2016-05-11 14:57:33 -04:00
|
|
|
// Ensure that the stores above that initialize x to
|
|
|
|
// type-safe memory and set the heap bits occur before
|
|
|
|
// the caller can make x observable to the garbage
|
|
|
|
// collector. Otherwise, on weakly ordered machines,
|
|
|
|
// the garbage collector could follow a pointer to x,
|
|
|
|
// but see uninitialized memory or stale heap bits.
|
|
|
|
publicationBarrier()
|
|
|
|
|
2016-03-30 17:02:23 -04:00
|
|
|
// Allocate black during GC.
|
2014-11-04 13:31:34 -05:00
|
|
|
// All slots hold nil so no scanning is needed.
|
|
|
|
// This may be racing with GC so do it atomically if there can be
|
|
|
|
// a race marking the bit.
|
2016-03-30 17:02:23 -04:00
|
|
|
if gcphase != _GCoff {
|
2016-04-17 11:42:37 -04:00
|
|
|
gcmarknewobject(uintptr(x), size, scanSize)
|
2014-11-04 13:31:34 -05:00
|
|
|
}
|
|
|
|
|
2014-07-30 09:01:52 -07:00
|
|
|
if raceenabled {
|
|
|
|
racemalloc(x, size)
|
|
|
|
}
|
2016-03-02 12:15:02 -05:00
|
|
|
|
2015-10-21 11:04:42 -07:00
|
|
|
if msanenabled {
|
|
|
|
msanmalloc(x, size)
|
|
|
|
}
|
2014-08-18 16:33:39 +04:00
|
|
|
|
2015-01-16 14:43:38 -05:00
|
|
|
mp.mallocing = 0
|
|
|
|
releasem(mp)
|
2014-08-18 16:33:39 +04:00
|
|
|
|
2014-07-30 09:01:52 -07:00
|
|
|
if debug.allocfreetrace != 0 {
|
|
|
|
tracealloc(x, size, typ)
|
|
|
|
}
|
2014-08-13 01:03:32 +04:00
|
|
|
|
|
|
|
if rate := MemProfileRate; rate > 0 {
|
|
|
|
if size < uintptr(rate) && int32(size) < c.next_sample {
|
|
|
|
c.next_sample -= int32(size)
|
|
|
|
} else {
|
2014-08-18 16:33:39 +04:00
|
|
|
mp := acquirem()
|
2014-08-13 01:03:32 +04:00
|
|
|
profilealloc(mp, x, size)
|
2014-08-18 16:33:39 +04:00
|
|
|
releasem(mp)
|
2014-07-30 09:01:52 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-04 20:56:11 -07:00
|
|
|
if assistG != nil {
|
|
|
|
// Account for internal fragmentation in the assist
|
|
|
|
// debt now that we know it.
|
|
|
|
assistG.gcAssistBytes -= int64(size - dataSize)
|
|
|
|
}
|
|
|
|
|
2017-01-09 11:35:42 -05:00
|
|
|
if shouldhelpgc {
|
|
|
|
if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
|
|
|
|
gcStart(gcBackgroundMode, t)
|
|
|
|
}
|
2014-07-30 09:01:52 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
return x
|
|
|
|
}
|
|
|
|
|
2016-04-19 19:35:10 -07:00
|
|
|
func largeAlloc(size uintptr, needzero bool) *mspan {
|
2015-02-19 13:38:46 -05:00
|
|
|
// print("largeAlloc size=", size, "\n")
|
|
|
|
|
|
|
|
if size+_PageSize < size {
|
|
|
|
throw("out of memory")
|
|
|
|
}
|
|
|
|
npages := size >> _PageShift
|
|
|
|
if size&_PageMask != 0 {
|
|
|
|
npages++
|
|
|
|
}
|
runtime: make sweep proportional to spans bytes allocated
Proportional concurrent sweep is currently based on a ratio of spans
to be swept per bytes of object allocation. However, proportional
sweeping is performed during span allocation, not object allocation,
in order to minimize contention and overhead. Since objects are
allocated from spans after those spans are allocated, the system tends
to operate in debt, which means when the next GC cycle starts, there
is often sweep debt remaining, so GC has to finish the sweep, which
delays the start of the cycle and delays enabling mutator assists.
For example, it's quite likely that many Ps will simultaneously refill
their span caches immediately after a GC cycle (because GC flushes the
span caches), but at this point, there has been very little object
allocation since the end of GC, so very little sweeping is done. The
Ps then allocate objects from these cached spans, which drives up the
bytes of object allocation, but since these allocations are coming
from cached spans, nothing considers whether more sweeping has to
happen. If the sweep ratio is high enough (which can happen if the
next GC trigger is very close to the retained heap size), this can
easily represent a sweep debt of thousands of pages.
Fix this by making proportional sweep proportional to the number of
bytes of spans allocated, rather than the number of bytes of objects
allocated. Prior to allocating a span, both the small object path and
the large object path ensure credit for allocating that span, so the
system operates in the black, rather than in the red.
Combined with the previous commit, this should eliminate all sweeping
from GC start up. On the stress test in issue #11911, this reduces the
time spent sweeping during GC (and delaying start up) by several
orders of magnitude:
mean 99%ile max
pre fix 1 ms 11 ms 144 ms
post fix 270 ns 735 ns 916 ns
Updates #11911.
Change-Id: I89223712883954c9d6ec2a7a51ecb97172097df3
Reviewed-on: https://go-review.googlesource.com/13044
Reviewed-by: Rick Hudson <rlh@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
2015-08-03 09:46:50 -04:00
|
|
|
|
|
|
|
// Deduct credit for this span allocation and sweep if
|
|
|
|
// necessary. mHeap_Alloc will also sweep npages, so this only
|
|
|
|
// pays the debt down to npage pages.
|
|
|
|
deductSweepCredit(npages*_PageSize, npages)
|
|
|
|
|
2016-04-19 19:35:10 -07:00
|
|
|
s := mheap_.alloc(npages, 0, true, needzero)
|
2015-02-19 13:38:46 -05:00
|
|
|
if s == nil {
|
|
|
|
throw("out of memory")
|
|
|
|
}
|
2016-03-14 12:02:02 -04:00
|
|
|
s.limit = s.base() + size
|
2016-02-09 09:38:44 -05:00
|
|
|
heapBitsForSpan(s.base()).initSpan(s)
|
2015-02-19 13:38:46 -05:00
|
|
|
return s
|
|
|
|
}
|
|
|
|
|
2014-07-30 09:01:52 -07:00
|
|
|
// implementation of new builtin
|
2016-08-26 15:41:51 -04:00
|
|
|
// compiler (both frontend and SSA backend) knows the signature
|
|
|
|
// of this function
|
2014-07-30 09:01:52 -07:00
|
|
|
func newobject(typ *_type) unsafe.Pointer {
|
2016-04-19 19:35:10 -07:00
|
|
|
return mallocgc(typ.size, typ, true)
|
2014-07-30 09:01:52 -07:00
|
|
|
}
|
|
|
|
|
2014-12-22 13:27:53 -05:00
|
|
|
//go:linkname reflect_unsafe_New reflect.unsafe_New
|
|
|
|
func reflect_unsafe_New(typ *_type) unsafe.Pointer {
|
|
|
|
return newobject(typ)
|
|
|
|
}
|
|
|
|
|
2016-04-20 18:00:52 +02:00
|
|
|
// newarray allocates an array of n elements of type typ.
|
|
|
|
func newarray(typ *_type, n int) unsafe.Pointer {
|
|
|
|
if n < 0 || uintptr(n) > maxSliceCap(typ.size) {
|
2016-03-27 17:29:53 -07:00
|
|
|
panic(plainError("runtime: allocation size out of range"))
|
2014-07-30 09:01:52 -07:00
|
|
|
}
|
2016-04-20 18:00:52 +02:00
|
|
|
return mallocgc(typ.size*uintptr(n), typ, true)
|
2014-07-30 09:01:52 -07:00
|
|
|
}
|
|
|
|
|
2014-12-22 13:27:53 -05:00
|
|
|
//go:linkname reflect_unsafe_NewArray reflect.unsafe_NewArray
|
2016-04-20 18:00:52 +02:00
|
|
|
func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer {
|
2014-12-22 13:27:53 -05:00
|
|
|
return newarray(typ, n)
|
|
|
|
}
|
|
|
|
|
2014-07-30 09:01:52 -07:00
|
|
|
func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
|
2015-09-14 14:03:45 -07:00
|
|
|
mp.mcache.next_sample = nextSample()
|
2014-09-01 18:51:12 -04:00
|
|
|
mProf_Malloc(x, size)
|
2014-07-30 09:01:52 -07:00
|
|
|
}
|
|
|
|
|
2015-09-14 14:03:45 -07:00
|
|
|
// nextSample returns the next sampling point for heap profiling.
|
|
|
|
// It produces a random variable with a geometric distribution and
|
|
|
|
// mean MemProfileRate. This is done by generating a uniformly
|
|
|
|
// distributed random number and applying the cumulative distribution
|
|
|
|
// function for an exponential.
|
|
|
|
func nextSample() int32 {
|
2015-10-28 06:44:26 +01:00
|
|
|
if GOOS == "plan9" {
|
|
|
|
// Plan 9 doesn't support floating point in note handler.
|
|
|
|
if g := getg(); g == g.m.gsignal {
|
|
|
|
return nextSampleNoFP()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-14 14:03:45 -07:00
|
|
|
period := MemProfileRate
|
|
|
|
|
|
|
|
// make nextSample not overflow. Maximum possible step is
|
|
|
|
// -ln(1/(1<<kRandomBitCount)) * period, approximately 20 * period.
|
|
|
|
switch {
|
|
|
|
case period > 0x7000000:
|
|
|
|
period = 0x7000000
|
|
|
|
case period == 0:
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
// Let m be the sample rate,
|
|
|
|
// the probability distribution function is m*exp(-mx), so the CDF is
|
|
|
|
// p = 1 - exp(-mx), so
|
|
|
|
// q = 1 - p == exp(-mx)
|
|
|
|
// log_e(q) = -mx
|
|
|
|
// -log_e(q)/m = x
|
|
|
|
// x = -log_e(q) * period
|
|
|
|
// x = log_2(q) * (-log_e(2)) * period ; Using log_2 for efficiency
|
|
|
|
const randomBitCount = 26
|
2016-06-28 09:22:46 -07:00
|
|
|
q := fastrand()%(1<<randomBitCount) + 1
|
2015-09-14 14:03:45 -07:00
|
|
|
qlog := fastlog2(float64(q)) - randomBitCount
|
|
|
|
if qlog > 0 {
|
|
|
|
qlog = 0
|
|
|
|
}
|
|
|
|
const minusLog2 = -0.6931471805599453 // -ln(2)
|
|
|
|
return int32(qlog*(minusLog2*float64(period))) + 1
|
|
|
|
}
|
|
|
|
|
2015-10-28 06:44:26 +01:00
|
|
|
// nextSampleNoFP is similar to nextSample, but uses older,
|
|
|
|
// simpler code to avoid floating point.
|
|
|
|
func nextSampleNoFP() int32 {
|
|
|
|
// Set first allocation sample size.
|
|
|
|
rate := MemProfileRate
|
|
|
|
if rate > 0x3fffffff { // make 2*rate not overflow
|
|
|
|
rate = 0x3fffffff
|
|
|
|
}
|
|
|
|
if rate != 0 {
|
2017-02-12 13:18:22 +03:00
|
|
|
return int32(fastrand() % uint32(2*rate))
|
2015-10-28 06:44:26 +01:00
|
|
|
}
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
2015-03-08 20:56:15 -04:00
|
|
|
// persistentAlloc is the bump-allocation state used by persistentalloc1:
// a current chunk plus the offset of the next free byte inside it.
type persistentAlloc struct {
	// base is the start of the current chunk; nil until the first
	// chunk has been obtained.
	base unsafe.Pointer
	// off is the offset from base at which the next allocation begins.
	off uintptr
}
|
|
|
|
|
2015-03-08 20:56:15 -04:00
|
|
|
var globalAlloc struct {
|
|
|
|
mutex
|
|
|
|
persistentAlloc
|
|
|
|
}
|
|
|
|
|
2014-09-04 00:54:06 -04:00
|
|
|
// Wrapper around sysAlloc that can allocate small chunks.
|
|
|
|
// There is no associated free operation.
|
|
|
|
// Intended for things like function/type/debug-related persistent data.
|
|
|
|
// If align is 0, uses default align (currently 8).
|
runtime: make fixalloc zero allocations on reuse
Currently fixalloc does not zero memory it reuses. This is dangerous
with the hybrid barrier if the type may contain heap pointers, since
it may cause us to observe a dead heap pointer on reuse. It's also
error-prone since it's the only allocator that doesn't zero on
allocation (mallocgc of course zeroes, but so do persistentalloc and
sysAlloc). It's also largely pointless: for mcache, the caller
immediately memclrs the allocation; and the two specials types are
tiny so there's no real cost to zeroing them.
Change fixalloc to zero allocations by default.
The only type we don't zero by default is mspan. This actually
requires that the span's sweepgen survive across freeing and
reallocating a span. If we were to zero it, the following race would
be possible:
1. The current sweepgen is 2. Span s is on the unswept list.
2. Direct sweeping sweeps span s, finds it's all free, and releases s
to the fixalloc.
3. Thread 1 allocates s from fixalloc. Suppose this zeros s, including
s.sweepgen.
4. Thread 1 calls s.init, which sets s.state to _MSpanDead.
5. On thread 2, background sweeping comes across span s in allspans
and cas's s.sweepgen from 0 (sg-2) to 1 (sg-1). Now it thinks it
owns it for sweeping.
6. Thread 1 continues initializing s.
Everything breaks.
I would like to fix this because it's obviously confusing, but it's a
subtle enough problem that I'm leaving it alone for now. The solution
may be to skip sweepgen 0, but then we have to think about wrap-around
much more carefully.
Updates #17503.
Change-Id: Ie08691feed3abbb06a31381b94beb0a2e36a0613
Reviewed-on: https://go-review.googlesource.com/31368
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
2016-09-25 17:12:43 -04:00
|
|
|
// The returned memory will be zeroed.
|
2016-10-11 22:58:21 -04:00
|
|
|
//
|
|
|
|
// Consider marking persistentalloc'd types go:notinheap.
|
2015-04-16 14:32:18 -07:00
|
|
|
func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
|
2015-06-07 21:45:39 -04:00
|
|
|
var p unsafe.Pointer
|
|
|
|
systemstack(func() {
|
|
|
|
p = persistentalloc1(size, align, sysStat)
|
|
|
|
})
|
|
|
|
return p
|
|
|
|
}
|
|
|
|
|
|
|
|
// Must run on system stack because stack growth can (re)invoke it.
|
|
|
|
// See issue 9174.
|
|
|
|
//go:systemstack
|
|
|
|
func persistentalloc1(size, align uintptr, sysStat *uint64) unsafe.Pointer {
|
2014-09-04 00:54:06 -04:00
|
|
|
const (
|
|
|
|
chunk = 256 << 10
|
|
|
|
maxBlock = 64 << 10 // VM reservation granularity is 64K on windows
|
|
|
|
)
|
|
|
|
|
2015-01-14 14:13:55 -05:00
|
|
|
if size == 0 {
|
|
|
|
throw("persistentalloc: size == 0")
|
|
|
|
}
|
2014-09-04 00:54:06 -04:00
|
|
|
if align != 0 {
|
|
|
|
if align&(align-1) != 0 {
|
2014-12-27 20:58:00 -08:00
|
|
|
throw("persistentalloc: align is not a power of 2")
|
2014-09-04 00:54:06 -04:00
|
|
|
}
|
|
|
|
if align > _PageSize {
|
2014-12-27 20:58:00 -08:00
|
|
|
throw("persistentalloc: align is too large")
|
2014-09-04 00:54:06 -04:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
align = 8
|
|
|
|
}
|
|
|
|
|
|
|
|
if size >= maxBlock {
|
2015-04-16 14:32:18 -07:00
|
|
|
return sysAlloc(size, sysStat)
|
2014-09-04 00:54:06 -04:00
|
|
|
}
|
|
|
|
|
2015-03-08 20:56:15 -04:00
|
|
|
mp := acquirem()
|
|
|
|
var persistent *persistentAlloc
|
2015-04-17 00:21:30 -04:00
|
|
|
if mp != nil && mp.p != 0 {
|
|
|
|
persistent = &mp.p.ptr().palloc
|
2015-03-08 20:56:15 -04:00
|
|
|
} else {
|
|
|
|
lock(&globalAlloc.mutex)
|
|
|
|
persistent = &globalAlloc.persistentAlloc
|
|
|
|
}
|
2015-01-14 14:13:55 -05:00
|
|
|
persistent.off = round(persistent.off, align)
|
2015-01-14 15:48:32 -05:00
|
|
|
if persistent.off+size > chunk || persistent.base == nil {
|
2015-01-14 14:13:55 -05:00
|
|
|
persistent.base = sysAlloc(chunk, &memstats.other_sys)
|
|
|
|
if persistent.base == nil {
|
2015-03-08 20:56:15 -04:00
|
|
|
if persistent == &globalAlloc.persistentAlloc {
|
|
|
|
unlock(&globalAlloc.mutex)
|
|
|
|
}
|
2014-12-27 20:58:00 -08:00
|
|
|
throw("runtime: cannot allocate memory")
|
2014-09-04 00:54:06 -04:00
|
|
|
}
|
2015-01-14 14:13:55 -05:00
|
|
|
persistent.off = 0
|
2014-09-04 00:54:06 -04:00
|
|
|
}
|
2015-01-14 14:13:55 -05:00
|
|
|
p := add(persistent.base, persistent.off)
|
|
|
|
persistent.off += size
|
2015-03-08 20:56:15 -04:00
|
|
|
releasem(mp)
|
|
|
|
if persistent == &globalAlloc.persistentAlloc {
|
|
|
|
unlock(&globalAlloc.mutex)
|
|
|
|
}
|
2014-09-04 00:54:06 -04:00
|
|
|
|
2015-04-16 14:32:18 -07:00
|
|
|
if sysStat != &memstats.other_sys {
|
|
|
|
mSysStatInc(sysStat, size)
|
|
|
|
mSysStatDec(&memstats.other_sys, size)
|
2014-09-04 00:54:06 -04:00
|
|
|
}
|
|
|
|
return p
|
|
|
|
}
|