2015-01-16 14:43:38 -05:00
|
|
|
// Copyright 2009 The Go Authors. All rights reserved.
|
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
|
|
|
|
|
// Garbage collector: type and heap bitmaps.
|
|
|
|
|
//
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 22:45:57 -04:00
|
|
|
// Stack, data, and bss bitmaps
|
|
|
|
|
//
|
2015-05-04 10:19:24 -04:00
|
|
|
// Stack frames and global variables in the data and bss sections are described
|
|
|
|
|
// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
|
|
|
|
|
// to be visited during GC.
|
2015-01-16 14:43:38 -05:00
|
|
|
//
|
|
|
|
|
// Heap bitmap
|
|
|
|
|
//
|
|
|
|
|
// The allocated heap comes from a subset of the memory in the range [start, used),
|
|
|
|
|
// where start == mheap_.arena_start and used == mheap_.arena_used.
|
2015-05-04 10:19:24 -04:00
|
|
|
// The heap bitmap comprises 2 bits for each pointer-sized word in that range,
|
2015-01-16 14:43:38 -05:00
|
|
|
// stored in bytes indexed backward in memory from start.
|
2015-05-04 10:19:24 -04:00
|
|
|
// That is, the byte at address start-1 holds the 2-bit entries for the four words
|
|
|
|
|
// start through start+3*ptrSize, the byte at start-2 holds the entries for
|
|
|
|
|
// start+4*ptrSize through start+7*ptrSize, and so on.
|
|
|
|
|
// In each byte, the low 2 bits describe the first word, the next 2 bits describe
|
|
|
|
|
// the next word, and so on.
|
2015-01-16 14:43:38 -05:00
|
|
|
//
|
2015-05-04 10:19:24 -04:00
|
|
|
// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
|
|
|
|
|
// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
|
|
|
|
|
// The meaning of the high bit depends on the position of the word being described
|
|
|
|
|
// in its allocated object. In the first word, the high bit is the GC ``marked'' bit.
|
|
|
|
|
// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
|
|
|
|
|
// In the third and later words, the high bit indicates that the object is still
|
|
|
|
|
// being described. In these words, if a bit pair with a high bit 0 is encountered,
|
|
|
|
|
// the low bit can also be assumed to be 0, and the object description is over.
|
|
|
|
|
// This 00 is called the ``dead'' encoding: it signals that the rest of the words
|
|
|
|
|
// in the object are uninteresting to the garbage collector.
|
2015-01-16 14:43:38 -05:00
|
|
|
//
|
2015-05-04 10:19:24 -04:00
|
|
|
// The code makes use of the fact that the zero value for a heap bitmap
|
|
|
|
|
// has no live pointer bit set and is (depending on position), not marked,
|
|
|
|
|
// not checkmarked, and is the dead encoding.
|
2015-01-16 14:43:38 -05:00
|
|
|
// These properties must be preserved when modifying the encoding.
|
|
|
|
|
//
|
|
|
|
|
// Checkmarks
|
|
|
|
|
//
|
|
|
|
|
// In a concurrent garbage collector, one worries about failing to mark
|
|
|
|
|
// a live object due to mutations without write barriers or bugs in the
|
|
|
|
|
// collector implementation. As a sanity check, the GC has a 'checkmark'
|
|
|
|
|
// mode that retraverses the object graph with the world stopped, to make
|
|
|
|
|
// sure that everything that should be marked is marked.
|
2015-05-04 10:19:24 -04:00
|
|
|
// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
|
|
|
|
|
// for the second word of the object holds the checkmark bit.
|
|
|
|
|
// When not in checkmark mode, this bit is set to 1.
|
2015-01-16 14:43:38 -05:00
|
|
|
//
|
2015-05-04 10:19:24 -04:00
|
|
|
// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
|
|
|
|
|
// means every allocated object has two words, so there is room for the
|
|
|
|
|
// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
|
|
|
|
|
// just one word, so the second bit pair is not available for encoding the
|
|
|
|
|
// checkmark. However, because non-pointer allocations are combined
|
|
|
|
|
// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
|
|
|
|
|
// must be a pointer, so the type bit in the first word is not actually needed.
|
|
|
|
|
// It is still used in general, except in checkmark mode the type bit is repurposed
|
|
|
|
|
// as the checkmark bit and then reinitialized (to 1) as the type bit when
|
|
|
|
|
// finished.
|
2015-01-16 14:43:38 -05:00
|
|
|
|
|
|
|
|
package runtime
|
|
|
|
|
|
|
|
|
|
import "unsafe"
|
|
|
|
|
|
|
|
|
|
const (
|
2015-05-04 10:19:24 -04:00
|
|
|
bitPointer = 1
|
|
|
|
|
bitMarked = 2
|
2015-01-16 14:43:38 -05:00
|
|
|
|
2015-05-04 10:19:24 -04:00
|
|
|
heapBitsWidth = 2 // heap bitmap bits to describe one pointer
|
|
|
|
|
heapBitmapScale = ptrSize * (8 / heapBitsWidth) // number of data bytes described by one heap bitmap byte
|
|
|
|
|
)
|
2015-02-19 13:38:46 -05:00
|
|
|
|
2015-01-16 14:43:38 -05:00
|
|
|
// addb returns the byte pointer p+n.
|
|
|
|
|
//go:nowritebarrier
|
|
|
|
|
func addb(p *byte, n uintptr) *byte {
|
|
|
|
|
return (*byte)(add(unsafe.Pointer(p), n))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// subtractb returns the byte pointer p-n.
|
|
|
|
|
//go:nowritebarrier
|
|
|
|
|
func subtractb(p *byte, n uintptr) *byte {
|
|
|
|
|
return (*byte)(add(unsafe.Pointer(p), -n))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// mHeap_MapBits is called each time arena_used is extended.
|
|
|
|
|
// It maps any additional bitmap memory needed for the new arena memory.
|
|
|
|
|
//
|
|
|
|
|
//go:nowritebarrier
|
|
|
|
|
func mHeap_MapBits(h *mheap) {
|
|
|
|
|
// Caller has added extra mappings to the arena.
|
|
|
|
|
// Add extra mappings of bitmap words as needed.
|
|
|
|
|
// We allocate extra bitmap pieces in chunks of bitmapChunk.
|
|
|
|
|
const bitmapChunk = 8192
|
|
|
|
|
|
|
|
|
|
n := (mheap_.arena_used - mheap_.arena_start) / heapBitmapScale
|
|
|
|
|
n = round(n, bitmapChunk)
|
|
|
|
|
n = round(n, _PhysPageSize)
|
|
|
|
|
if h.bitmap_mapped >= n {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sysMap(unsafe.Pointer(h.arena_start-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
|
|
|
|
|
h.bitmap_mapped = n
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// heapBits provides access to the bitmap bits for a single heap word.
// The methods on heapBits take value receivers so that the compiler
// can more easily inline calls to those methods and registerize the
// struct fields independently.
type heapBits struct {
	bitp  *uint8 // bitmap byte that holds the entry for the word
	shift uint32 // bit offset of the word's entry within *bitp
}
|
|
|
|
|
|
|
|
|
|
// heapBitsForAddr returns the heapBits for the address addr.
|
|
|
|
|
// The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
|
|
|
|
|
func heapBitsForAddr(addr uintptr) heapBits {
|
2015-05-04 10:19:24 -04:00
|
|
|
// 2 bits per work, 4 pairs per byte, and a mask is hard coded.
|
2015-01-16 14:43:38 -05:00
|
|
|
off := (addr - mheap_.arena_start) / ptrSize
|
2015-05-04 10:19:24 -04:00
|
|
|
return heapBits{(*uint8)(unsafe.Pointer(mheap_.arena_start - off/4 - 1)), uint32(2 * (off & 3))}
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// heapBitsForSpan returns the heapBits for the span base address base.
|
|
|
|
|
func heapBitsForSpan(base uintptr) (hbits heapBits) {
|
|
|
|
|
if base < mheap_.arena_start || base >= mheap_.arena_end {
|
|
|
|
|
throw("heapBitsForSpan: base out of range")
|
|
|
|
|
}
|
|
|
|
|
hbits = heapBitsForAddr(base)
|
|
|
|
|
if hbits.shift != 0 {
|
|
|
|
|
throw("heapBitsForSpan: unaligned start")
|
|
|
|
|
}
|
|
|
|
|
return hbits
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// heapBitsForObject returns the base address for the heap object
|
|
|
|
|
// containing the address p, along with the heapBits for base.
|
2015-03-03 16:55:14 -05:00
|
|
|
// If p does not point into a heap object,
|
|
|
|
|
// return base == 0
|
|
|
|
|
// otherwise return the base of the object.
|
|
|
|
|
func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits, s *mspan) {
|
2015-04-15 17:08:58 -04:00
|
|
|
arenaStart := mheap_.arena_start
|
|
|
|
|
if p < arenaStart || p >= mheap_.arena_used {
|
2015-01-16 14:43:38 -05:00
|
|
|
return
|
|
|
|
|
}
|
2015-04-15 17:08:58 -04:00
|
|
|
off := p - arenaStart
|
|
|
|
|
idx := off >> _PageShift
|
2015-03-03 16:55:14 -05:00
|
|
|
// p points into the heap, but possibly to the middle of an object.
|
2015-01-16 14:43:38 -05:00
|
|
|
// Consult the span table to find the block beginning.
|
|
|
|
|
k := p >> _PageShift
|
2015-04-15 17:08:58 -04:00
|
|
|
s = h_spans[idx]
|
2015-01-16 14:43:38 -05:00
|
|
|
if s == nil || pageID(k) < s.start || p >= s.limit || s.state != mSpanInUse {
|
2015-02-24 09:25:09 -08:00
|
|
|
if s == nil || s.state == _MSpanStack {
|
|
|
|
|
// If s is nil, the virtual address has never been part of the heap.
|
|
|
|
|
// This pointer may be to some mmap'd region, so we allow it.
|
|
|
|
|
// Pointers into stacks are also ok, the runtime manages these explicitly.
|
2015-01-16 14:43:38 -05:00
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// The following ensures that we are rigorous about what data
|
|
|
|
|
// structures hold valid pointers.
|
|
|
|
|
// TODO(rsc): Check if this still happens.
|
|
|
|
|
if false {
|
|
|
|
|
// Still happens sometimes. We don't know why.
|
|
|
|
|
printlock()
|
|
|
|
|
print("runtime:objectstart Span weird: p=", hex(p), " k=", hex(k))
|
|
|
|
|
if s == nil {
|
|
|
|
|
print(" s=nil\n")
|
|
|
|
|
} else {
|
|
|
|
|
print(" s.start=", hex(s.start<<_PageShift), " s.limit=", hex(s.limit), " s.state=", s.state, "\n")
|
|
|
|
|
}
|
|
|
|
|
printunlock()
|
|
|
|
|
throw("objectstart: bad pointer in unexpected span")
|
|
|
|
|
}
|
2015-04-23 16:28:15 -04:00
|
|
|
return
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
2015-04-15 17:08:58 -04:00
|
|
|
// If this span holds object of a power of 2 size, just mask off the bits to
|
|
|
|
|
// the interior of the object. Otherwise use the size to get the base.
|
|
|
|
|
if s.baseMask != 0 {
|
|
|
|
|
// optimize for power of 2 sized objects.
|
|
|
|
|
base = s.base()
|
|
|
|
|
base = base + (p-base)&s.baseMask
|
|
|
|
|
// base = p & s.baseMask is faster for small spans,
|
|
|
|
|
// but doesn't work for large spans.
|
|
|
|
|
// Overall, it's faster to use the more general computation above.
|
|
|
|
|
} else {
|
|
|
|
|
base = s.base()
|
|
|
|
|
if p-base >= s.elemsize {
|
|
|
|
|
// n := (p - base) / s.elemsize, using division by multiplication
|
|
|
|
|
n := uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2)
|
|
|
|
|
base += n * s.elemsize
|
2015-03-04 11:34:50 -05:00
|
|
|
}
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
// Now that we know the actual base, compute heapBits to return to caller.
|
|
|
|
|
hbits = heapBitsForAddr(base)
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-03 16:55:14 -05:00
|
|
|
// prefetch the bits.
|
|
|
|
|
func (h heapBits) prefetch() {
|
|
|
|
|
prefetchnta(uintptr(unsafe.Pointer((h.bitp))))
|
|
|
|
|
}
|
|
|
|
|
|
2015-01-16 14:43:38 -05:00
|
|
|
// next returns the heapBits describing the next pointer-sized word in memory.
|
|
|
|
|
// That is, if h describes address p, h.next() describes p+ptrSize.
|
|
|
|
|
// Note that next does not modify h. The caller must record the result.
|
|
|
|
|
func (h heapBits) next() heapBits {
|
2015-05-04 10:19:24 -04:00
|
|
|
if h.shift < 8-heapBitsWidth {
|
|
|
|
|
return heapBits{h.bitp, h.shift + heapBitsWidth}
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
return heapBits{subtractb(h.bitp, 1), 0}
|
|
|
|
|
}
|
|
|
|
|
|
2015-05-04 10:19:24 -04:00
|
|
|
// forward returns the heapBits describing n pointer-sized words ahead of h in memory.
|
|
|
|
|
// That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
|
|
|
|
|
// h.forward(1) is equivalent to h.next(), just slower.
|
|
|
|
|
// Note that forward does not modify h. The caller must record the result.
|
|
|
|
|
// bits returns the heap bits for the current word.
|
|
|
|
|
func (h heapBits) forward(n uintptr) heapBits {
|
|
|
|
|
n += uintptr(h.shift) / heapBitsWidth
|
|
|
|
|
return heapBits{subtractb(h.bitp, n/4), uint32(n%4) * heapBitsWidth}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// The caller can test isMarked and isPointer by &-ing with bitMarked and bitPointer.
|
|
|
|
|
// The result includes in its higher bits the bits for subsequent words
|
|
|
|
|
// described by the same bitmap byte.
|
|
|
|
|
func (h heapBits) bits() uint32 {
|
|
|
|
|
return uint32(*h.bitp) >> h.shift
|
|
|
|
|
}
|
|
|
|
|
|
2015-01-16 14:43:38 -05:00
|
|
|
// isMarked reports whether the heap bits have the marked bit set.
|
2015-05-04 10:19:24 -04:00
|
|
|
// h must describe the initial word of the object.
|
2015-01-16 14:43:38 -05:00
|
|
|
func (h heapBits) isMarked() bool {
|
|
|
|
|
return *h.bitp&(bitMarked<<h.shift) != 0
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// setMarked sets the marked bit in the heap bits, atomically.
|
2015-05-04 10:19:24 -04:00
|
|
|
// h must describe the initial word of the object.
|
2015-01-16 14:43:38 -05:00
|
|
|
func (h heapBits) setMarked() {
|
2015-05-04 10:19:24 -04:00
|
|
|
// Each byte of GC bitmap holds info for four words.
|
2015-02-27 12:41:20 -05:00
|
|
|
// Might be racing with other updates, so use atomic update always.
|
|
|
|
|
// We used to be clever here and use a non-atomic update in certain
|
|
|
|
|
// cases, but it's not worth the risk.
|
2015-01-16 14:43:38 -05:00
|
|
|
atomicor8(h.bitp, bitMarked<<h.shift)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// setMarkedNonAtomic sets the marked bit in the heap bits, non-atomically.
|
2015-05-04 10:19:24 -04:00
|
|
|
// h must describe the initial word of the object.
|
2015-01-16 14:43:38 -05:00
|
|
|
func (h heapBits) setMarkedNonAtomic() {
|
|
|
|
|
*h.bitp |= bitMarked << h.shift
|
|
|
|
|
}
|
|
|
|
|
|
2015-05-04 10:19:24 -04:00
|
|
|
// isPointer reports whether the heap bits describe a pointer word.
|
|
|
|
|
// h must describe the initial word of the object.
|
|
|
|
|
func (h heapBits) isPointer() bool {
|
|
|
|
|
return (*h.bitp>>h.shift)&bitPointer != 0
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// hasPointers reports whether the given object has any pointers.
|
|
|
|
|
// It must be told how large the object at h is, so that it does not read too
|
|
|
|
|
// far into the bitmap.
|
|
|
|
|
// h must describe the initial word of the object.
|
|
|
|
|
func (h heapBits) hasPointers(size uintptr) bool {
|
|
|
|
|
if size == ptrSize { // 1-word objects are always pointers
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
// Otherwise, at least a 2-word object, and at least 2-word aligned,
|
|
|
|
|
// so h.shift is either 0 or 4, so we know we can get the bits for the
|
|
|
|
|
// first two words out of *h.bitp.
|
|
|
|
|
// If either of the first two words is a pointer, not pointer free.
|
|
|
|
|
b := uint32(*h.bitp >> h.shift)
|
|
|
|
|
if b&(bitPointer|bitPointer<<heapBitsWidth) != 0 {
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
if size == 2*ptrSize {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
// At least a 4-word object. Check scan bit (aka marked bit) in third word.
|
|
|
|
|
if h.shift == 0 {
|
|
|
|
|
return b&(bitMarked<<(2*heapBitsWidth)) != 0
|
|
|
|
|
}
|
|
|
|
|
return uint32(*subtractb(h.bitp, 1))&bitMarked != 0
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// isCheckmarked reports whether the heap bits have the checkmarked bit set.
|
2015-05-04 10:19:24 -04:00
|
|
|
// It must be told how large the object at h is, because the encoding of the
|
|
|
|
|
// checkmark bit varies by size.
|
|
|
|
|
// h must describe the initial word of the object.
|
|
|
|
|
func (h heapBits) isCheckmarked(size uintptr) bool {
|
|
|
|
|
if size == ptrSize {
|
|
|
|
|
return (*h.bitp>>h.shift)&bitPointer != 0
|
|
|
|
|
}
|
|
|
|
|
// All multiword objects are 2-word aligned,
|
|
|
|
|
// so we know that the initial word's 2-bit pair
|
|
|
|
|
// and the second word's 2-bit pair are in the
|
|
|
|
|
// same heap bitmap byte, *h.bitp.
|
|
|
|
|
return (*h.bitp>>(heapBitsWidth+h.shift))&bitMarked != 0
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// setCheckmarked sets the checkmarked bit.
|
2015-05-04 10:19:24 -04:00
|
|
|
// It must be told how large the object at h is, because the encoding of the
|
|
|
|
|
// checkmark bit varies by size.
|
|
|
|
|
// h must describe the initial word of the object.
|
|
|
|
|
func (h heapBits) setCheckmarked(size uintptr) {
|
|
|
|
|
if size == ptrSize {
|
|
|
|
|
atomicor8(h.bitp, bitPointer<<h.shift)
|
|
|
|
|
return
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
2015-05-04 10:19:24 -04:00
|
|
|
atomicor8(h.bitp, bitMarked<<(heapBitsWidth+h.shift))
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// The methods operating on spans all require that h has been returned
|
|
|
|
|
// by heapBitsForSpan and that size, n, total are the span layout description
|
|
|
|
|
// returned by the mspan's layout method.
|
|
|
|
|
// If total > size*n, it means that there is extra leftover memory in the span,
|
|
|
|
|
// usually due to rounding.
|
|
|
|
|
//
|
|
|
|
|
// TODO(rsc): Perhaps introduce a different heapBitsSpan type.
|
|
|
|
|
|
|
|
|
|
// initSpan initializes the heap bitmap for a span.
|
|
|
|
|
func (h heapBits) initSpan(size, n, total uintptr) {
|
|
|
|
|
if total%heapBitmapScale != 0 {
|
2015-03-03 16:55:14 -05:00
|
|
|
throw("initSpan: unaligned length")
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
nbyte := total / heapBitmapScale
|
|
|
|
|
memclr(unsafe.Pointer(subtractb(h.bitp, nbyte-1)), nbyte)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// initCheckmarkSpan initializes a span for being checkmarked.
|
2015-05-04 10:19:24 -04:00
|
|
|
// It clears the checkmark bits, which are set to 1 in normal operation.
|
2015-01-16 14:43:38 -05:00
|
|
|
func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
|
2015-05-04 10:19:24 -04:00
|
|
|
// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
|
|
|
|
|
if ptrSize == 8 && size == ptrSize {
|
|
|
|
|
// Checkmark bit is type bit, bottom bit of every 2-bit entry.
|
2015-01-16 14:43:38 -05:00
|
|
|
// Only possible on 64-bit system, since minimum size is 8.
|
2015-05-04 10:19:24 -04:00
|
|
|
// Must clear type bit (checkmark bit) of every word.
|
|
|
|
|
// The type bit is the lower of every two-bit pair.
|
2015-01-16 14:43:38 -05:00
|
|
|
bitp := h.bitp
|
2015-05-04 10:19:24 -04:00
|
|
|
for i := uintptr(0); i < n; i += 4 {
|
|
|
|
|
*bitp &^= bitPointer | bitPointer<<2 | bitPointer<<4 | bitPointer<<6
|
2015-01-16 14:43:38 -05:00
|
|
|
bitp = subtractb(bitp, 1)
|
|
|
|
|
}
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
for i := uintptr(0); i < n; i++ {
|
2015-05-04 10:19:24 -04:00
|
|
|
*h.bitp &^= bitMarked << (heapBitsWidth + h.shift)
|
|
|
|
|
h = h.forward(size / ptrSize)
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-05-04 10:19:24 -04:00
|
|
|
// clearCheckmarkSpan undoes all the checkmarking in a span.
|
|
|
|
|
// The actual checkmark bits are ignored, so the only work to do
|
|
|
|
|
// is to fix the pointer bits. (Pointer bits are ignored by scanobject
|
|
|
|
|
// but consulted by typedmemmove.)
|
2015-01-16 14:43:38 -05:00
|
|
|
func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
|
2015-05-04 10:19:24 -04:00
|
|
|
// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
|
|
|
|
|
if ptrSize == 8 && size == ptrSize {
|
|
|
|
|
// Checkmark bit is type bit, bottom bit of every 2-bit entry.
|
2015-01-16 14:43:38 -05:00
|
|
|
// Only possible on 64-bit system, since minimum size is 8.
|
2015-05-04 10:19:24 -04:00
|
|
|
// Must clear type bit (checkmark bit) of every word.
|
|
|
|
|
// The type bit is the lower of every two-bit pair.
|
2015-01-16 14:43:38 -05:00
|
|
|
bitp := h.bitp
|
2015-05-04 10:19:24 -04:00
|
|
|
for i := uintptr(0); i < n; i += 4 {
|
|
|
|
|
*bitp |= bitPointer | bitPointer<<2 | bitPointer<<4 | bitPointer<<6
|
2015-01-16 14:43:38 -05:00
|
|
|
bitp = subtractb(bitp, 1)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// heapBitsSweepSpan coordinates the sweeping of a span by reading
|
|
|
|
|
// and updating the corresponding heap bitmap entries.
|
|
|
|
|
// For each free object in the span, heapBitsSweepSpan sets the type
|
|
|
|
|
// bits for the first two words (or one for single-word objects) to typeDead
|
|
|
|
|
// and then calls f(p), where p is the object's base address.
|
|
|
|
|
// f is expected to add the object to a free list.
|
2015-05-04 10:19:24 -04:00
|
|
|
// For non-free objects, heapBitsSweepSpan turns off the marked bit.
|
2015-01-16 14:43:38 -05:00
|
|
|
func heapBitsSweepSpan(base, size, n uintptr, f func(uintptr)) {
|
|
|
|
|
h := heapBitsForSpan(base)
|
2015-05-04 10:19:24 -04:00
|
|
|
switch {
|
|
|
|
|
default:
|
|
|
|
|
throw("heapBitsSweepSpan")
|
|
|
|
|
case size == ptrSize:
|
|
|
|
|
// Consider mark bits in all four 2-bit entries of each bitmap byte.
|
2015-01-16 14:43:38 -05:00
|
|
|
bitp := h.bitp
|
2015-05-04 10:19:24 -04:00
|
|
|
for i := uintptr(0); i < n; i += 4 {
|
|
|
|
|
x := uint32(*bitp)
|
2015-01-16 14:43:38 -05:00
|
|
|
if x&bitMarked != 0 {
|
|
|
|
|
x &^= bitMarked
|
|
|
|
|
} else {
|
2015-05-04 10:19:24 -04:00
|
|
|
x &^= bitPointer
|
2015-01-16 14:43:38 -05:00
|
|
|
f(base + i*ptrSize)
|
|
|
|
|
}
|
2015-05-04 10:19:24 -04:00
|
|
|
if x&(bitMarked<<2) != 0 {
|
|
|
|
|
x &^= bitMarked << 2
|
|
|
|
|
} else {
|
|
|
|
|
x &^= bitPointer << 2
|
|
|
|
|
f(base + (i+1)*ptrSize)
|
|
|
|
|
}
|
2015-01-16 14:43:38 -05:00
|
|
|
if x&(bitMarked<<4) != 0 {
|
|
|
|
|
x &^= bitMarked << 4
|
|
|
|
|
} else {
|
2015-05-04 10:19:24 -04:00
|
|
|
x &^= bitPointer << 4
|
|
|
|
|
f(base + (i+2)*ptrSize)
|
|
|
|
|
}
|
|
|
|
|
if x&(bitMarked<<6) != 0 {
|
|
|
|
|
x &^= bitMarked << 6
|
|
|
|
|
} else {
|
|
|
|
|
x &^= bitPointer << 6
|
|
|
|
|
f(base + (i+3)*ptrSize)
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
*bitp = uint8(x)
|
|
|
|
|
bitp = subtractb(bitp, 1)
|
|
|
|
|
}
|
|
|
|
|
|
2015-05-04 10:19:24 -04:00
|
|
|
case size%(4*ptrSize) == 0:
|
|
|
|
|
// Mark bit is in first word of each object.
|
|
|
|
|
// Each object starts at bit 0 of a heap bitmap byte.
|
|
|
|
|
bitp := h.bitp
|
|
|
|
|
step := size / heapBitmapScale
|
|
|
|
|
for i := uintptr(0); i < n; i++ {
|
|
|
|
|
x := uint32(*bitp)
|
|
|
|
|
if x&bitMarked != 0 {
|
|
|
|
|
x &^= bitMarked
|
|
|
|
|
} else {
|
|
|
|
|
x = 0
|
|
|
|
|
f(base + i*size)
|
|
|
|
|
}
|
|
|
|
|
*bitp = uint8(x)
|
|
|
|
|
bitp = subtractb(bitp, step)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
case size%(4*ptrSize) == 2*ptrSize:
|
|
|
|
|
// Mark bit is in first word of each object,
|
|
|
|
|
// but every other object starts halfway through a heap bitmap byte.
|
|
|
|
|
// Unroll loop 2x to handle alternating shift count and step size.
|
|
|
|
|
bitp := h.bitp
|
|
|
|
|
step := size / heapBitmapScale
|
|
|
|
|
var i uintptr
|
|
|
|
|
for i = uintptr(0); i < n; i += 2 {
|
|
|
|
|
x := uint32(*bitp)
|
|
|
|
|
if x&bitMarked != 0 {
|
|
|
|
|
x &^= bitMarked
|
|
|
|
|
} else {
|
|
|
|
|
x &^= 0x0f
|
|
|
|
|
f(base + i*size)
|
|
|
|
|
if size > 2*ptrSize {
|
|
|
|
|
x = 0
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
*bitp = uint8(x)
|
|
|
|
|
if i+1 >= n {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
bitp = subtractb(bitp, step)
|
|
|
|
|
x = uint32(*bitp)
|
|
|
|
|
if x&(bitMarked<<4) != 0 {
|
|
|
|
|
x &^= bitMarked << 4
|
|
|
|
|
} else {
|
|
|
|
|
x &^= 0xf0
|
|
|
|
|
f(base + (i+1)*size)
|
|
|
|
|
if size > 2*ptrSize {
|
|
|
|
|
*subtractb(bitp, 1) = 0
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
*bitp = uint8(x)
|
|
|
|
|
bitp = subtractb(bitp, step+1)
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// TODO(rsc): Clean up the next two functions.
|
|
|
|
|
|
|
|
|
|
// heapBitsSetType records that the new allocation [x, x+size)
|
|
|
|
|
// holds in [x, x+dataSize) one or more values of type typ.
|
|
|
|
|
// (The number of values is given by dataSize / typ.size.)
|
|
|
|
|
// If dataSize < size, the fragment [x+dataSize, x+size) is
|
|
|
|
|
// recorded as non-pointer data.
|
runtime: optimize heapBitsSetType
For the conversion of the heap bitmap from 4-bit to 2-bit fields,
I replaced heapBitsSetType with the dumbest thing that could possibly work:
two atomic operations (atomicand8+atomicor8) per 2-bit field.
This CL replaces that code with a proper implementation that
avoids the atomics whenever possible. Benchmarks vs base CL
(before the conversion to 2-bit heap bitmap) and vs Go 1.4 below.
Compared to Go 1.4, SetTypePtr (a 1-pointer allocation)
is 10ns slower because a race against the concurrent GC requires the
use of an atomicor8 that used to be an ordinary write. This slowdown
was present even in the base CL.
Compared to both Go 1.4 and base, SetTypeNode8 (a 10-word allocation)
is 10ns slower because it too needs a new atomic, because with the
denser representation, the byte on the end of the allocation is now shared
with the object next to it; this was not true with the 4-bit representation.
Excluding these two (fundamental) slowdowns due to the use of atomics,
the new code is noticeably faster than both Go 1.4 and the base CL.
The next CL will reintroduce the ``typeDead'' optimization.
Stats are from 5 runs on a MacBookPro10,2 (late 2012 Core i5).
Compared to base CL (** = new atomic)
name old mean new mean delta
SetTypePtr 14.1ns × (0.99,1.02) 14.7ns × (0.93,1.10) ~ (p=0.175)
SetTypePtr8 18.4ns × (1.00,1.01) 18.6ns × (0.81,1.21) ~ (p=0.866)
SetTypePtr16 28.7ns × (1.00,1.00) 22.4ns × (0.90,1.27) -21.88% (p=0.015)
SetTypePtr32 52.3ns × (1.00,1.00) 33.8ns × (0.93,1.24) -35.37% (p=0.001)
SetTypePtr64 79.2ns × (1.00,1.00) 55.1ns × (1.00,1.01) -30.43% (p=0.000)
SetTypePtr126 118ns × (1.00,1.00) 100ns × (1.00,1.00) -15.97% (p=0.000)
SetTypePtr128 130ns × (0.92,1.19) 98ns × (1.00,1.00) -24.36% (p=0.008)
SetTypePtrSlice 726ns × (0.96,1.08) 760ns × (1.00,1.00) ~ (p=0.152)
SetTypeNode1 14.1ns × (0.94,1.15) 12.0ns × (1.00,1.01) -14.60% (p=0.020)
SetTypeNode1Slice 135ns × (0.96,1.07) 88ns × (1.00,1.00) -34.53% (p=0.000)
SetTypeNode8 20.9ns × (1.00,1.01) 32.6ns × (1.00,1.00) +55.37% (p=0.000) **
SetTypeNode8Slice 414ns × (0.99,1.02) 244ns × (1.00,1.00) -41.09% (p=0.000)
SetTypeNode64 80.0ns × (1.00,1.00) 57.4ns × (1.00,1.00) -28.23% (p=0.000)
SetTypeNode64Slice 2.15µs × (1.00,1.01) 1.56µs × (1.00,1.00) -27.43% (p=0.000)
SetTypeNode124 119ns × (0.99,1.00) 100ns × (1.00,1.00) -16.11% (p=0.000)
SetTypeNode124Slice 3.40µs × (1.00,1.00) 2.93µs × (1.00,1.00) -13.80% (p=0.000)
SetTypeNode126 120ns × (1.00,1.01) 98ns × (1.00,1.00) -18.19% (p=0.000)
SetTypeNode126Slice 3.53µs × (0.98,1.08) 3.02µs × (1.00,1.00) -14.49% (p=0.002)
SetTypeNode1024 726ns × (0.97,1.09) 740ns × (1.00,1.00) ~ (p=0.451)
SetTypeNode1024Slice 24.9µs × (0.89,1.37) 23.1µs × (1.00,1.00) ~ (p=0.476)
Compared to Go 1.4 (** = new atomic)
name old mean new mean delta
SetTypePtr 5.71ns × (0.89,1.19) 14.68ns × (0.93,1.10) +157.24% (p=0.000) **
SetTypePtr8 19.3ns × (0.96,1.10) 18.6ns × (0.81,1.21) ~ (p=0.638)
SetTypePtr16 30.7ns × (0.99,1.03) 22.4ns × (0.90,1.27) -26.88% (p=0.005)
SetTypePtr32 51.5ns × (1.00,1.00) 33.8ns × (0.93,1.24) -34.40% (p=0.001)
SetTypePtr64 83.6ns × (0.94,1.12) 55.1ns × (1.00,1.01) -34.12% (p=0.001)
SetTypePtr126 137ns × (0.87,1.26) 100ns × (1.00,1.00) -27.10% (p=0.028)
SetTypePtrSlice 865ns × (0.80,1.23) 760ns × (1.00,1.00) ~ (p=0.243)
SetTypeNode1 15.2ns × (0.88,1.12) 12.0ns × (1.00,1.01) -20.89% (p=0.014)
SetTypeNode1Slice 156ns × (0.93,1.16) 88ns × (1.00,1.00) -43.57% (p=0.001)
SetTypeNode8 23.8ns × (0.90,1.18) 32.6ns × (1.00,1.00) +36.76% (p=0.003) **
SetTypeNode8Slice 502ns × (0.92,1.10) 244ns × (1.00,1.00) -51.46% (p=0.000)
SetTypeNode64 85.6ns × (0.94,1.11) 57.4ns × (1.00,1.00) -32.89% (p=0.001)
SetTypeNode64Slice 2.36µs × (0.91,1.14) 1.56µs × (1.00,1.00) -33.96% (p=0.002)
SetTypeNode124 130ns × (0.91,1.12) 100ns × (1.00,1.00) -23.49% (p=0.004)
SetTypeNode124Slice 3.81µs × (0.90,1.22) 2.93µs × (1.00,1.00) -23.09% (p=0.025)
There are fewer benchmarks vs Go 1.4 because unrolling directly
into the heap bitmap is not yet implemented, so those would not
be meaningful comparisons.
These benchmarks were not present in Go 1.4 as distributed.
The backport to Go 1.4 is in github.com/rsc/go's go14bench branch,
commit 71d5ee5.
Change-Id: I95ed05a22bf484b0fc9efad549279e766c98d2b6
Reviewed-on: https://go-review.googlesource.com/9704
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-05-04 11:30:10 -04:00
|
|
|
// It is known that the type has pointers somewhere;
|
|
|
|
|
// malloc does not call heapBitsSetType when there are no pointers,
|
|
|
|
|
// because all free objects are marked as noscan during
|
|
|
|
|
// heapBitsSweepSpan.
|
|
|
|
|
// There can only be one allocation from a given span active at a time,
|
|
|
|
|
// so this code is not racing with other instances of itself,
|
|
|
|
|
// and we don't allocate from a span until it has been swept,
|
|
|
|
|
// so this code is not racing with heapBitsSweepSpan.
|
|
|
|
|
// It is, however, racing with the concurrent GC mark phase,
|
|
|
|
|
// which can be setting the mark bit in the leading 2-bit entry
|
|
|
|
|
// of an allocated block. The block we are modifying is not quite
|
|
|
|
|
// allocated yet, so the GC marker is not racing with updates to x's bits,
|
|
|
|
|
// but if the start or end of x shares a bitmap byte with an adjacent
|
|
|
|
|
// object, the GC marker is racing with updates to those objects' mark bits.
|
2015-01-16 14:43:38 -05:00
|
|
|
func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
|
|
|
|
|
// From here till marked label marking the object as allocated
|
|
|
|
|
// and storing type info in the GC bitmap.
|
|
|
|
|
h := heapBitsForAddr(x)
|
|
|
|
|
|
runtime: optimize heapBitsSetType
For the conversion of the heap bitmap from 4-bit to 2-bit fields,
I replaced heapBitsSetType with the dumbest thing that could possibly work:
two atomic operations (atomicand8+atomicor8) per 2-bit field.
This CL replaces that code with a proper implementation that
avoids the atomics whenever possible. Benchmarks vs base CL
(before the conversion to 2-bit heap bitmap) and vs Go 1.4 below.
Compared to Go 1.4, SetTypePtr (a 1-pointer allocation)
is 10ns slower because a race against the concurrent GC requires the
use of an atomicor8 that used to be an ordinary write. This slowdown
was present even in the base CL.
Compared to both Go 1.4 and base, SetTypeNode8 (a 10-word allocation)
is 10ns slower because it too needs a new atomic, because with the
denser representation, the byte on the end of the allocation is now shared
with the object next to it; this was not true with the 4-bit representation.
Excluding these two (fundamental) slowdowns due to the use of atomics,
the new code is noticeably faster than both Go 1.4 and the base CL.
The next CL will reintroduce the ``typeDead'' optimization.
Stats are from 5 runs on a MacBookPro10,2 (late 2012 Core i5).
Compared to base CL (** = new atomic)
name old mean new mean delta
SetTypePtr 14.1ns × (0.99,1.02) 14.7ns × (0.93,1.10) ~ (p=0.175)
SetTypePtr8 18.4ns × (1.00,1.01) 18.6ns × (0.81,1.21) ~ (p=0.866)
SetTypePtr16 28.7ns × (1.00,1.00) 22.4ns × (0.90,1.27) -21.88% (p=0.015)
SetTypePtr32 52.3ns × (1.00,1.00) 33.8ns × (0.93,1.24) -35.37% (p=0.001)
SetTypePtr64 79.2ns × (1.00,1.00) 55.1ns × (1.00,1.01) -30.43% (p=0.000)
SetTypePtr126 118ns × (1.00,1.00) 100ns × (1.00,1.00) -15.97% (p=0.000)
SetTypePtr128 130ns × (0.92,1.19) 98ns × (1.00,1.00) -24.36% (p=0.008)
SetTypePtrSlice 726ns × (0.96,1.08) 760ns × (1.00,1.00) ~ (p=0.152)
SetTypeNode1 14.1ns × (0.94,1.15) 12.0ns × (1.00,1.01) -14.60% (p=0.020)
SetTypeNode1Slice 135ns × (0.96,1.07) 88ns × (1.00,1.00) -34.53% (p=0.000)
SetTypeNode8 20.9ns × (1.00,1.01) 32.6ns × (1.00,1.00) +55.37% (p=0.000) **
SetTypeNode8Slice 414ns × (0.99,1.02) 244ns × (1.00,1.00) -41.09% (p=0.000)
SetTypeNode64 80.0ns × (1.00,1.00) 57.4ns × (1.00,1.00) -28.23% (p=0.000)
SetTypeNode64Slice 2.15µs × (1.00,1.01) 1.56µs × (1.00,1.00) -27.43% (p=0.000)
SetTypeNode124 119ns × (0.99,1.00) 100ns × (1.00,1.00) -16.11% (p=0.000)
SetTypeNode124Slice 3.40µs × (1.00,1.00) 2.93µs × (1.00,1.00) -13.80% (p=0.000)
SetTypeNode126 120ns × (1.00,1.01) 98ns × (1.00,1.00) -18.19% (p=0.000)
SetTypeNode126Slice 3.53µs × (0.98,1.08) 3.02µs × (1.00,1.00) -14.49% (p=0.002)
SetTypeNode1024 726ns × (0.97,1.09) 740ns × (1.00,1.00) ~ (p=0.451)
SetTypeNode1024Slice 24.9µs × (0.89,1.37) 23.1µs × (1.00,1.00) ~ (p=0.476)
Compared to Go 1.4 (** = new atomic)
name old mean new mean delta
SetTypePtr 5.71ns × (0.89,1.19) 14.68ns × (0.93,1.10) +157.24% (p=0.000) **
SetTypePtr8 19.3ns × (0.96,1.10) 18.6ns × (0.81,1.21) ~ (p=0.638)
SetTypePtr16 30.7ns × (0.99,1.03) 22.4ns × (0.90,1.27) -26.88% (p=0.005)
SetTypePtr32 51.5ns × (1.00,1.00) 33.8ns × (0.93,1.24) -34.40% (p=0.001)
SetTypePtr64 83.6ns × (0.94,1.12) 55.1ns × (1.00,1.01) -34.12% (p=0.001)
SetTypePtr126 137ns × (0.87,1.26) 100ns × (1.00,1.00) -27.10% (p=0.028)
SetTypePtrSlice 865ns × (0.80,1.23) 760ns × (1.00,1.00) ~ (p=0.243)
SetTypeNode1 15.2ns × (0.88,1.12) 12.0ns × (1.00,1.01) -20.89% (p=0.014)
SetTypeNode1Slice 156ns × (0.93,1.16) 88ns × (1.00,1.00) -43.57% (p=0.001)
SetTypeNode8 23.8ns × (0.90,1.18) 32.6ns × (1.00,1.00) +36.76% (p=0.003) **
SetTypeNode8Slice 502ns × (0.92,1.10) 244ns × (1.00,1.00) -51.46% (p=0.000)
SetTypeNode64 85.6ns × (0.94,1.11) 57.4ns × (1.00,1.00) -32.89% (p=0.001)
SetTypeNode64Slice 2.36µs × (0.91,1.14) 1.56µs × (1.00,1.00) -33.96% (p=0.002)
SetTypeNode124 130ns × (0.91,1.12) 100ns × (1.00,1.00) -23.49% (p=0.004)
SetTypeNode124Slice 3.81µs × (0.90,1.22) 2.93µs × (1.00,1.00) -23.09% (p=0.025)
There are fewer benchmarks vs Go 1.4 because unrolling directly
into the heap bitmap is not yet implemented, so those would not
be meaningful comparisons.
These benchmarks were not present in Go 1.4 as distributed.
The backport to Go 1.4 is in github.com/rsc/go's go14bench branch,
commit 71d5ee5.
Change-Id: I95ed05a22bf484b0fc9efad549279e766c98d2b6
Reviewed-on: https://go-review.googlesource.com/9704
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-05-04 11:30:10 -04:00
|
|
|
// size is always dataSize rounded up to the next malloc size class,
|
|
|
|
|
// except in the case of allocating a defer block, in which case
|
|
|
|
|
// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
|
|
|
|
|
// arbitrarily larger.
|
|
|
|
|
//
|
|
|
|
|
// The checks for size == ptrSize and size == 2*ptrSize can therefore
|
|
|
|
|
// assume that dataSize == size without checking it explicitly.
|
|
|
|
|
|
2015-01-16 14:43:38 -05:00
|
|
|
if size == ptrSize {
|
|
|
|
|
// It's one word and it has pointers, it must be a pointer.
|
|
|
|
|
// The bitmap byte is shared with the one-word object
|
|
|
|
|
// next to it, and concurrent GC might be marking that
|
|
|
|
|
// object, so we must use an atomic update.
|
2015-04-28 00:28:47 -04:00
|
|
|
// TODO(rsc): It may make sense to set all the pointer bits
|
|
|
|
|
// when initializing the span, and then the atomicor8 here
|
|
|
|
|
// goes away - heapBitsSetType would be a no-op
|
|
|
|
|
// in that case.
|
2015-05-04 10:19:24 -04:00
|
|
|
atomicor8(h.bitp, bitPointer<<h.shift)
|
2015-01-16 14:43:38 -05:00
|
|
|
return
|
|
|
|
|
}
|
runtime: optimize heapBitsSetType
For the conversion of the heap bitmap from 4-bit to 2-bit fields,
I replaced heapBitsSetType with the dumbest thing that could possibly work:
two atomic operations (atomicand8+atomicor8) per 2-bit field.
This CL replaces that code with a proper implementation that
avoids the atomics whenever possible. Benchmarks vs base CL
(before the conversion to 2-bit heap bitmap) and vs Go 1.4 below.
Compared to Go 1.4, SetTypePtr (a 1-pointer allocation)
is 10ns slower because a race against the concurrent GC requires the
use of an atomicor8 that used to be an ordinary write. This slowdown
was present even in the base CL.
Compared to both Go 1.4 and base, SetTypeNode8 (a 10-word allocation)
is 10ns slower because it too needs a new atomic, because with the
denser representation, the byte on the end of the allocation is now shared
with the object next to it; this was not true with the 4-bit representation.
Excluding these two (fundamental) slowdowns due to the use of atomics,
the new code is noticeably faster than both Go 1.4 and the base CL.
The next CL will reintroduce the ``typeDead'' optimization.
Stats are from 5 runs on a MacBookPro10,2 (late 2012 Core i5).
Compared to base CL (** = new atomic)
name old mean new mean delta
SetTypePtr 14.1ns × (0.99,1.02) 14.7ns × (0.93,1.10) ~ (p=0.175)
SetTypePtr8 18.4ns × (1.00,1.01) 18.6ns × (0.81,1.21) ~ (p=0.866)
SetTypePtr16 28.7ns × (1.00,1.00) 22.4ns × (0.90,1.27) -21.88% (p=0.015)
SetTypePtr32 52.3ns × (1.00,1.00) 33.8ns × (0.93,1.24) -35.37% (p=0.001)
SetTypePtr64 79.2ns × (1.00,1.00) 55.1ns × (1.00,1.01) -30.43% (p=0.000)
SetTypePtr126 118ns × (1.00,1.00) 100ns × (1.00,1.00) -15.97% (p=0.000)
SetTypePtr128 130ns × (0.92,1.19) 98ns × (1.00,1.00) -24.36% (p=0.008)
SetTypePtrSlice 726ns × (0.96,1.08) 760ns × (1.00,1.00) ~ (p=0.152)
SetTypeNode1 14.1ns × (0.94,1.15) 12.0ns × (1.00,1.01) -14.60% (p=0.020)
SetTypeNode1Slice 135ns × (0.96,1.07) 88ns × (1.00,1.00) -34.53% (p=0.000)
SetTypeNode8 20.9ns × (1.00,1.01) 32.6ns × (1.00,1.00) +55.37% (p=0.000) **
SetTypeNode8Slice 414ns × (0.99,1.02) 244ns × (1.00,1.00) -41.09% (p=0.000)
SetTypeNode64 80.0ns × (1.00,1.00) 57.4ns × (1.00,1.00) -28.23% (p=0.000)
SetTypeNode64Slice 2.15µs × (1.00,1.01) 1.56µs × (1.00,1.00) -27.43% (p=0.000)
SetTypeNode124 119ns × (0.99,1.00) 100ns × (1.00,1.00) -16.11% (p=0.000)
SetTypeNode124Slice 3.40µs × (1.00,1.00) 2.93µs × (1.00,1.00) -13.80% (p=0.000)
SetTypeNode126 120ns × (1.00,1.01) 98ns × (1.00,1.00) -18.19% (p=0.000)
SetTypeNode126Slice 3.53µs × (0.98,1.08) 3.02µs × (1.00,1.00) -14.49% (p=0.002)
SetTypeNode1024 726ns × (0.97,1.09) 740ns × (1.00,1.00) ~ (p=0.451)
SetTypeNode1024Slice 24.9µs × (0.89,1.37) 23.1µs × (1.00,1.00) ~ (p=0.476)
Compared to Go 1.4 (** = new atomic)
name old mean new mean delta
SetTypePtr 5.71ns × (0.89,1.19) 14.68ns × (0.93,1.10) +157.24% (p=0.000) **
SetTypePtr8 19.3ns × (0.96,1.10) 18.6ns × (0.81,1.21) ~ (p=0.638)
SetTypePtr16 30.7ns × (0.99,1.03) 22.4ns × (0.90,1.27) -26.88% (p=0.005)
SetTypePtr32 51.5ns × (1.00,1.00) 33.8ns × (0.93,1.24) -34.40% (p=0.001)
SetTypePtr64 83.6ns × (0.94,1.12) 55.1ns × (1.00,1.01) -34.12% (p=0.001)
SetTypePtr126 137ns × (0.87,1.26) 100ns × (1.00,1.00) -27.10% (p=0.028)
SetTypePtrSlice 865ns × (0.80,1.23) 760ns × (1.00,1.00) ~ (p=0.243)
SetTypeNode1 15.2ns × (0.88,1.12) 12.0ns × (1.00,1.01) -20.89% (p=0.014)
SetTypeNode1Slice 156ns × (0.93,1.16) 88ns × (1.00,1.00) -43.57% (p=0.001)
SetTypeNode8 23.8ns × (0.90,1.18) 32.6ns × (1.00,1.00) +36.76% (p=0.003) **
SetTypeNode8Slice 502ns × (0.92,1.10) 244ns × (1.00,1.00) -51.46% (p=0.000)
SetTypeNode64 85.6ns × (0.94,1.11) 57.4ns × (1.00,1.00) -32.89% (p=0.001)
SetTypeNode64Slice 2.36µs × (0.91,1.14) 1.56µs × (1.00,1.00) -33.96% (p=0.002)
SetTypeNode124 130ns × (0.91,1.12) 100ns × (1.00,1.00) -23.49% (p=0.004)
SetTypeNode124Slice 3.81µs × (0.90,1.22) 2.93µs × (1.00,1.00) -23.09% (p=0.025)
There are fewer benchmarks vs Go 1.4 because unrolling directly
into the heap bitmap is not yet implemented, so those would not
be meaningful comparisons.
These benchmarks were not present in Go 1.4 as distributed.
The backport to Go 1.4 is in github.com/rsc/go's go14bench branch,
commit 71d5ee5.
Change-Id: I95ed05a22bf484b0fc9efad549279e766c98d2b6
Reviewed-on: https://go-review.googlesource.com/9704
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-05-04 11:30:10 -04:00
|
|
|
|
|
|
|
|
ptrmask := (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
|
2015-01-16 14:43:38 -05:00
|
|
|
if typ.kind&kindGCProg != 0 {
|
|
|
|
|
nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
|
2015-04-28 00:28:47 -04:00
|
|
|
masksize := (nptr + 7) / 8
|
|
|
|
|
masksize++ // unroll flag in the beginning
|
2015-01-16 14:43:38 -05:00
|
|
|
if masksize > maxGCMask && typ.gc[1] != 0 {
|
|
|
|
|
// write barriers have not been updated to deal with this case yet.
|
|
|
|
|
throw("maxGCMask too small for now")
|
|
|
|
|
// If the mask is too large, unroll the program directly
|
|
|
|
|
// into the GC bitmap. It's 7 times slower than copying
|
|
|
|
|
// from the pre-unrolled mask, but saves 1/16 of type size
|
|
|
|
|
// memory for the mask.
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
unrollgcproginplace_m(unsafe.Pointer(x), typ, size, dataSize)
|
|
|
|
|
})
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
// Check whether the program is already unrolled
|
|
|
|
|
// by checking if the unroll flag byte is set
|
|
|
|
|
maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask)))
|
|
|
|
|
if *(*uint8)(unsafe.Pointer(&maskword)) == 0 {
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
unrollgcprog_m(typ)
|
|
|
|
|
})
|
|
|
|
|
}
|
runtime: optimize heapBitsSetType
For the conversion of the heap bitmap from 4-bit to 2-bit fields,
I replaced heapBitsSetType with the dumbest thing that could possibly work:
two atomic operations (atomicand8+atomicor8) per 2-bit field.
This CL replaces that code with a proper implementation that
avoids the atomics whenever possible. Benchmarks vs base CL
(before the conversion to 2-bit heap bitmap) and vs Go 1.4 below.
Compared to Go 1.4, SetTypePtr (a 1-pointer allocation)
is 10ns slower because a race against the concurrent GC requires the
use of an atomicor8 that used to be an ordinary write. This slowdown
was present even in the base CL.
Compared to both Go 1.4 and base, SetTypeNode8 (a 10-word allocation)
is 10ns slower because it too needs a new atomic, because with the
denser representation, the byte on the end of the allocation is now shared
with the object next to it; this was not true with the 4-bit representation.
Excluding these two (fundamental) slowdowns due to the use of atomics,
the new code is noticeably faster than both Go 1.4 and the base CL.
The next CL will reintroduce the ``typeDead'' optimization.
Stats are from 5 runs on a MacBookPro10,2 (late 2012 Core i5).
Compared to base CL (** = new atomic)
name old mean new mean delta
SetTypePtr 14.1ns × (0.99,1.02) 14.7ns × (0.93,1.10) ~ (p=0.175)
SetTypePtr8 18.4ns × (1.00,1.01) 18.6ns × (0.81,1.21) ~ (p=0.866)
SetTypePtr16 28.7ns × (1.00,1.00) 22.4ns × (0.90,1.27) -21.88% (p=0.015)
SetTypePtr32 52.3ns × (1.00,1.00) 33.8ns × (0.93,1.24) -35.37% (p=0.001)
SetTypePtr64 79.2ns × (1.00,1.00) 55.1ns × (1.00,1.01) -30.43% (p=0.000)
SetTypePtr126 118ns × (1.00,1.00) 100ns × (1.00,1.00) -15.97% (p=0.000)
SetTypePtr128 130ns × (0.92,1.19) 98ns × (1.00,1.00) -24.36% (p=0.008)
SetTypePtrSlice 726ns × (0.96,1.08) 760ns × (1.00,1.00) ~ (p=0.152)
SetTypeNode1 14.1ns × (0.94,1.15) 12.0ns × (1.00,1.01) -14.60% (p=0.020)
SetTypeNode1Slice 135ns × (0.96,1.07) 88ns × (1.00,1.00) -34.53% (p=0.000)
SetTypeNode8 20.9ns × (1.00,1.01) 32.6ns × (1.00,1.00) +55.37% (p=0.000) **
SetTypeNode8Slice 414ns × (0.99,1.02) 244ns × (1.00,1.00) -41.09% (p=0.000)
SetTypeNode64 80.0ns × (1.00,1.00) 57.4ns × (1.00,1.00) -28.23% (p=0.000)
SetTypeNode64Slice 2.15µs × (1.00,1.01) 1.56µs × (1.00,1.00) -27.43% (p=0.000)
SetTypeNode124 119ns × (0.99,1.00) 100ns × (1.00,1.00) -16.11% (p=0.000)
SetTypeNode124Slice 3.40µs × (1.00,1.00) 2.93µs × (1.00,1.00) -13.80% (p=0.000)
SetTypeNode126 120ns × (1.00,1.01) 98ns × (1.00,1.00) -18.19% (p=0.000)
SetTypeNode126Slice 3.53µs × (0.98,1.08) 3.02µs × (1.00,1.00) -14.49% (p=0.002)
SetTypeNode1024 726ns × (0.97,1.09) 740ns × (1.00,1.00) ~ (p=0.451)
SetTypeNode1024Slice 24.9µs × (0.89,1.37) 23.1µs × (1.00,1.00) ~ (p=0.476)
Compared to Go 1.4 (** = new atomic)
name old mean new mean delta
SetTypePtr 5.71ns × (0.89,1.19) 14.68ns × (0.93,1.10) +157.24% (p=0.000) **
SetTypePtr8 19.3ns × (0.96,1.10) 18.6ns × (0.81,1.21) ~ (p=0.638)
SetTypePtr16 30.7ns × (0.99,1.03) 22.4ns × (0.90,1.27) -26.88% (p=0.005)
SetTypePtr32 51.5ns × (1.00,1.00) 33.8ns × (0.93,1.24) -34.40% (p=0.001)
SetTypePtr64 83.6ns × (0.94,1.12) 55.1ns × (1.00,1.01) -34.12% (p=0.001)
SetTypePtr126 137ns × (0.87,1.26) 100ns × (1.00,1.00) -27.10% (p=0.028)
SetTypePtrSlice 865ns × (0.80,1.23) 760ns × (1.00,1.00) ~ (p=0.243)
SetTypeNode1 15.2ns × (0.88,1.12) 12.0ns × (1.00,1.01) -20.89% (p=0.014)
SetTypeNode1Slice 156ns × (0.93,1.16) 88ns × (1.00,1.00) -43.57% (p=0.001)
SetTypeNode8 23.8ns × (0.90,1.18) 32.6ns × (1.00,1.00) +36.76% (p=0.003) **
SetTypeNode8Slice 502ns × (0.92,1.10) 244ns × (1.00,1.00) -51.46% (p=0.000)
SetTypeNode64 85.6ns × (0.94,1.11) 57.4ns × (1.00,1.00) -32.89% (p=0.001)
SetTypeNode64Slice 2.36µs × (0.91,1.14) 1.56µs × (1.00,1.00) -33.96% (p=0.002)
SetTypeNode124 130ns × (0.91,1.12) 100ns × (1.00,1.00) -23.49% (p=0.004)
SetTypeNode124Slice 3.81µs × (0.90,1.22) 2.93µs × (1.00,1.00) -23.09% (p=0.025)
There are fewer benchmarks vs Go 1.4 because unrolling directly
into the heap bitmap is not yet implemented, so those would not
be meaningful comparisons.
These benchmarks were not present in Go 1.4 as distributed.
The backport to Go 1.4 is in github.com/rsc/go's go14bench branch,
commit 71d5ee5.
Change-Id: I95ed05a22bf484b0fc9efad549279e766c98d2b6
Reviewed-on: https://go-review.googlesource.com/9704
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-05-04 11:30:10 -04:00
|
|
|
ptrmask = addb(ptrmask, 1) // skip the unroll flag byte
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Heap bitmap bits for 2-word object are only 4 bits,
|
|
|
|
|
// so also shared with objects next to it; use atomic updates.
|
|
|
|
|
// This is called out as a special case primarily for 32-bit systems,
|
|
|
|
|
// so that on 32-bit systems the code below can assume all objects
|
|
|
|
|
// are 4-word aligned (because they're all 16-byte aligned).
|
|
|
|
|
if size == 2*ptrSize {
|
|
|
|
|
if typ.size == ptrSize {
|
|
|
|
|
// 2-element slice of pointer.
|
|
|
|
|
atomicor8(h.bitp, (bitPointer|bitPointer<<heapBitsWidth)<<h.shift)
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
// Otherwise typ.size must be 2*ptrSize, and typ.kind&kindGCProg == 0.
|
|
|
|
|
b := uint32(*ptrmask)
|
|
|
|
|
hb := b&1 | (b&2)<<(heapBitsWidth-1)
|
|
|
|
|
atomicor8(h.bitp, uint8(hb<<h.shift))
|
|
|
|
|
return
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
2015-04-28 00:28:47 -04:00
|
|
|
|
2015-05-04 10:19:24 -04:00
|
|
|
// Copy from 1-bit ptrmask into 2-bit bitmap.
|
runtime: optimize heapBitsSetType
For the conversion of the heap bitmap from 4-bit to 2-bit fields,
I replaced heapBitsSetType with the dumbest thing that could possibly work:
two atomic operations (atomicand8+atomicor8) per 2-bit field.
This CL replaces that code with a proper implementation that
avoids the atomics whenever possible. Benchmarks vs base CL
(before the conversion to 2-bit heap bitmap) and vs Go 1.4 below.
Compared to Go 1.4, SetTypePtr (a 1-pointer allocation)
is 10ns slower because a race against the concurrent GC requires the
use of an atomicor8 that used to be an ordinary write. This slowdown
was present even in the base CL.
Compared to both Go 1.4 and base, SetTypeNode8 (a 10-word allocation)
is 10ns slower because it too needs a new atomic, because with the
denser representation, the byte on the end of the allocation is now shared
with the object next to it; this was not true with the 4-bit representation.
Excluding these two (fundamental) slowdowns due to the use of atomics,
the new code is noticeably faster than both Go 1.4 and the base CL.
The next CL will reintroduce the ``typeDead'' optimization.
Stats are from 5 runs on a MacBookPro10,2 (late 2012 Core i5).
Compared to base CL (** = new atomic)
name old mean new mean delta
SetTypePtr 14.1ns × (0.99,1.02) 14.7ns × (0.93,1.10) ~ (p=0.175)
SetTypePtr8 18.4ns × (1.00,1.01) 18.6ns × (0.81,1.21) ~ (p=0.866)
SetTypePtr16 28.7ns × (1.00,1.00) 22.4ns × (0.90,1.27) -21.88% (p=0.015)
SetTypePtr32 52.3ns × (1.00,1.00) 33.8ns × (0.93,1.24) -35.37% (p=0.001)
SetTypePtr64 79.2ns × (1.00,1.00) 55.1ns × (1.00,1.01) -30.43% (p=0.000)
SetTypePtr126 118ns × (1.00,1.00) 100ns × (1.00,1.00) -15.97% (p=0.000)
SetTypePtr128 130ns × (0.92,1.19) 98ns × (1.00,1.00) -24.36% (p=0.008)
SetTypePtrSlice 726ns × (0.96,1.08) 760ns × (1.00,1.00) ~ (p=0.152)
SetTypeNode1 14.1ns × (0.94,1.15) 12.0ns × (1.00,1.01) -14.60% (p=0.020)
SetTypeNode1Slice 135ns × (0.96,1.07) 88ns × (1.00,1.00) -34.53% (p=0.000)
SetTypeNode8 20.9ns × (1.00,1.01) 32.6ns × (1.00,1.00) +55.37% (p=0.000) **
SetTypeNode8Slice 414ns × (0.99,1.02) 244ns × (1.00,1.00) -41.09% (p=0.000)
SetTypeNode64 80.0ns × (1.00,1.00) 57.4ns × (1.00,1.00) -28.23% (p=0.000)
SetTypeNode64Slice 2.15µs × (1.00,1.01) 1.56µs × (1.00,1.00) -27.43% (p=0.000)
SetTypeNode124 119ns × (0.99,1.00) 100ns × (1.00,1.00) -16.11% (p=0.000)
SetTypeNode124Slice 3.40µs × (1.00,1.00) 2.93µs × (1.00,1.00) -13.80% (p=0.000)
SetTypeNode126 120ns × (1.00,1.01) 98ns × (1.00,1.00) -18.19% (p=0.000)
SetTypeNode126Slice 3.53µs × (0.98,1.08) 3.02µs × (1.00,1.00) -14.49% (p=0.002)
SetTypeNode1024 726ns × (0.97,1.09) 740ns × (1.00,1.00) ~ (p=0.451)
SetTypeNode1024Slice 24.9µs × (0.89,1.37) 23.1µs × (1.00,1.00) ~ (p=0.476)
Compared to Go 1.4 (** = new atomic)
name old mean new mean delta
SetTypePtr 5.71ns × (0.89,1.19) 14.68ns × (0.93,1.10) +157.24% (p=0.000) **
SetTypePtr8 19.3ns × (0.96,1.10) 18.6ns × (0.81,1.21) ~ (p=0.638)
SetTypePtr16 30.7ns × (0.99,1.03) 22.4ns × (0.90,1.27) -26.88% (p=0.005)
SetTypePtr32 51.5ns × (1.00,1.00) 33.8ns × (0.93,1.24) -34.40% (p=0.001)
SetTypePtr64 83.6ns × (0.94,1.12) 55.1ns × (1.00,1.01) -34.12% (p=0.001)
SetTypePtr126 137ns × (0.87,1.26) 100ns × (1.00,1.00) -27.10% (p=0.028)
SetTypePtrSlice 865ns × (0.80,1.23) 760ns × (1.00,1.00) ~ (p=0.243)
SetTypeNode1 15.2ns × (0.88,1.12) 12.0ns × (1.00,1.01) -20.89% (p=0.014)
SetTypeNode1Slice 156ns × (0.93,1.16) 88ns × (1.00,1.00) -43.57% (p=0.001)
SetTypeNode8 23.8ns × (0.90,1.18) 32.6ns × (1.00,1.00) +36.76% (p=0.003) **
SetTypeNode8Slice 502ns × (0.92,1.10) 244ns × (1.00,1.00) -51.46% (p=0.000)
SetTypeNode64 85.6ns × (0.94,1.11) 57.4ns × (1.00,1.00) -32.89% (p=0.001)
SetTypeNode64Slice 2.36µs × (0.91,1.14) 1.56µs × (1.00,1.00) -33.96% (p=0.002)
SetTypeNode124 130ns × (0.91,1.12) 100ns × (1.00,1.00) -23.49% (p=0.004)
SetTypeNode124Slice 3.81µs × (0.90,1.22) 2.93µs × (1.00,1.00) -23.09% (p=0.025)
There are fewer benchmarks vs Go 1.4 because unrolling directly
into the heap bitmap is not yet implemented, so those would not
be meaningful comparisons.
These benchmarks were not present in Go 1.4 as distributed.
The backport to Go 1.4 is in github.com/rsc/go's go14bench branch,
commit 71d5ee5.
Change-Id: I95ed05a22bf484b0fc9efad549279e766c98d2b6
Reviewed-on: https://go-review.googlesource.com/9704
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-05-04 11:30:10 -04:00
|
|
|
// The basic approach is to use a single uintptr as a bit buffer,
|
|
|
|
|
// alternating between reloading the buffer and writing bitmap bytes.
|
|
|
|
|
// In general, one load can supply two bitmap byte writes.
|
|
|
|
|
// This is a lot of lines of code, but it compiles into relatively few
|
|
|
|
|
// machine instructions.
|
|
|
|
|
var (
|
|
|
|
|
p *byte // last ptrmask byte read
|
|
|
|
|
b uintptr // ptrmask bits already loaded
|
|
|
|
|
nb uint32 // number of bits in b at next read
|
|
|
|
|
endp *byte // final ptrmask byte to read (then repeat)
|
|
|
|
|
endnb uint32 // number of valid bits in *endp
|
|
|
|
|
pbits uintptr // alternate source of bits
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
p = ptrmask
|
|
|
|
|
if typ.size < dataSize {
|
|
|
|
|
// Filling in bits for an array of typ.
|
|
|
|
|
// Set up for repetition of ptrmask during main loop.
|
|
|
|
|
if typ.size/ptrSize+7 <= ptrSize*8 {
|
|
|
|
|
// Entire ptrmask + a leftover fragment fits in uintptr.
|
|
|
|
|
// Load into pbits and never read from ptrmask again.
|
|
|
|
|
// This is especially important when the ptrmask has
|
|
|
|
|
// fewer than 8 bits in it; otherwise the reload in the middle
|
|
|
|
|
// of the Phase 2 loop would itself need to loop to gather
|
|
|
|
|
// at least 8 bits.
|
|
|
|
|
|
|
|
|
|
// Accumulate ptrmask into b.
|
|
|
|
|
nb = uint32(typ.size / ptrSize)
|
|
|
|
|
for i := uint32(0); i < nb; i += 8 {
|
|
|
|
|
b |= uintptr(*p) << i
|
|
|
|
|
p = addb(p, 1)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Replicate ptrmask to fill entire pbits uintptr.
|
|
|
|
|
// Doubling and truncating is fewer steps than
|
|
|
|
|
// iterating by nb each time. (nb could be 1.)
|
|
|
|
|
pbits = b
|
|
|
|
|
endnb = nb
|
|
|
|
|
for endnb <= ptrSize*8 {
|
|
|
|
|
pbits |= pbits << endnb
|
|
|
|
|
endnb += endnb
|
|
|
|
|
}
|
|
|
|
|
// Truncate to a multiple of original ptrmask.
|
|
|
|
|
endnb = (ptrSize*8 - 7) / nb * nb
|
|
|
|
|
pbits &= 1<<endnb - 1
|
|
|
|
|
b = pbits
|
|
|
|
|
nb = endnb
|
|
|
|
|
|
|
|
|
|
// Clear p and endp as sentinel for using pbits.
|
|
|
|
|
// Checked during Phase 2 loop.
|
|
|
|
|
p = nil
|
|
|
|
|
endp = nil
|
|
|
|
|
} else {
|
|
|
|
|
// Ptrmask is larger. Read it multiple times.
|
|
|
|
|
endp = addb(ptrmask, (typ.size/ptrSize+7)/8-1)
|
|
|
|
|
endnb = uint32(typ.size/ptrSize) % 8
|
|
|
|
|
if endnb == 0 {
|
|
|
|
|
endnb = 8
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if p != nil {
|
|
|
|
|
b = uintptr(*p)
|
|
|
|
|
p = addb(p, 1)
|
|
|
|
|
nb = 8
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
w := uintptr(0) // number of words processed
|
|
|
|
|
nw := dataSize / ptrSize // number of words to process
|
|
|
|
|
|
|
|
|
|
hbitp := h.bitp // next heap bitmap byte to write
|
|
|
|
|
var hb uintptr // bits being preapred for *h.bitp
|
|
|
|
|
|
|
|
|
|
// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==4).
|
|
|
|
|
// The leading byte is special because it contains the bits for words 0 and 1,
|
|
|
|
|
// which do not have the marked bits set.
|
|
|
|
|
// The leading half-byte is special because it's a half a byte and must be
|
|
|
|
|
// manipulated atomically.
|
|
|
|
|
switch h.shift {
|
|
|
|
|
default:
|
|
|
|
|
throw("heapBitsSetType: unexpected shift")
|
|
|
|
|
|
|
|
|
|
case 0:
|
|
|
|
|
// Ptrmask and heap bitmap are aligned.
|
|
|
|
|
// Handle first byte of bitmap specially.
|
|
|
|
|
// The first byte we write out contains the first two words of the object.
|
|
|
|
|
// In those words, the mark bits are mark and checkmark, respectively,
|
|
|
|
|
// and must not be set. In all following words, we want to set the mark bit
|
|
|
|
|
// as a signal that the object continues to the next 2-bit entry in the bitmap.
|
|
|
|
|
hb = b&1 | (b&2)<<(heapBitsWidth-1) | (b&4)<<(2*heapBitsWidth-2) | (b&8)<<(3*heapBitsWidth-3)
|
|
|
|
|
hb |= bitMarked<<(2*heapBitsWidth) | bitMarked<<(3*heapBitsWidth)
|
|
|
|
|
if w += 4; w >= nw {
|
|
|
|
|
goto Phase3
|
|
|
|
|
}
|
|
|
|
|
*hbitp = uint8(hb)
|
|
|
|
|
hbitp = subtractb(hbitp, 1)
|
|
|
|
|
b >>= 4
|
|
|
|
|
nb -= 4
|
|
|
|
|
|
|
|
|
|
case 4:
|
|
|
|
|
// Ptrmask and heap bitmap are misaligned.
|
|
|
|
|
// The bits for the first two words are in a byte shared with another object
|
|
|
|
|
// and must be updated atomically.
|
|
|
|
|
// NOTE(rsc): The atomic here may not be necessary.
|
|
|
|
|
// We took care of 1-word and 2-word objects above,
|
|
|
|
|
// so this is at least a 6-word object, so our start bits
|
|
|
|
|
// are shared only with the type bits of another object,
|
|
|
|
|
// not with its mark bit. Since there is only one allocation
|
|
|
|
|
// from a given span at a time, we should be able to set
|
|
|
|
|
// these bits non-atomically. Not worth the risk right now.
|
|
|
|
|
hb = (b&1)<<4 | (b&2)<<(4+heapBitsWidth-1) // bits being prepared for *h.bitp
|
|
|
|
|
b >>= 2
|
|
|
|
|
nb -= 2
|
|
|
|
|
// Note: no bitMarked in hb because the first two words don't get markers from us.
|
|
|
|
|
atomicor8(hbitp, uint8(hb))
|
|
|
|
|
hbitp = subtractb(hbitp, 1)
|
|
|
|
|
|
|
|
|
|
// Expand 8-bit chunks of ptrmask into pairs of heap bitmap bytes.
|
|
|
|
|
// We know the object size is a multiple of 2 words but not 4, so the
|
|
|
|
|
// object size minus the 2 words we just handled is a multiple of 4,
|
|
|
|
|
// so we can use non-atomic writes to the heap bitmap for the
|
|
|
|
|
// rest of this code, even for the final fragment or a trailing dead marker byte.
|
|
|
|
|
|
|
|
|
|
// Loop prepares bits for final byte but stops before writing them,
|
|
|
|
|
// so that in the case where we need to write only part of a byte,
|
|
|
|
|
// the code below the loop can truncate the bitMarked.
|
|
|
|
|
w += 2
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
|
|
|
|
|
// The loop computes the bits for that last write but does not execute the write;
|
|
|
|
|
// it leaves the bits in hb for processing by phase 3.
|
|
|
|
|
// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
|
|
|
|
|
// use in the first half of the loop right now, and then we only adjust nb explicitly
|
|
|
|
|
// if the 8 bits used by each iteration aren't balanced by 8 bits loaded mid-loop.
|
|
|
|
|
nb -= 4
|
|
|
|
|
for {
|
|
|
|
|
// Emit bitmap byte.
|
|
|
|
|
// b has at least nb+4 bits, with one exception:
|
|
|
|
|
// if w+4 >= nw, then b has only nw-w bits,
|
|
|
|
|
// but we'll stop at the break and then truncate
|
|
|
|
|
// appropriately in Phase 3.
|
|
|
|
|
hb = b&1 | (b&2)<<(heapBitsWidth-1) | (b&4)<<(2*heapBitsWidth-2) | (b&8)<<(3*heapBitsWidth-3)
|
|
|
|
|
hb |= bitMarked | bitMarked<<heapBitsWidth | bitMarked<<(2*heapBitsWidth) | bitMarked<<(3*heapBitsWidth)
|
|
|
|
|
if w += 4; w >= nw {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
*hbitp = uint8(hb)
|
|
|
|
|
hbitp = subtractb(hbitp, 1)
|
|
|
|
|
b >>= 4
|
|
|
|
|
|
|
|
|
|
// Load more bits. b has nb right now.
|
|
|
|
|
if p != endp {
|
|
|
|
|
// Fast path: keep reading from ptrmask.
|
|
|
|
|
// nb unmodified: we just loaded 8 bits,
|
|
|
|
|
// and the next iteration will consume 8 bits,
|
|
|
|
|
// leaving us with the same nb the next time we're here.
|
|
|
|
|
b |= uintptr(*p) << nb
|
|
|
|
|
p = addb(p, 1)
|
|
|
|
|
} else if p == nil {
|
|
|
|
|
// Almost as fast path: track bit count and refill from pbits.
|
|
|
|
|
// For short repetitions.
|
|
|
|
|
if nb < 8 {
|
|
|
|
|
b |= pbits << nb
|
|
|
|
|
nb += endnb
|
|
|
|
|
}
|
|
|
|
|
nb -= 8 // for next iteration
|
|
|
|
|
} else {
|
|
|
|
|
// Slow path: reached end of ptrmask.
|
|
|
|
|
// Process final partial byte and rewind to start.
|
|
|
|
|
b |= uintptr(*p) << nb
|
|
|
|
|
nb += endnb
|
|
|
|
|
if nb < 8 {
|
|
|
|
|
b |= uintptr(*ptrmask) << nb
|
|
|
|
|
p = addb(ptrmask, 1)
|
|
|
|
|
} else {
|
|
|
|
|
nb -= 8
|
|
|
|
|
p = ptrmask
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Emit bitmap byte.
|
|
|
|
|
hb = b&1 | (b&2)<<(heapBitsWidth-1) | (b&4)<<(2*heapBitsWidth-2) | (b&8)<<(3*heapBitsWidth-3)
|
|
|
|
|
hb |= bitMarked | bitMarked<<heapBitsWidth | bitMarked<<(2*heapBitsWidth) | bitMarked<<(3*heapBitsWidth)
|
|
|
|
|
if w += 4; w >= nw {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
*hbitp = uint8(hb)
|
|
|
|
|
hbitp = subtractb(hbitp, 1)
|
|
|
|
|
b >>= 4
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Phase3:
|
|
|
|
|
// Phase 3: Special case for final byte or half-byte describing final fragment of data.
|
|
|
|
|
// If there are not four data words for this final fragment, we must clear the mark bits
|
|
|
|
|
// in the 2-bit entries for the missing words. Clearing them creates a ``dead'' entry
|
|
|
|
|
// to tell the GC scan to stop scanning this object early.
|
|
|
|
|
// If there are four words in the final fragment but there is more data,
|
|
|
|
|
// then we must write a ``dead'' entry to the next bitmap byte.
|
|
|
|
|
if frag := (nw - w) % 4; frag != 0 {
|
|
|
|
|
// Data ends at least one word early.
|
|
|
|
|
hb &= 1<<(heapBitsWidth*frag) - 1
|
|
|
|
|
if w*ptrSize <= size {
|
|
|
|
|
// We own the whole byte and get the dead marker for free.
|
|
|
|
|
*hbitp = uint8(hb)
|
|
|
|
|
} else {
|
|
|
|
|
// We only own the bottom half of the byte.
|
|
|
|
|
// If frag == 1, we get a dead marker for free.
|
|
|
|
|
// If frag == 2, no dead marker needed (we've reached the end of the object).
|
|
|
|
|
atomicand8(hbitp, 0xf0)
|
|
|
|
|
atomicor8(hbitp, uint8(hb))
|
2015-05-04 10:19:24 -04:00
|
|
|
}
|
runtime: optimize heapBitsSetType
For the conversion of the heap bitmap from 4-bit to 2-bit fields,
I replaced heapBitsSetType with the dumbest thing that could possibly work:
two atomic operations (atomicand8+atomicor8) per 2-bit field.
This CL replaces that code with a proper implementation that
avoids the atomics whenever possible. Benchmarks vs base CL
(before the conversion to 2-bit heap bitmap) and vs Go 1.4 below.
Compared to Go 1.4, SetTypePtr (a 1-pointer allocation)
is 10ns slower because a race against the concurrent GC requires the
use of an atomicor8 that used to be an ordinary write. This slowdown
was present even in the base CL.
Compared to both Go 1.4 and base, SetTypeNode8 (a 10-word allocation)
is 10ns slower because it too needs a new atomic, because with the
denser representation, the byte on the end of the allocation is now shared
with the object next to it; this was not true with the 4-bit representation.
Excluding these two (fundamental) slowdowns due to the use of atomics,
the new code is noticeably faster than both Go 1.4 and the base CL.
The next CL will reintroduce the ``typeDead'' optimization.
Stats are from 5 runs on a MacBookPro10,2 (late 2012 Core i5).
Compared to base CL (** = new atomic)
name old mean new mean delta
SetTypePtr 14.1ns × (0.99,1.02) 14.7ns × (0.93,1.10) ~ (p=0.175)
SetTypePtr8 18.4ns × (1.00,1.01) 18.6ns × (0.81,1.21) ~ (p=0.866)
SetTypePtr16 28.7ns × (1.00,1.00) 22.4ns × (0.90,1.27) -21.88% (p=0.015)
SetTypePtr32 52.3ns × (1.00,1.00) 33.8ns × (0.93,1.24) -35.37% (p=0.001)
SetTypePtr64 79.2ns × (1.00,1.00) 55.1ns × (1.00,1.01) -30.43% (p=0.000)
SetTypePtr126 118ns × (1.00,1.00) 100ns × (1.00,1.00) -15.97% (p=0.000)
SetTypePtr128 130ns × (0.92,1.19) 98ns × (1.00,1.00) -24.36% (p=0.008)
SetTypePtrSlice 726ns × (0.96,1.08) 760ns × (1.00,1.00) ~ (p=0.152)
SetTypeNode1 14.1ns × (0.94,1.15) 12.0ns × (1.00,1.01) -14.60% (p=0.020)
SetTypeNode1Slice 135ns × (0.96,1.07) 88ns × (1.00,1.00) -34.53% (p=0.000)
SetTypeNode8 20.9ns × (1.00,1.01) 32.6ns × (1.00,1.00) +55.37% (p=0.000) **
SetTypeNode8Slice 414ns × (0.99,1.02) 244ns × (1.00,1.00) -41.09% (p=0.000)
SetTypeNode64 80.0ns × (1.00,1.00) 57.4ns × (1.00,1.00) -28.23% (p=0.000)
SetTypeNode64Slice 2.15µs × (1.00,1.01) 1.56µs × (1.00,1.00) -27.43% (p=0.000)
SetTypeNode124 119ns × (0.99,1.00) 100ns × (1.00,1.00) -16.11% (p=0.000)
SetTypeNode124Slice 3.40µs × (1.00,1.00) 2.93µs × (1.00,1.00) -13.80% (p=0.000)
SetTypeNode126 120ns × (1.00,1.01) 98ns × (1.00,1.00) -18.19% (p=0.000)
SetTypeNode126Slice 3.53µs × (0.98,1.08) 3.02µs × (1.00,1.00) -14.49% (p=0.002)
SetTypeNode1024 726ns × (0.97,1.09) 740ns × (1.00,1.00) ~ (p=0.451)
SetTypeNode1024Slice 24.9µs × (0.89,1.37) 23.1µs × (1.00,1.00) ~ (p=0.476)
Compared to Go 1.4 (** = new atomic)
name old mean new mean delta
SetTypePtr 5.71ns × (0.89,1.19) 14.68ns × (0.93,1.10) +157.24% (p=0.000) **
SetTypePtr8 19.3ns × (0.96,1.10) 18.6ns × (0.81,1.21) ~ (p=0.638)
SetTypePtr16 30.7ns × (0.99,1.03) 22.4ns × (0.90,1.27) -26.88% (p=0.005)
SetTypePtr32 51.5ns × (1.00,1.00) 33.8ns × (0.93,1.24) -34.40% (p=0.001)
SetTypePtr64 83.6ns × (0.94,1.12) 55.1ns × (1.00,1.01) -34.12% (p=0.001)
SetTypePtr126 137ns × (0.87,1.26) 100ns × (1.00,1.00) -27.10% (p=0.028)
SetTypePtrSlice 865ns × (0.80,1.23) 760ns × (1.00,1.00) ~ (p=0.243)
SetTypeNode1 15.2ns × (0.88,1.12) 12.0ns × (1.00,1.01) -20.89% (p=0.014)
SetTypeNode1Slice 156ns × (0.93,1.16) 88ns × (1.00,1.00) -43.57% (p=0.001)
SetTypeNode8 23.8ns × (0.90,1.18) 32.6ns × (1.00,1.00) +36.76% (p=0.003) **
SetTypeNode8Slice 502ns × (0.92,1.10) 244ns × (1.00,1.00) -51.46% (p=0.000)
SetTypeNode64 85.6ns × (0.94,1.11) 57.4ns × (1.00,1.00) -32.89% (p=0.001)
SetTypeNode64Slice 2.36µs × (0.91,1.14) 1.56µs × (1.00,1.00) -33.96% (p=0.002)
SetTypeNode124 130ns × (0.91,1.12) 100ns × (1.00,1.00) -23.49% (p=0.004)
SetTypeNode124Slice 3.81µs × (0.90,1.22) 2.93µs × (1.00,1.00) -23.09% (p=0.025)
There are fewer benchmarks vs Go 1.4 because unrolling directly
into the heap bitmap is not yet implemented, so those would not
be meaningful comparisons.
These benchmarks were not present in Go 1.4 as distributed.
The backport to Go 1.4 is in github.com/rsc/go's go14bench branch,
commit 71d5ee5.
Change-Id: I95ed05a22bf484b0fc9efad549279e766c98d2b6
Reviewed-on: https://go-review.googlesource.com/9704
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-05-04 11:30:10 -04:00
|
|
|
} else {
|
|
|
|
|
// Data ends with a full bitmap byte.
|
|
|
|
|
*hbitp = uint8(hb)
|
|
|
|
|
if w*ptrSize < size {
|
|
|
|
|
// There's more data in the allocated object.
|
|
|
|
|
// Write a dead marker in the next byte.
|
|
|
|
|
hbitp = subtractb(hbitp, 1)
|
|
|
|
|
if (w+4)*ptrSize <= size {
|
|
|
|
|
// We own the whole byte.
|
|
|
|
|
*hbitp = 0
|
|
|
|
|
} else {
|
|
|
|
|
// We only own the bottom half of the byte.
|
|
|
|
|
atomicand8(hbitp, 0xf0)
|
|
|
|
|
}
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
2015-04-28 00:28:47 -04:00
|
|
|
}
|
runtime: optimize heapBitsSetType
For the conversion of the heap bitmap from 4-bit to 2-bit fields,
I replaced heapBitsSetType with the dumbest thing that could possibly work:
two atomic operations (atomicand8+atomicor8) per 2-bit field.
This CL replaces that code with a proper implementation that
avoids the atomics whenever possible. Benchmarks vs base CL
(before the conversion to 2-bit heap bitmap) and vs Go 1.4 below.
Compared to Go 1.4, SetTypePtr (a 1-pointer allocation)
is 10ns slower because a race against the concurrent GC requires the
use of an atomicor8 that used to be an ordinary write. This slowdown
was present even in the base CL.
Compared to both Go 1.4 and base, SetTypeNode8 (a 10-word allocation)
is 10ns slower because it too needs a new atomic, because with the
denser representation, the byte on the end of the allocation is now shared
with the object next to it; this was not true with the 4-bit representation.
Excluding these two (fundamental) slowdowns due to the use of atomics,
the new code is noticeably faster than both Go 1.4 and the base CL.
The next CL will reintroduce the ``typeDead'' optimization.
Stats are from 5 runs on a MacBookPro10,2 (late 2012 Core i5).
Compared to base CL (** = new atomic)
name old mean new mean delta
SetTypePtr 14.1ns × (0.99,1.02) 14.7ns × (0.93,1.10) ~ (p=0.175)
SetTypePtr8 18.4ns × (1.00,1.01) 18.6ns × (0.81,1.21) ~ (p=0.866)
SetTypePtr16 28.7ns × (1.00,1.00) 22.4ns × (0.90,1.27) -21.88% (p=0.015)
SetTypePtr32 52.3ns × (1.00,1.00) 33.8ns × (0.93,1.24) -35.37% (p=0.001)
SetTypePtr64 79.2ns × (1.00,1.00) 55.1ns × (1.00,1.01) -30.43% (p=0.000)
SetTypePtr126 118ns × (1.00,1.00) 100ns × (1.00,1.00) -15.97% (p=0.000)
SetTypePtr128 130ns × (0.92,1.19) 98ns × (1.00,1.00) -24.36% (p=0.008)
SetTypePtrSlice 726ns × (0.96,1.08) 760ns × (1.00,1.00) ~ (p=0.152)
SetTypeNode1 14.1ns × (0.94,1.15) 12.0ns × (1.00,1.01) -14.60% (p=0.020)
SetTypeNode1Slice 135ns × (0.96,1.07) 88ns × (1.00,1.00) -34.53% (p=0.000)
SetTypeNode8 20.9ns × (1.00,1.01) 32.6ns × (1.00,1.00) +55.37% (p=0.000) **
SetTypeNode8Slice 414ns × (0.99,1.02) 244ns × (1.00,1.00) -41.09% (p=0.000)
SetTypeNode64 80.0ns × (1.00,1.00) 57.4ns × (1.00,1.00) -28.23% (p=0.000)
SetTypeNode64Slice 2.15µs × (1.00,1.01) 1.56µs × (1.00,1.00) -27.43% (p=0.000)
SetTypeNode124 119ns × (0.99,1.00) 100ns × (1.00,1.00) -16.11% (p=0.000)
SetTypeNode124Slice 3.40µs × (1.00,1.00) 2.93µs × (1.00,1.00) -13.80% (p=0.000)
SetTypeNode126 120ns × (1.00,1.01) 98ns × (1.00,1.00) -18.19% (p=0.000)
SetTypeNode126Slice 3.53µs × (0.98,1.08) 3.02µs × (1.00,1.00) -14.49% (p=0.002)
SetTypeNode1024 726ns × (0.97,1.09) 740ns × (1.00,1.00) ~ (p=0.451)
SetTypeNode1024Slice 24.9µs × (0.89,1.37) 23.1µs × (1.00,1.00) ~ (p=0.476)
Compared to Go 1.4 (** = new atomic)
name old mean new mean delta
SetTypePtr 5.71ns × (0.89,1.19) 14.68ns × (0.93,1.10) +157.24% (p=0.000) **
SetTypePtr8 19.3ns × (0.96,1.10) 18.6ns × (0.81,1.21) ~ (p=0.638)
SetTypePtr16 30.7ns × (0.99,1.03) 22.4ns × (0.90,1.27) -26.88% (p=0.005)
SetTypePtr32 51.5ns × (1.00,1.00) 33.8ns × (0.93,1.24) -34.40% (p=0.001)
SetTypePtr64 83.6ns × (0.94,1.12) 55.1ns × (1.00,1.01) -34.12% (p=0.001)
SetTypePtr126 137ns × (0.87,1.26) 100ns × (1.00,1.00) -27.10% (p=0.028)
SetTypePtrSlice 865ns × (0.80,1.23) 760ns × (1.00,1.00) ~ (p=0.243)
SetTypeNode1 15.2ns × (0.88,1.12) 12.0ns × (1.00,1.01) -20.89% (p=0.014)
SetTypeNode1Slice 156ns × (0.93,1.16) 88ns × (1.00,1.00) -43.57% (p=0.001)
SetTypeNode8 23.8ns × (0.90,1.18) 32.6ns × (1.00,1.00) +36.76% (p=0.003) **
SetTypeNode8Slice 502ns × (0.92,1.10) 244ns × (1.00,1.00) -51.46% (p=0.000)
SetTypeNode64 85.6ns × (0.94,1.11) 57.4ns × (1.00,1.00) -32.89% (p=0.001)
SetTypeNode64Slice 2.36µs × (0.91,1.14) 1.56µs × (1.00,1.00) -33.96% (p=0.002)
SetTypeNode124 130ns × (0.91,1.12) 100ns × (1.00,1.00) -23.49% (p=0.004)
SetTypeNode124Slice 3.81µs × (0.90,1.22) 2.93µs × (1.00,1.00) -23.09% (p=0.025)
There are fewer benchmarks vs Go 1.4 because unrolling directly
into the heap bitmap is not yet implemented, so those would not
be meaningful comparisons.
These benchmarks were not present in Go 1.4 as distributed.
The backport to Go 1.4 is in github.com/rsc/go's go14bench branch,
commit 71d5ee5.
Change-Id: I95ed05a22bf484b0fc9efad549279e766c98d2b6
Reviewed-on: https://go-review.googlesource.com/9704
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-05-04 11:30:10 -04:00
|
|
|
|
|
|
|
|
const test = false // slow but helpful
|
|
|
|
|
if test {
|
|
|
|
|
// Double-check that bits to be written were written correctly.
|
|
|
|
|
// Does not check that other bits were not written, unfortunately.
|
|
|
|
|
h := heapBitsForAddr(x)
|
|
|
|
|
nptr := typ.size / ptrSize
|
|
|
|
|
for i := uintptr(0); i <= dataSize/ptrSize; i++ {
|
|
|
|
|
j := i % nptr
|
|
|
|
|
var have, want uint8
|
|
|
|
|
if i == dataSize/ptrSize {
|
|
|
|
|
if dataSize >= size {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
have = (*h.bitp >> h.shift) & 3
|
|
|
|
|
want = 0 // dead bits
|
|
|
|
|
} else {
|
|
|
|
|
have = (*h.bitp >> h.shift) & 3
|
|
|
|
|
if (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
|
|
|
|
|
want |= bitPointer
|
|
|
|
|
}
|
|
|
|
|
if i >= 2 {
|
|
|
|
|
want |= bitMarked
|
|
|
|
|
} else {
|
|
|
|
|
have &^= bitMarked
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if have != want {
|
|
|
|
|
println("mismatch writing bits for", *typ._string, "x", dataSize/typ.size)
|
|
|
|
|
print("typ.size=", typ.size, " dataSize=", dataSize, " size=", size, "\n")
|
|
|
|
|
h = heapBitsForAddr(x)
|
|
|
|
|
print("initial bits h.bitp=", h.bitp, " h.shift=", h.shift, "\n")
|
|
|
|
|
print("p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
|
|
|
|
|
println("at word", i, "offset", i*ptrSize, "have", have, "want", want)
|
|
|
|
|
throw("bad heapBitsSetType")
|
|
|
|
|
}
|
|
|
|
|
h = h.next()
|
|
|
|
|
}
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-04-28 00:28:47 -04:00
|
|
|
// ptrBitmapForType returns a bitmap indicating where pointers are
|
|
|
|
|
// in the memory representation of the type typ.
|
|
|
|
|
// The bit x[i/8]&(1<<(i%8)) is 1 if the i'th word in a value of type typ
|
|
|
|
|
// is a pointer.
|
|
|
|
|
func ptrBitmapForType(typ *_type) []uint8 {
|
2015-01-16 14:43:38 -05:00
|
|
|
var ptrmask *uint8
|
|
|
|
|
nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
|
|
|
|
|
if typ.kind&kindGCProg != 0 {
|
2015-04-28 00:28:47 -04:00
|
|
|
masksize := (nptr + 7) / 8
|
|
|
|
|
masksize++ // unroll flag in the beginning
|
2015-01-16 14:43:38 -05:00
|
|
|
if masksize > maxGCMask && typ.gc[1] != 0 {
|
|
|
|
|
// write barriers have not been updated to deal with this case yet.
|
|
|
|
|
throw("maxGCMask too small for now")
|
|
|
|
|
}
|
|
|
|
|
ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
|
|
|
|
|
// Check whether the program is already unrolled
|
|
|
|
|
// by checking if the unroll flag byte is set
|
|
|
|
|
maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask)))
|
|
|
|
|
if *(*uint8)(unsafe.Pointer(&maskword)) == 0 {
|
|
|
|
|
systemstack(func() {
|
|
|
|
|
unrollgcprog_m(typ)
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
|
|
|
|
|
} else {
|
|
|
|
|
ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
|
|
|
|
|
}
|
2015-04-28 00:28:47 -04:00
|
|
|
return (*[1 << 30]byte)(unsafe.Pointer(ptrmask))[:(nptr+7)/8]
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// GC type info programs
|
|
|
|
|
//
|
|
|
|
|
// TODO(rsc): Clean up and enable.
|
|
|
|
|
|
|
|
|
|
// GC type info programs.
//
// A program makes it possible to store the type info required for GC
// in a compact form. Most importantly, arrays take O(1) space instead of O(n).
// The program grammar is:
//
//	Program = {Block} "insEnd"
//	Block = Data | Array
//	Data = "insData" DataSize DataBlock
//	DataSize = int // size of the DataBlock in bit pairs, 1 byte
//	DataBlock = binary // dense GC mask (2 bits per word) of size ]DataSize/4[ bytes
//	Array = "insArray" ArrayLen Block "insArrayEnd"
//	ArrayLen = int // length of the array, 8 bytes (4 bytes for 32-bit arch)
//
// Each instruction (insData, insArray, etc) is 1 byte.
// For example, for type struct { x []byte; y [20]struct{ z int; w *byte }; }
// the program looks as:
//
//	insData 3 (typePointer typeScalar typeScalar)
//	insArray 20 insData 2 (typeScalar typePointer) insArrayEnd insEnd
//
// Total size of the program is 17 bytes (13 bytes on 32-bits).
// The corresponding GC mask would take 43 bytes (it would be repeated
// because the type has odd number of words).
//
// NOTE(review): the "bit pairs" / "2 bits per word" wording and the byte
// counts above may be stale; the insData decoder in unrollgcprog1 extracts
// a single bit per entry (p[i/8] >> (i%8) & 1). Confirm and update.
const (
	insData = iota + 1 // literal block of mask data follows
	insArray           // start of a repeated block
	insArrayEnd        // end of a repeated block
	insEnd             // end of the program
)

// 64 bytes cover objects of size 1024/512 on 64/32 bits, respectively.
const maxGCMask = 65536 // TODO(rsc): change back to 64
|
|
|
|
|
|
|
|
|
|
// Recursively unrolls GC program in prog.
|
|
|
|
|
// mask is where to store the result.
|
|
|
|
|
// If inplace is true, store the result not in mask but in the heap bitmap for mask.
|
|
|
|
|
// ppos is a pointer to position in mask, in bits.
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 22:45:57 -04:00
|
|
|
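// NOTE(review): the signature below has no sparse parameter, so the next sentence looks stale — confirm.
// sparse says to generate 4-bits per word mask for heap (1-bit for data/bss otherwise).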
|
2015-01-16 14:43:38 -05:00
|
|
|
//go:nowritebarrier
|
2015-05-04 10:19:24 -04:00
|
|
|
func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace bool) *byte {
|
2015-01-16 14:43:38 -05:00
|
|
|
pos := *ppos
|
|
|
|
|
mask := (*[1 << 30]byte)(unsafe.Pointer(maskp))
|
|
|
|
|
for {
|
|
|
|
|
switch *prog {
|
|
|
|
|
default:
|
|
|
|
|
throw("unrollgcprog: unknown instruction")
|
|
|
|
|
|
|
|
|
|
case insData:
|
|
|
|
|
prog = addb(prog, 1)
|
|
|
|
|
siz := int(*prog)
|
|
|
|
|
prog = addb(prog, 1)
|
|
|
|
|
p := (*[1 << 30]byte)(unsafe.Pointer(prog))
|
|
|
|
|
for i := 0; i < siz; i++ {
|
2015-04-28 00:28:47 -04:00
|
|
|
v := p[i/8] >> (uint(i) % 8) & 1
|
2015-01-16 14:43:38 -05:00
|
|
|
if inplace {
|
2015-05-04 10:19:24 -04:00
|
|
|
throw("gc inplace")
|
|
|
|
|
const typeShift = 2
|
2015-01-16 14:43:38 -05:00
|
|
|
// Store directly into GC bitmap.
|
|
|
|
|
h := heapBitsForAddr(uintptr(unsafe.Pointer(&mask[pos])))
|
|
|
|
|
if h.shift == 0 {
|
|
|
|
|
*h.bitp = v << typeShift
|
|
|
|
|
} else {
|
|
|
|
|
*h.bitp |= v << (4 + typeShift)
|
|
|
|
|
}
|
|
|
|
|
pos += ptrSize
|
|
|
|
|
} else {
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 22:45:57 -04:00
|
|
|
// 1 bit per word, for data/bss bitmap
|
|
|
|
|
mask[pos/8] |= v << (pos % 8)
|
|
|
|
|
pos++
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
}
|
2015-04-28 00:28:47 -04:00
|
|
|
prog = addb(prog, (uintptr(siz)+7)/8)
|
2015-01-16 14:43:38 -05:00
|
|
|
|
|
|
|
|
case insArray:
|
|
|
|
|
prog = (*byte)(add(unsafe.Pointer(prog), 1))
|
|
|
|
|
siz := uintptr(0)
|
|
|
|
|
for i := uintptr(0); i < ptrSize; i++ {
|
|
|
|
|
siz = (siz << 8) + uintptr(*(*byte)(add(unsafe.Pointer(prog), ptrSize-i-1)))
|
|
|
|
|
}
|
|
|
|
|
prog = (*byte)(add(unsafe.Pointer(prog), ptrSize))
|
|
|
|
|
var prog1 *byte
|
|
|
|
|
for i := uintptr(0); i < siz; i++ {
|
2015-05-04 10:19:24 -04:00
|
|
|
prog1 = unrollgcprog1(&mask[0], prog, &pos, inplace)
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
if *prog1 != insArrayEnd {
|
|
|
|
|
throw("unrollgcprog: array does not end with insArrayEnd")
|
|
|
|
|
}
|
|
|
|
|
prog = (*byte)(add(unsafe.Pointer(prog1), 1))
|
|
|
|
|
|
|
|
|
|
case insArrayEnd, insEnd:
|
|
|
|
|
*ppos = pos
|
|
|
|
|
return prog
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-04-28 00:28:47 -04:00
|
|
|
// Unrolls GC program prog for data/bss, returns 1-bit GC mask.
|
2015-01-16 14:43:38 -05:00
|
|
|
func unrollglobgcprog(prog *byte, size uintptr) bitvector {
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 22:45:57 -04:00
|
|
|
masksize := round(round(size, ptrSize)/ptrSize, 8) / 8
|
2015-01-16 14:43:38 -05:00
|
|
|
mask := (*[1 << 30]byte)(persistentalloc(masksize+1, 0, &memstats.gc_sys))
|
|
|
|
|
mask[masksize] = 0xa1
|
|
|
|
|
pos := uintptr(0)
|
2015-05-04 10:19:24 -04:00
|
|
|
prog = unrollgcprog1(&mask[0], prog, &pos, false)
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
Performance change is basically a wash, but using less memory,
and smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 22:45:57 -04:00
|
|
|
if pos != size/ptrSize {
|
|
|
|
|
print("unrollglobgcprog: bad program size, got ", pos, ", expect ", size/ptrSize, "\n")
|
2015-01-16 14:43:38 -05:00
|
|
|
throw("unrollglobgcprog: bad program size")
|
|
|
|
|
}
|
|
|
|
|
if *prog != insEnd {
|
|
|
|
|
throw("unrollglobgcprog: program does not end with insEnd")
|
|
|
|
|
}
|
|
|
|
|
if mask[masksize] != 0xa1 {
|
|
|
|
|
throw("unrollglobgcprog: overflow")
|
|
|
|
|
}
|
|
|
|
|
return bitvector{int32(masksize * 8), &mask[0]}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func unrollgcproginplace_m(v unsafe.Pointer, typ *_type, size, size0 uintptr) {
|
2015-05-04 10:19:24 -04:00
|
|
|
throw("unrollinplace")
|
|
|
|
|
// TODO(rsc): Update for 1-bit bitmaps.
|
2015-01-16 14:43:38 -05:00
|
|
|
// TODO(rsc): Explain why these non-atomic updates are okay.
|
|
|
|
|
pos := uintptr(0)
|
|
|
|
|
prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
|
|
|
|
|
for pos != size0 {
|
2015-05-04 10:19:24 -04:00
|
|
|
unrollgcprog1((*byte)(v), prog, &pos, true)
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Mark first word as bitAllocated.
|
|
|
|
|
// Mark word after last as typeDead.
|
|
|
|
|
if size0 < size {
|
|
|
|
|
h := heapBitsForAddr(uintptr(v) + size0)
|
2015-05-04 10:19:24 -04:00
|
|
|
const typeMask = 0
|
|
|
|
|
const typeShift = 0
|
2015-01-16 14:43:38 -05:00
|
|
|
*h.bitp &^= typeMask << typeShift
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// unroll is the mutex that unrollgcprog_m holds while it expands a type's
// GC program.
var unroll mutex
|
|
|
|
|
|
|
|
|
|
// Unrolls GC program in typ.gc[1] into typ.gc[0]
|
|
|
|
|
//go:nowritebarrier
|
|
|
|
|
func unrollgcprog_m(typ *_type) {
|
|
|
|
|
lock(&unroll)
|
|
|
|
|
mask := (*byte)(unsafe.Pointer(uintptr(typ.gc[0])))
|
|
|
|
|
if *mask == 0 {
|
|
|
|
|
pos := uintptr(8) // skip the unroll flag
|
|
|
|
|
prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
|
2015-05-04 10:19:24 -04:00
|
|
|
prog = unrollgcprog1(mask, prog, &pos, false)
|
2015-01-16 14:43:38 -05:00
|
|
|
if *prog != insEnd {
|
|
|
|
|
throw("unrollgcprog: program does not end with insEnd")
|
|
|
|
|
}
|
|
|
|
|
// atomic way to say mask[0] = 1
|
|
|
|
|
atomicor8(mask, 1)
|
|
|
|
|
}
|
|
|
|
|
unlock(&unroll)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Testing.
|
|
|
|
|
|
|
|
|
|
func getgcmaskcb(frame *stkframe, ctxt unsafe.Pointer) bool {
|
|
|
|
|
target := (*stkframe)(ctxt)
|
|
|
|
|
if frame.sp <= target.sp && target.sp < frame.varp {
|
|
|
|
|
*target = *frame
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Returns GC type info for object p for testing.
|
2015-04-28 00:28:47 -04:00
|
|
|
func getgcmask(ep interface{}) (mask []byte) {
|
|
|
|
|
e := *(*eface)(unsafe.Pointer(&ep))
|
|
|
|
|
p := e.data
|
|
|
|
|
t := e._type
|
|
|
|
|
// data or bss
|
2015-04-07 12:55:02 +12:00
|
|
|
for datap := &firstmoduledata; datap != nil; datap = datap.next {
|
2015-04-28 00:28:47 -04:00
|
|
|
// data
|
2015-03-29 21:59:00 +00:00
|
|
|
if datap.data <= uintptr(p) && uintptr(p) < datap.edata {
|
2015-05-04 10:19:24 -04:00
|
|
|
bitmap := datap.gcdatamask.bytedata
|
2015-03-29 21:59:00 +00:00
|
|
|
n := (*ptrtype)(unsafe.Pointer(t)).elem.size
|
2015-04-28 00:28:47 -04:00
|
|
|
mask = make([]byte, n/ptrSize)
|
2015-03-29 21:59:00 +00:00
|
|
|
for i := uintptr(0); i < n; i += ptrSize {
|
|
|
|
|
off := (uintptr(p) + i - datap.data) / ptrSize
|
2015-05-04 10:19:24 -04:00
|
|
|
mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
|
2015-03-29 21:59:00 +00:00
|
|
|
}
|
|
|
|
|
return
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
|
2015-03-29 21:59:00 +00:00
|
|
|
// bss
|
|
|
|
|
if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss {
|
2015-05-04 10:19:24 -04:00
|
|
|
bitmap := datap.gcbssmask.bytedata
|
2015-03-29 21:59:00 +00:00
|
|
|
n := (*ptrtype)(unsafe.Pointer(t)).elem.size
|
2015-04-28 00:28:47 -04:00
|
|
|
mask = make([]byte, n/ptrSize)
|
2015-03-29 21:59:00 +00:00
|
|
|
for i := uintptr(0); i < n; i += ptrSize {
|
|
|
|
|
off := (uintptr(p) + i - datap.bss) / ptrSize
|
2015-05-04 10:19:24 -04:00
|
|
|
mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
|
2015-03-29 21:59:00 +00:00
|
|
|
}
|
|
|
|
|
return
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// heap
|
|
|
|
|
var n uintptr
|
|
|
|
|
var base uintptr
|
|
|
|
|
if mlookup(uintptr(p), &base, &n, nil) != 0 {
|
2015-04-28 00:28:47 -04:00
|
|
|
mask = make([]byte, n/ptrSize)
|
2015-01-16 14:43:38 -05:00
|
|
|
for i := uintptr(0); i < n; i += ptrSize {
|
2015-05-04 10:19:24 -04:00
|
|
|
hbits := heapBitsForAddr(base + i)
|
|
|
|
|
if hbits.isPointer() {
|
|
|
|
|
mask[i/ptrSize] = 1
|
|
|
|
|
}
|
|
|
|
|
if i >= 2*ptrSize && !hbits.isMarked() {
|
|
|
|
|
mask[i/ptrSize] = 255
|
|
|
|
|
break
|
|
|
|
|
}
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// stack
|
|
|
|
|
var frame stkframe
|
|
|
|
|
frame.sp = uintptr(p)
|
|
|
|
|
_g_ := getg()
|
|
|
|
|
gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0)
|
|
|
|
|
if frame.fn != nil {
|
|
|
|
|
f := frame.fn
|
|
|
|
|
targetpc := frame.continpc
|
|
|
|
|
if targetpc == 0 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
if targetpc != f.entry {
|
|
|
|
|
targetpc--
|
|
|
|
|
}
|
|
|
|
|
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
|
|
|
|
|
if pcdata == -1 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
|
|
|
|
|
if stkmap == nil || stkmap.n <= 0 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
bv := stackmapdata(stkmap, pcdata)
|
cmd/internal/gc, runtime: use 1-bit bitmap for stack frames, data, bss
The bitmaps were 2 bits per pointer because we needed to distinguish
scalar, pointer, multiword, and we used the leftover value to distinguish
uninitialized from scalar, even though the garbage collector (GC) didn't care.
Now that there are no multiword structures from the GC's point of view,
cut the bitmaps down to 1 bit per pointer, recording just live pointer vs not.
The GC assumes the same layout for stack frames and for the maps
describing the global data and bss sections, so change them all in one CL.
The code still refers to 4-bit heap bitmaps and 2-bit "type bitmaps", since
the 2-bit representation lives (at least for now) in some of the reflect data.
Because these stack frame bitmaps are stored directly in the rodata in
the binary, this CL reduces the size of the 6g binary by about 1.1%.
The performance change is basically a wash, but this CL uses less memory,
produces smaller binaries, and enables other bitmap reductions.
name old mean new mean delta
BenchmarkBinaryTree17 13.2s × (0.97,1.03) 13.0s × (0.99,1.01) -0.93% (p=0.005)
BenchmarkBinaryTree17-2 9.69s × (0.96,1.05) 9.51s × (0.96,1.03) -1.86% (p=0.001)
BenchmarkBinaryTree17-4 10.1s × (0.97,1.05) 10.0s × (0.96,1.05) ~ (p=0.141)
BenchmarkFannkuch11 4.35s × (0.99,1.01) 4.43s × (0.98,1.04) +1.75% (p=0.001)
BenchmarkFannkuch11-2 4.31s × (0.99,1.03) 4.32s × (1.00,1.00) ~ (p=0.095)
BenchmarkFannkuch11-4 4.32s × (0.99,1.02) 4.38s × (0.98,1.04) +1.38% (p=0.008)
BenchmarkFmtFprintfEmpty 83.5ns × (0.97,1.10) 87.3ns × (0.92,1.11) +4.55% (p=0.014)
BenchmarkFmtFprintfEmpty-2 81.8ns × (0.98,1.04) 82.5ns × (0.97,1.08) ~ (p=0.364)
BenchmarkFmtFprintfEmpty-4 80.9ns × (0.99,1.01) 82.6ns × (0.97,1.08) +2.12% (p=0.010)
BenchmarkFmtFprintfString 320ns × (0.95,1.04) 322ns × (0.97,1.05) ~ (p=0.368)
BenchmarkFmtFprintfString-2 303ns × (0.97,1.04) 304ns × (0.97,1.04) ~ (p=0.484)
BenchmarkFmtFprintfString-4 305ns × (0.97,1.05) 306ns × (0.98,1.05) ~ (p=0.543)
BenchmarkFmtFprintfInt 311ns × (0.98,1.03) 319ns × (0.97,1.03) +2.63% (p=0.000)
BenchmarkFmtFprintfInt-2 297ns × (0.98,1.04) 301ns × (0.97,1.04) +1.19% (p=0.023)
BenchmarkFmtFprintfInt-4 302ns × (0.98,1.02) 304ns × (0.97,1.03) ~ (p=0.126)
BenchmarkFmtFprintfIntInt 554ns × (0.96,1.05) 554ns × (0.97,1.03) ~ (p=0.975)
BenchmarkFmtFprintfIntInt-2 520ns × (0.98,1.03) 517ns × (0.98,1.02) ~ (p=0.153)
BenchmarkFmtFprintfIntInt-4 524ns × (0.98,1.02) 525ns × (0.98,1.03) ~ (p=0.597)
BenchmarkFmtFprintfPrefixedInt 433ns × (0.97,1.06) 434ns × (0.97,1.06) ~ (p=0.804)
BenchmarkFmtFprintfPrefixedInt-2 413ns × (0.98,1.04) 413ns × (0.98,1.03) ~ (p=0.881)
BenchmarkFmtFprintfPrefixedInt-4 420ns × (0.97,1.03) 421ns × (0.97,1.03) ~ (p=0.561)
BenchmarkFmtFprintfFloat 620ns × (0.99,1.03) 636ns × (0.97,1.03) +2.57% (p=0.000)
BenchmarkFmtFprintfFloat-2 601ns × (0.98,1.02) 617ns × (0.98,1.03) +2.58% (p=0.000)
BenchmarkFmtFprintfFloat-4 613ns × (0.98,1.03) 626ns × (0.98,1.02) +2.15% (p=0.000)
BenchmarkFmtManyArgs 2.19µs × (0.96,1.04) 2.23µs × (0.97,1.02) +1.65% (p=0.000)
BenchmarkFmtManyArgs-2 2.08µs × (0.98,1.03) 2.10µs × (0.99,1.02) +0.79% (p=0.019)
BenchmarkFmtManyArgs-4 2.10µs × (0.98,1.02) 2.13µs × (0.98,1.02) +1.72% (p=0.000)
BenchmarkGobDecode 21.3ms × (0.97,1.05) 21.1ms × (0.97,1.04) -1.36% (p=0.025)
BenchmarkGobDecode-2 20.0ms × (0.97,1.03) 19.2ms × (0.97,1.03) -4.00% (p=0.000)
BenchmarkGobDecode-4 19.5ms × (0.99,1.02) 19.0ms × (0.99,1.01) -2.39% (p=0.000)
BenchmarkGobEncode 18.3ms × (0.95,1.07) 18.1ms × (0.96,1.08) ~ (p=0.305)
BenchmarkGobEncode-2 16.8ms × (0.97,1.02) 16.4ms × (0.98,1.02) -2.79% (p=0.000)
BenchmarkGobEncode-4 15.4ms × (0.98,1.02) 15.4ms × (0.98,1.02) ~ (p=0.465)
BenchmarkGzip 650ms × (0.98,1.03) 655ms × (0.97,1.04) ~ (p=0.075)
BenchmarkGzip-2 652ms × (0.98,1.03) 655ms × (0.98,1.02) ~ (p=0.337)
BenchmarkGzip-4 656ms × (0.98,1.04) 653ms × (0.98,1.03) ~ (p=0.291)
BenchmarkGunzip 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.507)
BenchmarkGunzip-2 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.313)
BenchmarkGunzip-4 143ms × (1.00,1.01) 143ms × (1.00,1.01) ~ (p=0.312)
BenchmarkHTTPClientServer 110µs × (0.98,1.03) 109µs × (0.99,1.02) -1.40% (p=0.000)
BenchmarkHTTPClientServer-2 154µs × (0.90,1.08) 149µs × (0.90,1.08) -3.43% (p=0.007)
BenchmarkHTTPClientServer-4 138µs × (0.97,1.04) 138µs × (0.96,1.04) ~ (p=0.670)
BenchmarkJSONEncode 40.2ms × (0.98,1.02) 40.2ms × (0.98,1.05) ~ (p=0.828)
BenchmarkJSONEncode-2 35.1ms × (0.99,1.02) 35.2ms × (0.98,1.03) ~ (p=0.392)
BenchmarkJSONEncode-4 35.3ms × (0.98,1.03) 35.3ms × (0.98,1.02) ~ (p=0.813)
BenchmarkJSONDecode 119ms × (0.97,1.02) 117ms × (0.98,1.02) -1.80% (p=0.000)
BenchmarkJSONDecode-2 115ms × (0.99,1.02) 114ms × (0.98,1.02) -1.18% (p=0.000)
BenchmarkJSONDecode-4 116ms × (0.98,1.02) 114ms × (0.98,1.02) -1.43% (p=0.000)
BenchmarkMandelbrot200 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.985)
BenchmarkMandelbrot200-2 6.03ms × (1.00,1.01) 6.02ms × (1.00,1.01) ~ (p=0.320)
BenchmarkMandelbrot200-4 6.03ms × (1.00,1.01) 6.03ms × (1.00,1.01) ~ (p=0.799)
BenchmarkGoParse 8.63ms × (0.89,1.10) 8.58ms × (0.93,1.09) ~ (p=0.667)
BenchmarkGoParse-2 8.20ms × (0.97,1.04) 8.37ms × (0.97,1.04) +1.96% (p=0.001)
BenchmarkGoParse-4 8.00ms × (0.98,1.02) 8.14ms × (0.99,1.02) +1.75% (p=0.000)
BenchmarkRegexpMatchEasy0_32 162ns × (1.00,1.01) 164ns × (0.98,1.04) +1.35% (p=0.011)
BenchmarkRegexpMatchEasy0_32-2 161ns × (1.00,1.01) 161ns × (1.00,1.00) ~ (p=0.185)
BenchmarkRegexpMatchEasy0_32-4 161ns × (1.00,1.00) 161ns × (1.00,1.00) -0.19% (p=0.001)
BenchmarkRegexpMatchEasy0_1K 540ns × (0.99,1.02) 566ns × (0.98,1.04) +4.98% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-2 540ns × (0.99,1.01) 557ns × (0.99,1.01) +3.21% (p=0.000)
BenchmarkRegexpMatchEasy0_1K-4 541ns × (0.99,1.01) 559ns × (0.99,1.01) +3.26% (p=0.000)
BenchmarkRegexpMatchEasy1_32 139ns × (0.98,1.04) 139ns × (0.99,1.03) ~ (p=0.979)
BenchmarkRegexpMatchEasy1_32-2 139ns × (0.99,1.04) 139ns × (0.99,1.02) ~ (p=0.777)
BenchmarkRegexpMatchEasy1_32-4 139ns × (0.98,1.04) 139ns × (0.99,1.04) ~ (p=0.771)
BenchmarkRegexpMatchEasy1_1K 890ns × (0.99,1.03) 885ns × (1.00,1.01) -0.50% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-2 888ns × (0.99,1.01) 885ns × (0.99,1.01) -0.37% (p=0.004)
BenchmarkRegexpMatchEasy1_1K-4 890ns × (0.99,1.02) 884ns × (1.00,1.00) -0.70% (p=0.000)
BenchmarkRegexpMatchMedium_32 252ns × (0.99,1.01) 251ns × (0.99,1.01) ~ (p=0.081)
BenchmarkRegexpMatchMedium_32-2 254ns × (0.99,1.04) 252ns × (0.99,1.01) -0.78% (p=0.027)
BenchmarkRegexpMatchMedium_32-4 253ns × (0.99,1.04) 252ns × (0.99,1.01) -0.70% (p=0.022)
BenchmarkRegexpMatchMedium_1K 72.9µs × (0.99,1.01) 72.7µs × (1.00,1.00) ~ (p=0.064)
BenchmarkRegexpMatchMedium_1K-2 74.1µs × (0.98,1.05) 72.9µs × (1.00,1.01) -1.61% (p=0.001)
BenchmarkRegexpMatchMedium_1K-4 73.6µs × (0.99,1.05) 72.8µs × (1.00,1.00) -1.13% (p=0.007)
BenchmarkRegexpMatchHard_32 3.88µs × (0.99,1.03) 3.92µs × (0.98,1.05) ~ (p=0.143)
BenchmarkRegexpMatchHard_32-2 3.89µs × (0.99,1.03) 3.93µs × (0.98,1.09) ~ (p=0.278)
BenchmarkRegexpMatchHard_32-4 3.90µs × (0.99,1.05) 3.93µs × (0.98,1.05) ~ (p=0.252)
BenchmarkRegexpMatchHard_1K 118µs × (0.99,1.01) 117µs × (0.99,1.02) -0.54% (p=0.003)
BenchmarkRegexpMatchHard_1K-2 118µs × (0.99,1.01) 118µs × (0.99,1.03) ~ (p=0.581)
BenchmarkRegexpMatchHard_1K-4 118µs × (0.99,1.02) 117µs × (0.99,1.01) -0.54% (p=0.002)
BenchmarkRevcomp 991ms × (0.95,1.10) 989ms × (0.94,1.08) ~ (p=0.879)
BenchmarkRevcomp-2 978ms × (0.95,1.11) 962ms × (0.96,1.08) ~ (p=0.257)
BenchmarkRevcomp-4 979ms × (0.96,1.07) 974ms × (0.96,1.11) ~ (p=0.678)
BenchmarkTemplate 141ms × (0.99,1.02) 145ms × (0.99,1.02) +2.75% (p=0.000)
BenchmarkTemplate-2 135ms × (0.98,1.02) 138ms × (0.99,1.02) +2.34% (p=0.000)
BenchmarkTemplate-4 136ms × (0.98,1.02) 140ms × (0.99,1.02) +2.71% (p=0.000)
BenchmarkTimeParse 640ns × (0.99,1.01) 622ns × (0.99,1.01) -2.88% (p=0.000)
BenchmarkTimeParse-2 640ns × (0.99,1.01) 622ns × (1.00,1.00) -2.81% (p=0.000)
BenchmarkTimeParse-4 640ns × (1.00,1.01) 622ns × (0.99,1.01) -2.82% (p=0.000)
BenchmarkTimeFormat 730ns × (0.98,1.02) 731ns × (0.98,1.03) ~ (p=0.767)
BenchmarkTimeFormat-2 709ns × (0.99,1.02) 707ns × (0.99,1.02) ~ (p=0.347)
BenchmarkTimeFormat-4 717ns × (0.98,1.01) 718ns × (0.98,1.02) ~ (p=0.793)
Change-Id: Ie779c47e912bf80eb918bafa13638bd8dfd6c2d9
Reviewed-on: https://go-review.googlesource.com/9406
Reviewed-by: Rick Hudson <rlh@golang.org>
2015-04-27 22:45:57 -04:00
|
|
|
size := uintptr(bv.n) * ptrSize
|
2015-01-16 14:43:38 -05:00
|
|
|
n := (*ptrtype)(unsafe.Pointer(t)).elem.size
|
2015-04-28 00:28:47 -04:00
|
|
|
mask = make([]byte, n/ptrSize)
|
2015-01-16 14:43:38 -05:00
|
|
|
for i := uintptr(0); i < n; i += ptrSize {
|
2015-05-04 10:19:24 -04:00
|
|
|
bitmap := bv.bytedata
|
2015-01-16 14:43:38 -05:00
|
|
|
off := (uintptr(p) + i - frame.varp + size) / ptrSize
|
2015-05-04 10:19:24 -04:00
|
|
|
mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|
|
|
|
|
}
|
2015-04-28 00:28:47 -04:00
|
|
|
return
|
2015-01-16 14:43:38 -05:00
|
|
|
}
|