2015-02-27 22:57:28 -05:00
|
|
|
// Derived from Inferno utils/6l/obj.c and utils/6l/span.c
|
|
|
|
|
// http://code.google.com/p/inferno-os/source/browse/utils/6l/obj.c
|
|
|
|
|
// http://code.google.com/p/inferno-os/source/browse/utils/6l/span.c
|
|
|
|
|
//
|
|
|
|
|
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
|
|
|
|
|
// Portions Copyright © 1995-1997 C H Forsyth (forsyth@terzarima.net)
|
|
|
|
|
// Portions Copyright © 1997-1999 Vita Nuova Limited
|
|
|
|
|
// Portions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com)
|
|
|
|
|
// Portions Copyright © 2004,2006 Bruce Ellis
|
|
|
|
|
// Portions Copyright © 2005-2007 C H Forsyth (forsyth@terzarima.net)
|
|
|
|
|
// Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
|
2016-04-10 14:32:26 -07:00
|
|
|
// Portions Copyright © 2009 The Go Authors. All rights reserved.
|
2015-02-27 22:57:28 -05:00
|
|
|
//
|
|
|
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
|
|
|
// in the Software without restriction, including without limitation the rights
|
|
|
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
|
|
|
// furnished to do so, subject to the following conditions:
|
|
|
|
|
//
|
|
|
|
|
// The above copyright notice and this permission notice shall be included in
|
|
|
|
|
// all copies or substantial portions of the Software.
|
|
|
|
|
//
|
|
|
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
|
// THE SOFTWARE.
|
|
|
|
|
|
|
|
|
|
package ld
|
|
|
|
|
|
|
|
|
|
import (
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
"cmd/internal/gcprog"
|
2015-02-27 22:57:28 -05:00
|
|
|
"cmd/internal/obj"
|
2016-04-06 12:01:40 -07:00
|
|
|
"cmd/internal/sys"
|
2015-02-27 22:57:28 -05:00
|
|
|
"fmt"
|
|
|
|
|
"log"
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
"os"
|
2016-03-09 16:23:25 +02:00
|
|
|
"sort"
|
2015-06-04 15:15:48 -04:00
|
|
|
"strconv"
|
2015-02-27 22:57:28 -05:00
|
|
|
"strings"
|
2016-04-18 14:50:14 -04:00
|
|
|
"sync"
|
2015-02-27 22:57:28 -05:00
|
|
|
)
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Symgrow(ctxt *Link, s *Symbol, siz int64) {
|
2015-02-27 22:57:28 -05:00
|
|
|
if int64(int(siz)) != siz {
|
|
|
|
|
log.Fatalf("symgrow size %d too long", siz)
|
|
|
|
|
}
|
|
|
|
|
if int64(len(s.P)) >= siz {
|
|
|
|
|
return
|
|
|
|
|
}
|
2016-04-07 18:00:57 +03:00
|
|
|
if cap(s.P) < int(siz) {
|
|
|
|
|
p := make([]byte, 2*(siz+1))
|
|
|
|
|
s.P = append(p[:0], s.P...)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
s.P = s.P[:siz]
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Addrel(s *Symbol) *Reloc {
|
2015-02-27 22:57:28 -05:00
|
|
|
s.R = append(s.R, Reloc{})
|
|
|
|
|
return &s.R[len(s.R)-1]
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func setuintxx(ctxt *Link, s *Symbol, off int64, v uint64, wid int64) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Size < off+wid {
|
|
|
|
|
s.Size = off + wid
|
|
|
|
|
Symgrow(ctxt, s, s.Size)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
switch wid {
|
|
|
|
|
case 1:
|
|
|
|
|
s.P[off] = uint8(v)
|
|
|
|
|
case 2:
|
|
|
|
|
ctxt.Arch.ByteOrder.PutUint16(s.P[off:], uint16(v))
|
|
|
|
|
case 4:
|
|
|
|
|
ctxt.Arch.ByteOrder.PutUint32(s.P[off:], uint32(v))
|
|
|
|
|
case 8:
|
2016-04-14 19:04:45 -07:00
|
|
|
ctxt.Arch.ByteOrder.PutUint64(s.P[off:], v)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return off + wid
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Addbytes(ctxt *Link, s *Symbol, bytes []byte) int64 {
|
2016-03-14 09:23:04 -07:00
|
|
|
if s.Type == 0 {
|
|
|
|
|
s.Type = obj.SDATA
|
|
|
|
|
}
|
|
|
|
|
s.Attr |= AttrReachable
|
|
|
|
|
s.P = append(s.P, bytes...)
|
2016-04-07 18:00:57 +03:00
|
|
|
s.Size = int64(len(s.P))
|
2016-03-14 09:23:04 -07:00
|
|
|
|
|
|
|
|
return s.Size
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func adduintxx(ctxt *Link, s *Symbol, v uint64, wid int) int64 {
|
2015-03-02 12:35:15 -05:00
|
|
|
off := s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
setuintxx(ctxt, s, off, v, int64(wid))
|
|
|
|
|
return off
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Adduint8(ctxt *Link, s *Symbol, v uint8) int64 {
|
2016-04-07 18:00:57 +03:00
|
|
|
off := s.Size
|
|
|
|
|
if s.Type == 0 {
|
|
|
|
|
s.Type = obj.SDATA
|
|
|
|
|
}
|
|
|
|
|
s.Attr |= AttrReachable
|
|
|
|
|
s.Size++
|
|
|
|
|
s.P = append(s.P, v)
|
|
|
|
|
|
|
|
|
|
return off
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Adduint16(ctxt *Link, s *Symbol, v uint16) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return adduintxx(ctxt, s, uint64(v), 2)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Adduint32(ctxt *Link, s *Symbol, v uint32) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return adduintxx(ctxt, s, uint64(v), 4)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Adduint64(ctxt *Link, s *Symbol, v uint64) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return adduintxx(ctxt, s, v, 8)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func adduint(ctxt *Link, s *Symbol, v uint64) int64 {
|
2016-04-06 12:01:40 -07:00
|
|
|
return adduintxx(ctxt, s, v, SysArch.IntSize)
|
2015-03-16 11:53:08 +13:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func setuint8(ctxt *Link, s *Symbol, r int64, v uint8) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return setuintxx(ctxt, s, r, uint64(v), 1)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func setuint32(ctxt *Link, s *Symbol, r int64, v uint32) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return setuintxx(ctxt, s, r, uint64(v), 4)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Addaddrplus(ctxt *Link, s *Symbol, t *Symbol, add int64) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2015-03-02 12:35:15 -05:00
|
|
|
i := s.Size
|
2016-04-06 12:01:40 -07:00
|
|
|
s.Size += int64(ctxt.Arch.PtrSize)
|
2015-02-27 22:57:28 -05:00
|
|
|
Symgrow(ctxt, s, s.Size)
|
2015-03-02 12:35:15 -05:00
|
|
|
r := Addrel(s)
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Sym = t
|
|
|
|
|
r.Off = int32(i)
|
2016-04-06 12:01:40 -07:00
|
|
|
r.Siz = uint8(ctxt.Arch.PtrSize)
|
2015-04-19 19:33:58 -07:00
|
|
|
r.Type = obj.R_ADDR
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Add = add
|
|
|
|
|
return i + int64(r.Siz)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Addpcrelplus(ctxt *Link, s *Symbol, t *Symbol, add int64) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2015-03-02 12:35:15 -05:00
|
|
|
i := s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Size += 4
|
|
|
|
|
Symgrow(ctxt, s, s.Size)
|
2015-03-02 12:35:15 -05:00
|
|
|
r := Addrel(s)
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Sym = t
|
|
|
|
|
r.Off = int32(i)
|
|
|
|
|
r.Add = add
|
2015-04-19 19:33:58 -07:00
|
|
|
r.Type = obj.R_PCREL
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Siz = 4
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family == sys.S390X {
|
2016-03-18 16:57:54 -04:00
|
|
|
r.Variant = RV_390_DBL
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
return i + int64(r.Siz)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Addaddr(ctxt *Link, s *Symbol, t *Symbol) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return Addaddrplus(ctxt, s, t, 0)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func setaddrplus(ctxt *Link, s *Symbol, off int64, t *Symbol, add int64) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2016-04-06 12:01:40 -07:00
|
|
|
if off+int64(ctxt.Arch.PtrSize) > s.Size {
|
|
|
|
|
s.Size = off + int64(ctxt.Arch.PtrSize)
|
2015-02-27 22:57:28 -05:00
|
|
|
Symgrow(ctxt, s, s.Size)
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
r := Addrel(s)
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Sym = t
|
|
|
|
|
r.Off = int32(off)
|
2016-04-06 12:01:40 -07:00
|
|
|
r.Siz = uint8(ctxt.Arch.PtrSize)
|
2015-04-19 19:33:58 -07:00
|
|
|
r.Type = obj.R_ADDR
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Add = add
|
|
|
|
|
return off + int64(r.Siz)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func setaddr(ctxt *Link, s *Symbol, off int64, t *Symbol) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return setaddrplus(ctxt, s, off, t, 0)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func addsize(ctxt *Link, s *Symbol, t *Symbol) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2015-03-02 12:35:15 -05:00
|
|
|
i := s.Size
|
2016-04-06 12:01:40 -07:00
|
|
|
s.Size += int64(ctxt.Arch.PtrSize)
|
2015-02-27 22:57:28 -05:00
|
|
|
Symgrow(ctxt, s, s.Size)
|
2015-03-02 12:35:15 -05:00
|
|
|
r := Addrel(s)
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Sym = t
|
|
|
|
|
r.Off = int32(i)
|
2016-04-06 12:01:40 -07:00
|
|
|
r.Siz = uint8(ctxt.Arch.PtrSize)
|
2015-04-19 19:33:58 -07:00
|
|
|
r.Type = obj.R_SIZE
|
2015-02-27 22:57:28 -05:00
|
|
|
return i + int64(r.Siz)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func addaddrplus4(ctxt *Link, s *Symbol, t *Symbol, add int64) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2015-03-02 12:35:15 -05:00
|
|
|
i := s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Size += 4
|
|
|
|
|
Symgrow(ctxt, s, s.Size)
|
2015-03-02 12:35:15 -05:00
|
|
|
r := Addrel(s)
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Sym = t
|
|
|
|
|
r.Off = int32(i)
|
|
|
|
|
r.Siz = 4
|
2015-04-19 19:33:58 -07:00
|
|
|
r.Type = obj.R_ADDR
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Add = add
|
|
|
|
|
return i + int64(r.Siz)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Divide-and-conquer (merge) sort of linked lists
 * of Symbol structures.
 * Used for the data block.
 */
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func listsubp(s *Symbol) **Symbol {
|
2015-02-27 22:57:28 -05:00
|
|
|
return &s.Sub
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func listsort(l *Symbol, cmp func(*Symbol, *Symbol) int, nextp func(*Symbol) **Symbol) *Symbol {
|
2015-02-27 22:57:28 -05:00
|
|
|
if l == nil || *nextp(l) == nil {
|
|
|
|
|
return l
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
l1 := l
|
|
|
|
|
l2 := l
|
2015-02-27 22:57:28 -05:00
|
|
|
for {
|
|
|
|
|
l2 = *nextp(l2)
|
|
|
|
|
if l2 == nil {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
l2 = *nextp(l2)
|
|
|
|
|
if l2 == nil {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
l1 = *nextp(l1)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
l2 = *nextp(l1)
|
|
|
|
|
*nextp(l1) = nil
|
|
|
|
|
l1 = listsort(l, cmp, nextp)
|
|
|
|
|
l2 = listsort(l2, cmp, nextp)
|
|
|
|
|
|
|
|
|
|
/* set up lead element */
|
|
|
|
|
if cmp(l1, l2) < 0 {
|
|
|
|
|
l = l1
|
|
|
|
|
l1 = *nextp(l1)
|
|
|
|
|
} else {
|
|
|
|
|
l = l2
|
|
|
|
|
l2 = *nextp(l2)
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
le := l
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
for {
|
|
|
|
|
if l1 == nil {
|
|
|
|
|
for l2 != nil {
|
|
|
|
|
*nextp(le) = l2
|
|
|
|
|
le = l2
|
|
|
|
|
l2 = *nextp(l2)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*nextp(le) = nil
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if l2 == nil {
|
|
|
|
|
for l1 != nil {
|
|
|
|
|
*nextp(le) = l1
|
|
|
|
|
le = l1
|
|
|
|
|
l1 = *nextp(l1)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if cmp(l1, l2) < 0 {
|
|
|
|
|
*nextp(le) = l1
|
|
|
|
|
le = l1
|
|
|
|
|
l1 = *nextp(l1)
|
|
|
|
|
} else {
|
|
|
|
|
*nextp(le) = l2
|
|
|
|
|
le = l2
|
|
|
|
|
l2 = *nextp(l2)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*nextp(le) = nil
|
|
|
|
|
return l
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func relocsym(ctxt *Link, s *Symbol) {
|
2015-02-27 22:57:28 -05:00
|
|
|
var r *Reloc
|
2016-08-19 11:35:54 -04:00
|
|
|
var rs *Symbol
|
2015-02-27 22:57:28 -05:00
|
|
|
var i16 int16
|
|
|
|
|
var off int32
|
|
|
|
|
var siz int32
|
|
|
|
|
var fl int32
|
|
|
|
|
var o int64
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Cursym = s
|
2015-03-02 12:35:15 -05:00
|
|
|
for ri := int32(0); ri < int32(len(s.R)); ri++ {
|
2015-02-27 22:57:28 -05:00
|
|
|
r = &s.R[ri]
|
|
|
|
|
r.Done = 1
|
|
|
|
|
off = r.Off
|
|
|
|
|
siz = int32(r.Siz)
|
|
|
|
|
if off < 0 || off+siz > int32(len(s.P)) {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("%s: invalid relocation %d+%d not in [%d,%d)", s.Name, off, siz, 0, len(s.P))
|
2015-02-27 22:57:28 -05:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
2015-04-19 19:33:58 -07:00
|
|
|
if r.Sym != nil && (r.Sym.Type&(obj.SMASK|obj.SHIDDEN) == 0 || r.Sym.Type&obj.SMASK == obj.SXREF) {
|
2015-03-30 02:59:10 +00:00
|
|
|
// When putting the runtime but not main into a shared library
|
|
|
|
|
// these symbols are undefined and that's OK.
|
2016-07-28 13:04:41 -04:00
|
|
|
if Buildmode == BuildmodeShared {
|
|
|
|
|
if r.Sym.Name == "main.main" || r.Sym.Name == "main.init" {
|
|
|
|
|
r.Sym.Type = obj.SDYNIMPORT
|
|
|
|
|
} else if strings.HasPrefix(r.Sym.Name, "go.info.") {
|
|
|
|
|
// Skip go.info symbols. They are only needed to communicate
|
|
|
|
|
// DWARF info between the compiler and linker.
|
|
|
|
|
continue
|
|
|
|
|
}
|
2015-03-30 02:59:10 +00:00
|
|
|
} else {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("%s: not defined", r.Sym.Name)
|
2015-03-30 02:59:10 +00:00
|
|
|
continue
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if r.Type >= 256 {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if r.Siz == 0 { // informational relocation - no work to do
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-30 02:59:10 +00:00
|
|
|
// We need to be able to reference dynimport symbols when linking against
|
|
|
|
|
// shared libraries, and Solaris needs it always
|
2015-04-19 19:33:58 -07:00
|
|
|
if HEADTYPE != obj.Hsolaris && r.Sym != nil && r.Sym.Type == obj.SDYNIMPORT && !DynlinkingGo() {
|
2016-04-06 12:01:40 -07:00
|
|
|
if !(SysArch.Family == sys.PPC64 && Linkmode == LinkExternal && r.Sym.Name == ".TOC.") {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("unhandled relocation for %s (type %d rtype %d)", r.Sym.Name, r.Sym.Type, r.Type)
|
cmd/compile, cmd/link, runtime: on ppc64x, maintain the TOC pointer in R2 when compiling PIC
The PowerPC ISA does not have a PC-relative load instruction, which poses
obvious challenges when generating position-independent code. The way the ELFv2
ABI addresses this is to specify that r2 points to a per "module" (shared
library or executable) TOC pointer. Maintaining this pointer requires
cooperation between codegen and the system linker:
* Non-leaf functions leave space on the stack at r1+24 to save the TOC pointer.
* A call to a function that *might* have to go via a PLT stub must be followed
by a nop instruction that the system linker can replace with "ld r1, 24(r1)"
to restore the TOC pointer (only when dynamically linking Go code).
* When calling a function via a function pointer, the address of the function
must be in r12, and the first couple of instructions (the "global entry
point") of the called function use this to derive the address of the TOC
for the module it is in.
* When calling a function that is implemented in the same module, the system
linker adjusts the call to skip over the instructions mentioned above (the
"local entry point"), assuming that r2 is already correctly set.
So this changeset adds the global entry point instructions, sets the metadata so
the system linker knows where the local entry point is, inserts code to save the
TOC pointer at 24(r1), adds a nop after any call not known to be local and copes
with the odd non-local code transfer in the runtime (e.g. the stuff around
jmpdefer). It does not actually compile PIC yet.
Change-Id: I7522e22bdfd2f891745a900c60254fe9e372c854
Reviewed-on: https://go-review.googlesource.com/15967
Reviewed-by: Russ Cox <rsc@golang.org>
2015-10-16 15:42:09 +13:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
if r.Sym != nil && r.Sym.Type != obj.STLSBSS && !r.Sym.Attr.Reachable() {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("unreachable sym in relocation: %s %s", s.Name, r.Sym.Name)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-03-18 16:57:54 -04:00
|
|
|
// TODO(mundaym): remove this special case - see issue 14218.
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family == sys.S390X {
|
2016-03-18 16:57:54 -04:00
|
|
|
switch r.Type {
|
|
|
|
|
case obj.R_PCRELDBL:
|
|
|
|
|
r.Type = obj.R_PCREL
|
|
|
|
|
r.Variant = RV_390_DBL
|
|
|
|
|
case obj.R_CALL:
|
|
|
|
|
r.Variant = RV_390_DBL
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-02-27 22:57:28 -05:00
|
|
|
switch r.Type {
|
|
|
|
|
default:
|
2015-08-03 14:08:17 +12:00
|
|
|
switch siz {
|
|
|
|
|
default:
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("bad reloc size %#x for %s", uint32(siz), r.Sym.Name)
|
2015-08-03 14:08:17 +12:00
|
|
|
case 1:
|
|
|
|
|
o = int64(s.P[off])
|
|
|
|
|
case 2:
|
2016-08-19 22:40:38 -04:00
|
|
|
o = int64(ctxt.Arch.ByteOrder.Uint16(s.P[off:]))
|
2015-08-03 14:08:17 +12:00
|
|
|
case 4:
|
2016-08-19 22:40:38 -04:00
|
|
|
o = int64(ctxt.Arch.ByteOrder.Uint32(s.P[off:]))
|
2015-08-03 14:08:17 +12:00
|
|
|
case 8:
|
2016-08-19 22:40:38 -04:00
|
|
|
o = int64(ctxt.Arch.ByteOrder.Uint64(s.P[off:]))
|
2015-08-03 14:08:17 +12:00
|
|
|
}
|
2016-08-21 13:52:23 -04:00
|
|
|
if Thearch.Archreloc(ctxt, r, s, &o) < 0 {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("unknown reloc %d", r.Type)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2015-04-19 19:33:58 -07:00
|
|
|
case obj.R_TLS_LE:
|
2016-04-06 12:01:40 -07:00
|
|
|
isAndroidX86 := goos == "android" && (SysArch.InFamily(sys.AMD64, sys.I386))
|
2015-10-16 14:04:29 -04:00
|
|
|
|
|
|
|
|
if Linkmode == LinkExternal && Iself && HEADTYPE != obj.Hopenbsd && !isAndroidX86 {
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Done = 0
|
2015-09-02 10:35:54 +12:00
|
|
|
if r.Sym == nil {
|
2016-08-19 22:40:38 -04:00
|
|
|
r.Sym = ctxt.Tlsg
|
2015-09-02 10:35:54 +12:00
|
|
|
}
|
|
|
|
|
r.Xsym = r.Sym
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Xadd = r.Add
|
|
|
|
|
o = 0
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family != sys.AMD64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
o = r.Add
|
|
|
|
|
}
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-06 12:01:40 -07:00
|
|
|
if Iself && SysArch.Family == sys.ARM {
|
2015-09-02 10:35:54 +12:00
|
|
|
// On ELF ARM, the thread pointer is 8 bytes before
|
|
|
|
|
// the start of the thread-local data block, so add 8
|
|
|
|
|
// to the actual TLS offset (r->sym->value).
|
|
|
|
|
// This 8 seems to be a fundamental constant of
|
|
|
|
|
// ELF on ARM (or maybe Glibc on ARM); it is not
|
|
|
|
|
// related to the fact that our own TLS storage happens
|
|
|
|
|
// to take up 8 bytes.
|
|
|
|
|
o = 8 + r.Sym.Value
|
2016-08-19 22:40:38 -04:00
|
|
|
} else if Iself || ctxt.Headtype == obj.Hplan9 || ctxt.Headtype == obj.Hdarwin || isAndroidX86 {
|
|
|
|
|
o = int64(ctxt.Tlsoffset) + r.Add
|
|
|
|
|
} else if ctxt.Headtype == obj.Hwindows {
|
2015-04-23 21:53:48 +12:00
|
|
|
o = r.Add
|
|
|
|
|
} else {
|
2016-08-19 22:40:38 -04:00
|
|
|
log.Fatalf("unexpected R_TLS_LE relocation for %s", Headstr(ctxt.Headtype))
|
2015-04-23 21:53:48 +12:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2015-04-19 19:33:58 -07:00
|
|
|
case obj.R_TLS_IE:
|
2016-04-06 12:01:40 -07:00
|
|
|
isAndroidX86 := goos == "android" && (SysArch.InFamily(sys.AMD64, sys.I386))
|
2015-10-16 14:04:29 -04:00
|
|
|
|
|
|
|
|
if Linkmode == LinkExternal && Iself && HEADTYPE != obj.Hopenbsd && !isAndroidX86 {
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Done = 0
|
2015-09-02 10:35:54 +12:00
|
|
|
if r.Sym == nil {
|
2016-08-19 22:40:38 -04:00
|
|
|
r.Sym = ctxt.Tlsg
|
2015-09-02 10:35:54 +12:00
|
|
|
}
|
|
|
|
|
r.Xsym = r.Sym
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Xadd = r.Add
|
|
|
|
|
o = 0
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family != sys.AMD64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
o = r.Add
|
|
|
|
|
}
|
|
|
|
|
break
|
|
|
|
|
}
|
2015-04-23 21:53:48 +12:00
|
|
|
log.Fatalf("cannot handle R_TLS_IE when linking internally")
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2015-04-19 19:33:58 -07:00
|
|
|
case obj.R_ADDR:
|
|
|
|
|
if Linkmode == LinkExternal && r.Sym.Type != obj.SCONST {
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Done = 0
|
|
|
|
|
|
|
|
|
|
// set up addend for eventual relocation via outer symbol.
|
|
|
|
|
rs = r.Sym
|
|
|
|
|
|
|
|
|
|
r.Xadd = r.Add
|
|
|
|
|
for rs.Outer != nil {
|
2016-08-19 22:40:38 -04:00
|
|
|
r.Xadd += Symaddr(ctxt, rs) - Symaddr(ctxt, rs.Outer)
|
2015-02-27 22:57:28 -05:00
|
|
|
rs = rs.Outer
|
|
|
|
|
}
|
|
|
|
|
|
2015-04-19 19:33:58 -07:00
|
|
|
if rs.Type != obj.SHOSTOBJ && rs.Type != obj.SDYNIMPORT && rs.Sect == nil {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("missing section for %s", rs.Name)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
r.Xsym = rs
|
|
|
|
|
|
|
|
|
|
o = r.Xadd
|
|
|
|
|
if Iself {
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family == sys.AMD64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
o = 0
|
|
|
|
|
}
|
2015-04-19 19:33:58 -07:00
|
|
|
} else if HEADTYPE == obj.Hdarwin {
|
2015-04-10 21:28:09 -04:00
|
|
|
// ld64 for arm64 has a bug where if the address pointed to by o exists in the
|
|
|
|
|
// symbol table (dynid >= 0), or is inside a symbol that exists in the symbol
|
|
|
|
|
// table, then it will add o twice into the relocated value.
|
|
|
|
|
// The workaround is that on arm64 don't ever add symaddr to o and always use
|
|
|
|
|
// extern relocation by requiring rs->dynid >= 0.
|
2015-04-19 19:33:58 -07:00
|
|
|
if rs.Type != obj.SHOSTOBJ {
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family == sys.ARM64 && rs.Dynid < 0 {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("R_ADDR reloc to %s+%d is not supported on darwin/arm64", rs.Name, o)
|
2015-04-10 21:28:09 -04:00
|
|
|
}
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family != sys.ARM64 {
|
2016-08-19 22:40:38 -04:00
|
|
|
o += Symaddr(ctxt, rs)
|
2015-04-10 21:28:09 -04:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2015-04-19 19:33:58 -07:00
|
|
|
} else if HEADTYPE == obj.Hwindows {
|
2015-03-09 03:05:40 -04:00
|
|
|
// nothing to do
|
2015-02-27 22:57:28 -05:00
|
|
|
} else {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("unhandled pcrel relocation for %s", headstring)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
o = Symaddr(ctxt, r.Sym) + r.Add
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
// On amd64, 4-byte offsets will be sign-extended, so it is impossible to
|
|
|
|
|
// access more than 2GB of static data; fail at link time is better than
|
2015-07-10 17:17:11 -06:00
|
|
|
// fail at runtime. See https://golang.org/issue/7980.
|
2015-02-27 22:57:28 -05:00
|
|
|
// Instead of special casing only amd64, we treat this as an error on all
|
|
|
|
|
// 64-bit architectures so as to be future-proof.
|
2016-04-06 12:01:40 -07:00
|
|
|
if int32(o) < 0 && SysArch.PtrSize > 4 && siz == 4 {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("non-pc-relative relocation address is too big: %#x (%#x + %#x)", uint64(o), Symaddr(ctxt, r.Sym), r.Add)
|
2015-04-09 07:37:17 -04:00
|
|
|
errorexit()
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-03-14 09:23:04 -07:00
|
|
|
case obj.R_DWARFREF:
|
|
|
|
|
if r.Sym.Sect == nil {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("missing DWARF section: %s from %s", r.Sym.Name, s.Name)
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
|
|
|
|
if Linkmode == LinkExternal {
|
|
|
|
|
r.Done = 0
|
|
|
|
|
r.Type = obj.R_ADDR
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
r.Xsym = Linkrlookup(ctxt, r.Sym.Sect.Name, 0)
|
|
|
|
|
r.Xadd = r.Add + Symaddr(ctxt, r.Sym) - int64(r.Sym.Sect.Vaddr)
|
2016-03-14 09:23:04 -07:00
|
|
|
o = r.Xadd
|
|
|
|
|
rs = r.Xsym
|
2016-04-06 12:01:40 -07:00
|
|
|
if Iself && SysArch.Family == sys.AMD64 {
|
2016-03-14 09:23:04 -07:00
|
|
|
o = 0
|
|
|
|
|
}
|
|
|
|
|
break
|
|
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
o = Symaddr(ctxt, r.Sym) + r.Add - int64(r.Sym.Sect.Vaddr)
|
2016-03-14 09:23:04 -07:00
|
|
|
|
2016-03-27 10:21:48 -04:00
|
|
|
case obj.R_ADDROFF:
|
2016-08-19 22:40:38 -04:00
|
|
|
o = Symaddr(ctxt, r.Sym) - int64(r.Sym.Sect.Vaddr) + r.Add
|
2016-03-27 10:21:48 -04:00
|
|
|
|
2015-02-27 22:57:28 -05:00
|
|
|
// r->sym can be null when CALL $(constant) is transformed from absolute PC to relative PC call.
|
2015-04-19 19:33:58 -07:00
|
|
|
case obj.R_CALL, obj.R_GOTPCREL, obj.R_PCREL:
|
2016-08-19 22:40:38 -04:00
|
|
|
if Linkmode == LinkExternal && r.Sym != nil && r.Sym.Type != obj.SCONST && (r.Sym.Sect != ctxt.Cursym.Sect || r.Type == obj.R_GOTPCREL) {
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Done = 0
|
|
|
|
|
|
|
|
|
|
// set up addend for eventual relocation via outer symbol.
|
|
|
|
|
rs = r.Sym
|
|
|
|
|
|
|
|
|
|
r.Xadd = r.Add
|
|
|
|
|
for rs.Outer != nil {
|
2016-08-19 22:40:38 -04:00
|
|
|
r.Xadd += Symaddr(ctxt, rs) - Symaddr(ctxt, rs.Outer)
|
2015-02-27 22:57:28 -05:00
|
|
|
rs = rs.Outer
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
r.Xadd -= int64(r.Siz) // relative to address after the relocated chunk
|
2015-04-19 19:33:58 -07:00
|
|
|
if rs.Type != obj.SHOSTOBJ && rs.Type != obj.SDYNIMPORT && rs.Sect == nil {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("missing section for %s", rs.Name)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
r.Xsym = rs
|
|
|
|
|
|
|
|
|
|
o = r.Xadd
|
|
|
|
|
if Iself {
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family == sys.AMD64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
o = 0
|
|
|
|
|
}
|
2015-04-19 19:33:58 -07:00
|
|
|
} else if HEADTYPE == obj.Hdarwin {
|
|
|
|
|
if r.Type == obj.R_CALL {
|
|
|
|
|
if rs.Type != obj.SHOSTOBJ {
|
2016-08-19 22:40:38 -04:00
|
|
|
o += int64(uint64(Symaddr(ctxt, rs)) - rs.Sect.Vaddr)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
o -= int64(r.Off) // relative to section offset, not symbol
|
2016-04-26 15:17:56 -04:00
|
|
|
} else if SysArch.Family == sys.ARM {
|
|
|
|
|
// see ../arm/asm.go:/machoreloc1
|
2016-08-19 22:40:38 -04:00
|
|
|
o += Symaddr(ctxt, rs) - int64(ctxt.Cursym.Value) - int64(r.Off)
|
2015-02-27 22:57:28 -05:00
|
|
|
} else {
|
|
|
|
|
o += int64(r.Siz)
|
|
|
|
|
}
|
2016-04-06 12:01:40 -07:00
|
|
|
} else if HEADTYPE == obj.Hwindows && SysArch.Family == sys.AMD64 { // only amd64 needs PCREL
|
2015-03-13 22:10:48 -04:00
|
|
|
// PE/COFF's PC32 relocation uses the address after the relocated
|
|
|
|
|
// bytes as the base. Compensate by skewing the addend.
|
|
|
|
|
o += int64(r.Siz)
|
|
|
|
|
// GNU ld always add VirtualAddress of the .text section to the
|
|
|
|
|
// relocated address, compensate that.
|
2015-05-27 12:04:25 +12:00
|
|
|
o -= int64(s.Sect.Vaddr - PEBASE)
|
2015-02-27 22:57:28 -05:00
|
|
|
} else {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("unhandled pcrel relocation for %s", headstring)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
o = 0
|
|
|
|
|
if r.Sym != nil {
|
2016-08-19 22:40:38 -04:00
|
|
|
o += Symaddr(ctxt, r.Sym)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// NOTE: The (int32) cast on the next line works around a bug in Plan 9's 8c
|
|
|
|
|
// compiler. The expression s->value + r->off + r->siz is int32 + int32 +
|
|
|
|
|
// uchar, and Plan 9 8c incorrectly treats the expression as type uint32
|
|
|
|
|
// instead of int32, causing incorrect values when sign extended for adding
|
|
|
|
|
// to o. The bug only occurs on Plan 9, because this C program is compiled by
|
|
|
|
|
// the standard host compiler (gcc on most other systems).
|
|
|
|
|
o += r.Add - (s.Value + int64(r.Off) + int64(int32(r.Siz)))
|
|
|
|
|
|
2015-04-19 19:33:58 -07:00
|
|
|
case obj.R_SIZE:
|
2015-02-27 22:57:28 -05:00
|
|
|
o = r.Sym.Size + r.Add
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if r.Variant != RV_NONE {
|
2016-08-21 13:52:23 -04:00
|
|
|
o = Thearch.Archrelocvariant(ctxt, r, s, o)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2015-03-01 13:32:49 -05:00
|
|
|
if false {
|
|
|
|
|
nam := "<nil>"
|
|
|
|
|
if r.Sym != nil {
|
|
|
|
|
nam = r.Sym.Name
|
|
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
fmt.Printf("relocate %s %#x (%#x+%#x, size %d) => %s %#x +%#x [type %d/%d, %x]\n", s.Name, s.Value+int64(off), s.Value, r.Off, r.Siz, nam, Symaddr(ctxt, r.Sym), r.Add, r.Type, r.Variant, o)
|
2015-03-01 13:32:49 -05:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
switch siz {
|
|
|
|
|
default:
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Cursym = s
|
|
|
|
|
ctxt.Diag("bad reloc size %#x for %s", uint32(siz), r.Sym.Name)
|
2015-02-27 22:57:28 -05:00
|
|
|
fallthrough
|
|
|
|
|
|
|
|
|
|
// TODO(rsc): Remove.
|
|
|
|
|
case 1:
|
|
|
|
|
s.P[off] = byte(int8(o))
|
|
|
|
|
|
|
|
|
|
case 2:
|
|
|
|
|
if o != int64(int16(o)) {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("relocation address is too big: %#x", o)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
i16 = int16(o)
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Arch.ByteOrder.PutUint16(s.P[off:], uint16(i16))
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
case 4:
|
2015-04-19 19:33:58 -07:00
|
|
|
if r.Type == obj.R_PCREL || r.Type == obj.R_CALL {
|
2015-02-27 22:57:28 -05:00
|
|
|
if o != int64(int32(o)) {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("pc-relative relocation address is too big: %#x", o)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
if o != int64(int32(o)) && o != int64(uint32(o)) {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("non-pc-relative relocation address is too big: %#x", uint64(o))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fl = int32(o)
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Arch.ByteOrder.PutUint32(s.P[off:], uint32(fl))
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
case 8:
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Arch.ByteOrder.PutUint64(s.P[off:], uint64(o))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func (ctxt *Link) reloc() {
|
2015-02-27 22:57:28 -05:00
|
|
|
if Debug['v'] != 0 {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "%5.2f reloc\n", obj.Cputime())
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-21 13:52:23 -04:00
|
|
|
ctxt.Bso.Flush()
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
for _, s := range ctxt.Textp {
|
|
|
|
|
relocsym(ctxt, s)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, sym := range datap {
|
2016-08-19 22:40:38 -04:00
|
|
|
relocsym(ctxt, sym)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-22 10:31:14 +12:00
|
|
|
for _, s := range dwarfp {
|
2016-08-19 22:40:38 -04:00
|
|
|
relocsym(ctxt, s)
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func dynrelocsym(ctxt *Link, s *Symbol) {
|
2015-04-19 19:33:58 -07:00
|
|
|
if HEADTYPE == obj.Hwindows && Linkmode != LinkExternal {
|
2016-08-19 22:40:38 -04:00
|
|
|
rel := Linklookup(ctxt, ".rel", 0)
|
2015-02-27 22:57:28 -05:00
|
|
|
if s == rel {
|
|
|
|
|
return
|
|
|
|
|
}
|
2015-03-02 12:35:15 -05:00
|
|
|
for ri := 0; ri < len(s.R); ri++ {
|
2016-04-20 10:36:49 -04:00
|
|
|
r := &s.R[ri]
|
|
|
|
|
targ := r.Sym
|
2015-02-27 22:57:28 -05:00
|
|
|
if targ == nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
if !targ.Attr.Reachable() {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("internal inconsistency: dynamic symbol %s is not reachable.", targ.Name)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
if r.Sym.Plt == -2 && r.Sym.Got != -2 { // make dynimport JMP table for PE object files.
|
|
|
|
|
targ.Plt = int32(rel.Size)
|
|
|
|
|
r.Sym = rel
|
|
|
|
|
r.Add = int64(targ.Plt)
|
|
|
|
|
|
|
|
|
|
// jmp *addr
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family == sys.I386 {
|
2016-08-19 22:40:38 -04:00
|
|
|
Adduint8(ctxt, rel, 0xff)
|
|
|
|
|
Adduint8(ctxt, rel, 0x25)
|
|
|
|
|
Addaddr(ctxt, rel, targ)
|
|
|
|
|
Adduint8(ctxt, rel, 0x90)
|
|
|
|
|
Adduint8(ctxt, rel, 0x90)
|
2015-02-27 22:57:28 -05:00
|
|
|
} else {
|
2016-08-19 22:40:38 -04:00
|
|
|
Adduint8(ctxt, rel, 0xff)
|
|
|
|
|
Adduint8(ctxt, rel, 0x24)
|
|
|
|
|
Adduint8(ctxt, rel, 0x25)
|
|
|
|
|
addaddrplus4(ctxt, rel, targ, 0)
|
|
|
|
|
Adduint8(ctxt, rel, 0x90)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
} else if r.Sym.Plt >= 0 {
|
|
|
|
|
r.Sym = rel
|
|
|
|
|
r.Add = int64(targ.Plt)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
for ri := 0; ri < len(s.R); ri++ {
|
2016-04-20 10:36:49 -04:00
|
|
|
r := &s.R[ri]
|
2015-04-19 19:33:58 -07:00
|
|
|
if r.Sym != nil && r.Sym.Type == obj.SDYNIMPORT || r.Type >= 256 {
|
2016-03-02 07:59:49 -05:00
|
|
|
if r.Sym != nil && !r.Sym.Attr.Reachable() {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("internal inconsistency: dynamic symbol %s is not reachable.", r.Sym.Name)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-21 13:52:23 -04:00
|
|
|
Thearch.Adddynrel(ctxt, s, r)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func dynreloc(ctxt *Link, data *[obj.SXREF][]*Symbol) {
|
2015-02-27 22:57:28 -05:00
|
|
|
// -d suppresses dynamic loader format, so we may as well not
|
|
|
|
|
// compute these sections or mark their symbols as reachable.
|
2015-04-19 19:33:58 -07:00
|
|
|
if Debug['d'] != 0 && HEADTYPE != obj.Hwindows {
|
2015-02-27 22:57:28 -05:00
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
if Debug['v'] != 0 {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "%5.2f reloc\n", obj.Cputime())
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-21 13:52:23 -04:00
|
|
|
ctxt.Bso.Flush()
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
for _, s := range ctxt.Textp {
|
|
|
|
|
dynrelocsym(ctxt, s)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, syms := range data {
|
|
|
|
|
for _, sym := range syms {
|
2016-08-19 22:40:38 -04:00
|
|
|
dynrelocsym(ctxt, sym)
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
if Iself {
|
2016-08-19 22:40:38 -04:00
|
|
|
elfdynhash(ctxt)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func Codeblk(ctxt *Link, addr int64, size int64) {
|
|
|
|
|
CodeblkPad(ctxt, addr, size, zeros[:])
|
2016-06-11 17:12:28 -07:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
func CodeblkPad(ctxt *Link, addr int64, size int64, pad []byte) {
|
2015-02-27 22:57:28 -05:00
|
|
|
if Debug['a'] != 0 {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "codeblk [%#x,%#x) at offset %#x\n", addr, addr+size, Cpos())
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-04-22 10:31:14 +12:00
|
|
|
blk(ctxt, ctxt.Textp, addr, size, pad)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
/* again for printing */
|
|
|
|
|
if Debug['a'] == 0 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
syms := ctxt.Textp
|
2016-04-22 09:38:41 +12:00
|
|
|
for i, sym := range syms {
|
2016-03-02 07:59:49 -05:00
|
|
|
if !sym.Attr.Reachable() {
|
2015-02-27 22:57:28 -05:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if sym.Value >= addr {
|
2016-04-22 09:38:41 +12:00
|
|
|
syms = syms[i:]
|
2015-02-27 22:57:28 -05:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
eaddr := addr + size
|
|
|
|
|
var q []byte
|
2016-04-22 09:38:41 +12:00
|
|
|
for _, sym := range syms {
|
2016-03-02 07:59:49 -05:00
|
|
|
if !sym.Attr.Reachable() {
|
2015-02-27 22:57:28 -05:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if sym.Value >= eaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if addr < sym.Value {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "%-20s %.8x|", "_", uint64(addr))
|
2015-02-27 22:57:28 -05:00
|
|
|
for ; addr < sym.Value; addr++ {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, " %.2x", 0)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "\n")
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "%.6x\t%-20s\n", uint64(addr), sym.Name)
|
2015-02-27 22:57:28 -05:00
|
|
|
q = sym.P
|
|
|
|
|
|
2015-08-11 10:59:24 -04:00
|
|
|
for len(q) >= 16 {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "%.6x\t% x\n", uint64(addr), q[:16])
|
2015-02-27 22:57:28 -05:00
|
|
|
addr += 16
|
|
|
|
|
q = q[16:]
|
|
|
|
|
}
|
|
|
|
|
|
2015-08-11 10:59:24 -04:00
|
|
|
if len(q) > 0 {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "%.6x\t% x\n", uint64(addr), q)
|
2015-08-11 10:59:24 -04:00
|
|
|
addr += int64(len(q))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if addr < eaddr {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "%-20s %.8x|", "_", uint64(addr))
|
2015-02-27 22:57:28 -05:00
|
|
|
for ; addr < eaddr; addr++ {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, " %.2x", 0)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-21 13:52:23 -04:00
|
|
|
ctxt.Bso.Flush()
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-04-22 10:31:14 +12:00
|
|
|
func blk(ctxt *Link, syms []*Symbol, addr, size int64, pad []byte) {
|
2016-04-18 14:50:14 -04:00
|
|
|
for i, s := range syms {
|
|
|
|
|
if s.Type&obj.SSUB == 0 && s.Value >= addr {
|
|
|
|
|
syms = syms[i:]
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
eaddr := addr + size
|
|
|
|
|
for _, s := range syms {
|
|
|
|
|
if s.Type&obj.SSUB != 0 {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if s.Value >= eaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Cursym = s
|
2016-04-18 14:50:14 -04:00
|
|
|
if s.Value < addr {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("phase error: addr=%#x but sym=%#x type=%d", addr, s.Value, s.Type)
|
2016-04-18 14:50:14 -04:00
|
|
|
errorexit()
|
|
|
|
|
}
|
|
|
|
|
if addr < s.Value {
|
2016-06-11 17:12:28 -07:00
|
|
|
strnputPad("", int(s.Value-addr), pad)
|
2016-04-18 14:50:14 -04:00
|
|
|
addr = s.Value
|
|
|
|
|
}
|
|
|
|
|
Cwrite(s.P)
|
|
|
|
|
addr += int64(len(s.P))
|
|
|
|
|
if addr < s.Value+s.Size {
|
2016-06-11 17:12:28 -07:00
|
|
|
strnputPad("", int(s.Value+s.Size-addr), pad)
|
2016-04-18 14:50:14 -04:00
|
|
|
addr = s.Value + s.Size
|
|
|
|
|
}
|
|
|
|
|
if addr != s.Value+s.Size {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("phase error: addr=%#x value+size=%#x", addr, s.Value+s.Size)
|
2016-04-18 14:50:14 -04:00
|
|
|
errorexit()
|
|
|
|
|
}
|
|
|
|
|
if s.Value+s.Size >= eaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if addr < eaddr {
|
2016-06-11 17:12:28 -07:00
|
|
|
strnputPad("", int(eaddr-addr), pad)
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
Cflush()
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func Datblk(ctxt *Link, addr int64, size int64) {
|
2015-02-27 22:57:28 -05:00
|
|
|
if Debug['a'] != 0 {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "datblk [%#x,%#x) at offset %#x\n", addr, addr+size, Cpos())
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-04-22 10:31:14 +12:00
|
|
|
blk(ctxt, datap, addr, size, zeros[:])
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
/* again for printing */
|
|
|
|
|
if Debug['a'] == 0 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
syms := datap
|
|
|
|
|
for i, sym := range syms {
|
2015-02-27 22:57:28 -05:00
|
|
|
if sym.Value >= addr {
|
2016-04-18 14:50:14 -04:00
|
|
|
syms = syms[i:]
|
2015-02-27 22:57:28 -05:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
eaddr := addr + size
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, sym := range syms {
|
2015-02-27 22:57:28 -05:00
|
|
|
if sym.Value >= eaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
if addr < sym.Value {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "\t%.8x| 00 ...\n", uint64(addr))
|
2015-02-27 22:57:28 -05:00
|
|
|
addr = sym.Value
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "%s\n\t%.8x|", sym.Name, uint64(addr))
|
2016-04-20 10:36:49 -04:00
|
|
|
for i, b := range sym.P {
|
|
|
|
|
if i > 0 && i%16 == 0 {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "\n\t%.8x|", uint64(addr)+uint64(i))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, " %.2x", b)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
addr += int64(len(sym.P))
|
|
|
|
|
for ; addr < sym.Value+sym.Size; addr++ {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, " %.2x", 0)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "\n")
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-04-20 10:36:49 -04:00
|
|
|
if Linkmode != LinkExternal {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
for _, r := range sym.R {
|
|
|
|
|
rsname := ""
|
|
|
|
|
if r.Sym != nil {
|
|
|
|
|
rsname = r.Sym.Name
|
|
|
|
|
}
|
|
|
|
|
typ := "?"
|
|
|
|
|
switch r.Type {
|
|
|
|
|
case obj.R_ADDR:
|
|
|
|
|
typ = "addr"
|
|
|
|
|
case obj.R_PCREL:
|
|
|
|
|
typ = "pcrel"
|
|
|
|
|
case obj.R_CALL:
|
|
|
|
|
typ = "call"
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "\treloc %.8x/%d %s %s+%#x [%#x]\n", uint(sym.Value+int64(r.Off)), r.Siz, typ, rsname, r.Add, r.Sym.Value+r.Add)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if addr < eaddr {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "\t%.8x| 00 ...\n", uint(addr))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "\t%.8x|\n", uint(eaddr))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func Dwarfblk(ctxt *Link, addr int64, size int64) {
|
2016-03-14 09:23:04 -07:00
|
|
|
if Debug['a'] != 0 {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "dwarfblk [%#x,%#x) at offset %#x\n", addr, addr+size, Cpos())
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
|
|
|
|
|
2016-04-22 10:31:14 +12:00
|
|
|
blk(ctxt, dwarfp, addr, size, zeros[:])
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
|
|
|
|
|
2016-02-29 11:49:49 +02:00
|
|
|
// zeros is a block of zero bytes; slices of it are passed as the padding
// source wherever output must be filled with NUL bytes.
var zeros [512]byte
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-02-29 11:49:49 +02:00
|
|
|
// strnput writes the first n bytes of s.
|
2016-06-11 17:12:28 -07:00
|
|
|
// If n is larger than len(s),
|
2016-02-29 11:49:49 +02:00
|
|
|
// it is padded with NUL bytes.
|
|
|
|
|
func strnput(s string, n int) {
|
2016-06-11 17:12:28 -07:00
|
|
|
strnputPad(s, n, zeros[:])
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// strnput writes the first n bytes of s.
|
|
|
|
|
// If n is larger than len(s),
|
|
|
|
|
// it is padded with the bytes in pad (repeated as needed).
|
|
|
|
|
func strnputPad(s string, n int, pad []byte) {
|
2016-02-29 11:49:49 +02:00
|
|
|
if len(s) >= n {
|
|
|
|
|
Cwritestring(s[:n])
|
|
|
|
|
} else {
|
|
|
|
|
Cwritestring(s)
|
|
|
|
|
n -= len(s)
|
2016-06-11 17:12:28 -07:00
|
|
|
for n > len(pad) {
|
|
|
|
|
Cwrite(pad)
|
|
|
|
|
n -= len(pad)
|
|
|
|
|
|
2016-02-29 11:49:49 +02:00
|
|
|
}
|
2016-06-11 17:12:28 -07:00
|
|
|
Cwrite(pad[:n])
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
// strdata collects the symbols that addstrdata has overridden (the -X flag
// targets) so that checkstrdata can validate them later.
var strdata []*Symbol
|
2015-06-29 13:03:11 -04:00
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func addstrdata1(ctxt *Link, arg string) {
|
2015-05-21 14:35:02 -04:00
|
|
|
i := strings.Index(arg, "=")
|
|
|
|
|
if i < 0 {
|
|
|
|
|
Exitf("-X flag requires argument of the form importpath.name=value")
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
addstrdata(ctxt, arg[:i], arg[i+1:])
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func addstrdata(ctxt *Link, name string, value string) {
|
2015-03-02 12:35:15 -05:00
|
|
|
p := fmt.Sprintf("%s.str", name)
|
2016-08-19 22:40:38 -04:00
|
|
|
sp := Linklookup(ctxt, p, 0)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
Addstring(ctxt, sp, value)
|
2015-04-19 19:33:58 -07:00
|
|
|
sp.Type = obj.SRODATA
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
s := Linklookup(ctxt, name, 0)
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Size = 0
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrDuplicateOK
|
|
|
|
|
reachable := s.Attr.Reachable()
|
2016-08-19 22:40:38 -04:00
|
|
|
Addaddr(ctxt, s, sp)
|
|
|
|
|
adduintxx(ctxt, s, uint64(len(value)), SysArch.PtrSize)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
// addstring, addaddr, etc., mark the symbols as reachable.
|
|
|
|
|
// In this case that is not necessarily true, so stick to what
|
|
|
|
|
// we know before entering this function.
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr.Set(AttrReachable, reachable)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2015-06-29 13:03:11 -04:00
|
|
|
strdata = append(strdata, s)
|
|
|
|
|
|
2016-03-02 07:59:49 -05:00
|
|
|
sp.Attr.Set(AttrReachable, reachable)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func (ctxt *Link) checkstrdata() {
|
2015-06-29 13:03:11 -04:00
|
|
|
for _, s := range strdata {
|
|
|
|
|
if s.Type == obj.STEXT {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("cannot use -X with text symbol %s", s.Name)
|
2015-06-29 13:03:11 -04:00
|
|
|
} else if s.Gotype != nil && s.Gotype.Name != "type.string" {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("cannot use -X with non-string symbol %s", s.Name)
|
2015-06-29 13:03:11 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func Addstring(ctxt *Link, s *Symbol, str string) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SNOPTRDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2016-04-07 18:00:57 +03:00
|
|
|
r := s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Name == ".shstrtab" {
|
2016-08-19 22:40:38 -04:00
|
|
|
elfsetstring(ctxt, str, int(r))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-07 18:00:57 +03:00
|
|
|
s.P = append(s.P, str...)
|
|
|
|
|
s.P = append(s.P, 0)
|
|
|
|
|
s.Size = int64(len(s.P))
|
|
|
|
|
return r
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2015-04-11 12:05:21 +08:00
|
|
|
// addgostring adds str, as a Go string value, to s. symname is the name of the
|
|
|
|
|
// symbol used to define the string data and must be unique per linked object.
|
2016-08-19 22:40:38 -04:00
|
|
|
func addgostring(ctxt *Link, s *Symbol, symname, str string) {
|
|
|
|
|
sym := Linklookup(ctxt, symname, 0)
|
2015-04-11 12:05:21 +08:00
|
|
|
if sym.Type != obj.Sxxx {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("duplicate symname in addgostring: %s", symname)
|
2015-04-11 12:05:21 +08:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
sym.Attr |= AttrReachable
|
|
|
|
|
sym.Attr |= AttrLocal
|
2015-04-11 12:05:21 +08:00
|
|
|
sym.Type = obj.SRODATA
|
|
|
|
|
sym.Size = int64(len(str))
|
|
|
|
|
sym.P = []byte(str)
|
2016-08-19 22:40:38 -04:00
|
|
|
Addaddr(ctxt, s, sym)
|
|
|
|
|
adduint(ctxt, s, uint64(len(str)))
|
2015-04-11 12:05:21 +08:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func addinitarrdata(ctxt *Link, s *Symbol) {
|
2015-03-17 09:47:01 -07:00
|
|
|
p := s.Name + ".ptr"
|
2016-08-19 22:40:38 -04:00
|
|
|
sp := Linklookup(ctxt, p, 0)
|
2015-04-19 19:33:58 -07:00
|
|
|
sp.Type = obj.SINITARR
|
2015-03-17 09:47:01 -07:00
|
|
|
sp.Size = 0
|
2016-03-02 07:59:49 -05:00
|
|
|
sp.Attr |= AttrDuplicateOK
|
2016-08-19 22:40:38 -04:00
|
|
|
Addaddr(ctxt, sp, s)
|
2015-03-17 09:47:01 -07:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func dosymtype(ctxt *Link) {
|
|
|
|
|
for _, s := range ctxt.Allsym {
|
2015-02-27 22:57:28 -05:00
|
|
|
if len(s.P) > 0 {
|
2015-04-19 19:33:58 -07:00
|
|
|
if s.Type == obj.SBSS {
|
|
|
|
|
s.Type = obj.SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2015-04-19 19:33:58 -07:00
|
|
|
if s.Type == obj.SNOPTRBSS {
|
|
|
|
|
s.Type = obj.SNOPTRDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
2015-03-17 09:47:01 -07:00
|
|
|
// Create a new entry in the .init_array section that points to the
|
|
|
|
|
// library initializer function.
|
2015-04-09 10:44:05 -04:00
|
|
|
switch Buildmode {
|
|
|
|
|
case BuildmodeCArchive, BuildmodeCShared:
|
|
|
|
|
if s.Name == INITENTRY {
|
2016-08-19 22:40:38 -04:00
|
|
|
addinitarrdata(ctxt, s)
|
2015-04-09 10:44:05 -04:00
|
|
|
}
|
2015-03-17 09:47:01 -07:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-02 21:24:04 -05:00
|
|
|
// symalign returns the required alignment for the given symbol s.
|
2016-08-19 11:35:54 -04:00
|
|
|
func symalign(s *Symbol) int32 {
|
2016-03-02 21:24:04 -05:00
|
|
|
min := int32(Thearch.Minalign)
|
|
|
|
|
if s.Align >= min {
|
2015-02-27 22:57:28 -05:00
|
|
|
return s.Align
|
2016-03-02 21:24:04 -05:00
|
|
|
} else if s.Align != 0 {
|
|
|
|
|
return min
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-07 20:07:09 -04:00
|
|
|
if (strings.HasPrefix(s.Name, "go.string.") && !strings.HasPrefix(s.Name, "go.string.hdr.")) || strings.HasPrefix(s.Name, "type..namedata.") {
|
2016-03-07 22:48:07 -05:00
|
|
|
// String data is just bytes.
|
|
|
|
|
// If we align it, we waste a lot of space to padding.
|
2016-03-18 16:57:54 -04:00
|
|
|
return min
|
2016-03-07 22:48:07 -05:00
|
|
|
}
|
2015-03-02 12:35:15 -05:00
|
|
|
align := int32(Thearch.Maxalign)
|
2016-03-02 21:24:04 -05:00
|
|
|
for int64(align) > s.Size && align > min {
|
2015-02-27 22:57:28 -05:00
|
|
|
align >>= 1
|
|
|
|
|
}
|
|
|
|
|
return align
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func aligndatsize(datsize int64, s *Symbol) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return Rnd(datsize, int64(symalign(s)))
|
|
|
|
|
}
|
|
|
|
|
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
// debugGCProg gates the debug tracing in GCProg.Init: when true, Init prints
// a "start GCProg" line to os.Stderr and hands os.Stderr to the writer's
// Debug method. It is false here, so that tracing is skipped.
const debugGCProg = false
|
2015-02-27 22:57:28 -05:00
|
|
|
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
// GCProg holds the state used while emitting a GC program: the link context,
// the symbol that receives the bytes (see writeByte), and the gcprog.Writer
// that produces them.
type GCProg struct {
|
2016-08-19 22:40:38 -04:00
|
|
|
// ctxt is the link context recorded by Init.
ctxt *Link
|
|
|
|
|
// sym is the symbol Init looks up by name; writeByte passes it to Adduint8.
sym *Symbol
|
|
|
|
|
// w is the GC program writer; Init wires it to the writeByte callback.
w gcprog.Writer
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
// Init prepares p for emitting a GC program: it records ctxt, resolves the
// symbol called name via Linklookup, and initializes the gcprog.Writer with
// the byte callback returned by p.writeByte.
// When debugGCProg is true it also prints a start message to os.Stderr and
// passes os.Stderr to the writer's Debug method.
// NOTE(review): the 0 passed to Linklookup is presumably the symbol version — confirm.
func (p *GCProg) Init(ctxt *Link, name string) {
|
|
|
|
|
p.ctxt = ctxt
|
|
|
|
|
p.sym = Linklookup(ctxt, name, 0)
|
|
|
|
|
p.w.Init(p.writeByte(ctxt))
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
// Debug-only tracing; skipped while debugGCProg is false.
if debugGCProg {
|
|
|
|
|
fmt.Fprintf(os.Stderr, "ld: start GCProg %s\n", name)
|
|
|
|
|
p.w.Debug(os.Stderr)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
// writeByte returns a callback that hands each byte x to Adduint8 together
// with ctxt and p.sym. Init passes this callback to p.w.Init.
// NOTE(review): Adduint8 presumably appends x to the symbol's data — confirm.
func (p *GCProg) writeByte(ctxt *Link) func(x byte) {
|
|
|
|
|
// The closure captures ctxt and p, so p.sym is read at call time.
return func(x byte) {
|
|
|
|
|
Adduint8(ctxt, p.sym, x)
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
func (p *GCProg) End(size int64) {
|
2016-04-06 12:01:40 -07:00
|
|
|
p.w.ZeroUntil(size / int64(SysArch.PtrSize))
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
p.w.End()
|
|
|
|
|
if debugGCProg {
|
|
|
|
|
fmt.Fprintf(os.Stderr, "ld: end GCProg\n")
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func (p *GCProg) AddSym(s *Symbol) {
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called "GC programs", but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
typ := s.Gotype
|
|
|
|
|
// Things without pointers should be in SNOPTRDATA or SNOPTRBSS;
|
|
|
|
|
// everything we see should have pointers and should therefore have a type.
|
|
|
|
|
if typ == nil {
|
2016-08-19 22:40:38 -04:00
|
|
|
p.ctxt.Diag("missing Go type information for global symbol: %s size %d", s.Name, int(s.Size))
|
2015-02-27 22:57:28 -05:00
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-06 12:01:40 -07:00
|
|
|
ptrsize := int64(SysArch.PtrSize)
|
2016-08-19 22:40:38 -04:00
|
|
|
nptr := decodetype_ptrdata(p.ctxt.Arch, typ) / ptrsize
|
2015-02-27 22:57:28 -05:00
|
|
|
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called "GC programs", but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
if debugGCProg {
|
|
|
|
|
fmt.Fprintf(os.Stderr, "gcprog sym: %s at %d (ptr=%d+%d)\n", s.Name, s.Value, s.Value/ptrsize, nptr)
|
2015-04-28 00:28:47 -04:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called "GC programs", but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
if decodetype_usegcprog(typ) == 0 {
|
|
|
|
|
// Copy pointers from mask into program.
|
2016-08-19 22:40:38 -04:00
|
|
|
mask := decodetype_gcmask(p.ctxt, typ)
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called "GC programs", but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
for i := int64(0); i < nptr; i++ {
|
|
|
|
|
if (mask[i/8]>>uint(i%8))&1 != 0 {
|
|
|
|
|
p.w.Ptr(s.Value/ptrsize + i)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called "GC programs", but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
return
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called "GC programs", but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
|
|
|
|
|
// Copy program.
|
2016-08-19 22:40:38 -04:00
|
|
|
prog := decodetype_gcprog(p.ctxt, typ)
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called "GC programs", but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
p.w.ZeroUntil(s.Value / ptrsize)
|
2015-05-25 16:13:50 +12:00
|
|
|
p.w.Append(prog[4:], nptr)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-22 10:27:20 +12:00
|
|
|
// dataSortKey is used to sort a slice of data symbol *Symbol pointers.
|
2016-04-18 14:50:14 -04:00
|
|
|
// The sort keys are kept inline to improve cache behaviour while sorting.
|
2016-03-09 16:23:25 +02:00
|
|
|
type dataSortKey struct {
|
2016-04-18 14:50:14 -04:00
|
|
|
size int64
|
|
|
|
|
name string
|
2016-08-22 10:27:20 +12:00
|
|
|
sym *Symbol
|
2016-03-09 16:23:25 +02:00
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
type bySizeAndName []dataSortKey
|
2016-03-09 16:23:25 +02:00
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
func (d bySizeAndName) Len() int { return len(d) }
|
|
|
|
|
func (d bySizeAndName) Swap(i, j int) { d[i], d[j] = d[j], d[i] }
|
|
|
|
|
func (d bySizeAndName) Less(i, j int) bool {
|
|
|
|
|
s1, s2 := d[i], d[j]
|
|
|
|
|
if s1.size != s2.size {
|
|
|
|
|
return s1.size < s2.size
|
2016-04-04 13:07:24 -04:00
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
return s1.name < s2.name
|
2016-03-09 16:23:25 +02:00
|
|
|
}
|
|
|
|
|
|
2016-04-19 08:59:56 -04:00
|
|
|
// cutoff is the data size limit enforced by checkdatsize: a running data
// size greater than cutoff is reported as a link diagnostic.
const cutoff int64 = 2e9 // 2 GB (or so; looks better in errors than 2^31)
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func checkdatsize(ctxt *Link, datsize int64, symn int) {
|
2016-04-19 08:59:56 -04:00
|
|
|
if datsize > cutoff {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("too much data in section %v (over %d bytes)", symn, cutoff)
|
2016-04-19 08:59:56 -04:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// datap is a collection of reachable data symbols in address order.
|
|
|
|
|
// Generated by dodata.
|
2016-08-19 11:35:54 -04:00
|
|
|
var datap []*Symbol
|
2016-03-09 16:23:25 +02:00
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func (ctxt *Link) dodata() {
|
2015-02-27 22:57:28 -05:00
|
|
|
if Debug['v'] != 0 {
|
2016-08-21 13:52:23 -04:00
|
|
|
fmt.Fprintf(ctxt.Bso, "%5.2f dodata\n", obj.Cputime())
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-21 13:52:23 -04:00
|
|
|
ctxt.Bso.Flush()
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// Collect data symbols by type into data.
|
2016-08-19 11:35:54 -04:00
|
|
|
var data [obj.SXREF][]*Symbol
|
2016-08-19 22:40:38 -04:00
|
|
|
for _, s := range ctxt.Allsym {
|
2016-03-02 07:59:49 -05:00
|
|
|
if !s.Attr.Reachable() || s.Attr.Special() {
|
2015-02-27 22:57:28 -05:00
|
|
|
continue
|
|
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
if s.Type <= obj.STEXT || s.Type >= obj.SXREF {
|
|
|
|
|
continue
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
data[s.Type] = append(data[s.Type], s)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// Now that we have the data symbols, but before we start
|
|
|
|
|
// to assign addresses, record all the necessary
|
|
|
|
|
// dynamic relocations. These will grow the relocation
|
|
|
|
|
// symbol, which is itself data.
|
|
|
|
|
//
|
|
|
|
|
// On darwin, we need the symbol table numbers for dynreloc.
|
2015-04-19 19:33:58 -07:00
|
|
|
if HEADTYPE == obj.Hdarwin {
|
2016-08-19 22:40:38 -04:00
|
|
|
machosymorder(ctxt)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
dynreloc(ctxt, &data)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2015-05-21 13:07:19 +12:00
|
|
|
if UseRelro() {
|
|
|
|
|
// "read only" data with relocations needs to go in its own section
|
|
|
|
|
// when building a shared library. We do this by boosting objects of
|
|
|
|
|
// type SXXX with relocations to type SXXXRELRO.
|
2016-04-18 14:50:14 -04:00
|
|
|
for symnro := int16(obj.STYPE); symnro < obj.STYPERELRO; symnro++ {
|
|
|
|
|
symnrelro := symnro + obj.STYPERELRO - obj.STYPE
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
ro := []*Symbol{}
|
2016-04-18 14:50:14 -04:00
|
|
|
relro := data[symnrelro]
|
|
|
|
|
|
|
|
|
|
for _, s := range data[symnro] {
|
|
|
|
|
isRelro := len(s.R) > 0
|
|
|
|
|
switch s.Type {
|
|
|
|
|
case obj.STYPE, obj.SGOSTRINGHDR, obj.STYPERELRO, obj.SGOSTRINGHDRRELRO:
|
|
|
|
|
// Symbols are not sorted yet, so it is possible
|
|
|
|
|
// that an Outer symbol has been changed to a
|
|
|
|
|
// relro Type before it reaches here.
|
|
|
|
|
isRelro = true
|
|
|
|
|
}
|
|
|
|
|
if isRelro {
|
|
|
|
|
s.Type = symnrelro
|
|
|
|
|
if s.Outer != nil {
|
|
|
|
|
s.Outer.Type = s.Type
|
|
|
|
|
}
|
|
|
|
|
relro = append(relro, s)
|
|
|
|
|
} else {
|
|
|
|
|
ro = append(ro, s)
|
2015-05-21 13:07:19 +12:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// Check that we haven't made two symbols with the same .Outer into
|
|
|
|
|
// different types (because references to symbols with non-nil Outer
|
|
|
|
|
// become references to the outer symbol + offset it's vital that the
|
|
|
|
|
// symbol and the outer end up in the same section).
|
|
|
|
|
for _, s := range relro {
|
|
|
|
|
if s.Outer != nil && s.Outer.Type != s.Type {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("inconsistent types for %s and its Outer %s (%d != %d)",
|
2016-04-18 14:50:14 -04:00
|
|
|
s.Name, s.Outer.Name, s.Type, s.Outer.Type)
|
2015-03-30 15:45:33 +02:00
|
|
|
}
|
|
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
|
|
|
|
|
data[symnro] = ro
|
|
|
|
|
data[symnrelro] = relro
|
2015-03-30 15:45:33 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// Sort symbols.
|
2016-04-19 08:59:56 -04:00
|
|
|
var dataMaxAlign [obj.SXREF]int32
|
2016-04-18 14:50:14 -04:00
|
|
|
var wg sync.WaitGroup
|
|
|
|
|
for symn := range data {
|
|
|
|
|
symn := symn
|
|
|
|
|
wg.Add(1)
|
|
|
|
|
go func() {
|
2016-08-19 22:40:38 -04:00
|
|
|
data[symn], dataMaxAlign[symn] = dodataSect(ctxt, symn, data[symn])
|
2016-04-18 14:50:14 -04:00
|
|
|
wg.Done()
|
|
|
|
|
}()
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
wg.Wait()
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// Allocate sections.
|
|
|
|
|
// Data is processed before segtext, because we need
|
|
|
|
|
// to see all symbols in the .data and .bss sections in order
|
|
|
|
|
// to generate garbage collection information.
|
2015-03-02 12:35:15 -05:00
|
|
|
datsize := int64(0)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// Writable sections.
|
|
|
|
|
writableSects := []int{
|
|
|
|
|
obj.SELFSECT,
|
|
|
|
|
obj.SMACHO,
|
|
|
|
|
obj.SMACHOGOT,
|
|
|
|
|
obj.SWINDOWS,
|
|
|
|
|
}
|
|
|
|
|
for _, symn := range writableSects {
|
|
|
|
|
for _, s := range data[symn] {
|
|
|
|
|
sect := addsection(&Segdata, s.Name, 06)
|
|
|
|
|
sect.Align = symalign(s)
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
|
|
|
|
s.Sect = sect
|
|
|
|
|
s.Type = obj.SDATA
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2016-04-18 14:50:14 -04:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, symn)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// .got (and .toc on ppc64)
|
|
|
|
|
if len(data[obj.SELFGOT]) > 0 {
|
2015-03-02 12:35:15 -05:00
|
|
|
sect := addsection(&Segdata, ".got", 06)
|
2016-04-19 08:59:56 -04:00
|
|
|
sect.Align = dataMaxAlign[obj.SELFGOT]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2016-08-19 11:35:54 -04:00
|
|
|
var toc *Symbol
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[obj.SELFGOT] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
|
|
|
|
|
|
|
|
|
// Resolve .TOC. symbol for this object file (ppc64)
|
2016-08-19 22:40:38 -04:00
|
|
|
toc = Linkrlookup(ctxt, ".TOC.", int(s.Version))
|
2015-02-27 22:57:28 -05:00
|
|
|
if toc != nil {
|
|
|
|
|
toc.Sect = sect
|
|
|
|
|
toc.Outer = s
|
|
|
|
|
toc.Sub = s.Sub
|
|
|
|
|
s.Sub = toc
|
|
|
|
|
|
|
|
|
|
toc.Value = 0x8000
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.SELFGOT)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* pointer-free data */
|
2016-04-18 14:50:14 -04:00
|
|
|
sect := addsection(&Segdata, ".noptrdata", 06)
|
2016-04-19 08:59:56 -04:00
|
|
|
sect.Align = dataMaxAlign[obj.SNOPTRDATA]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, "runtime.noptrdata", 0).Sect = sect
|
|
|
|
|
Linklookup(ctxt, "runtime.enoptrdata", 0).Sect = sect
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[obj.SNOPTRDATA] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.SNOPTRDATA)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
|
2015-04-01 14:17:43 +13:00
|
|
|
hasinitarr := Linkshared
|
|
|
|
|
|
2015-02-27 22:57:28 -05:00
|
|
|
/* shared library initializer */
|
2015-04-09 10:44:05 -04:00
|
|
|
switch Buildmode {
|
|
|
|
|
case BuildmodeCArchive, BuildmodeCShared, BuildmodeShared:
|
2015-04-01 14:17:43 +13:00
|
|
|
hasinitarr = true
|
|
|
|
|
}
|
|
|
|
|
if hasinitarr {
|
2015-03-02 12:35:15 -05:00
|
|
|
sect := addsection(&Segdata, ".init_array", 06)
|
2016-04-19 08:59:56 -04:00
|
|
|
sect.Align = dataMaxAlign[obj.SINITARR]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[obj.SINITARR] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.SINITARR)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* data */
|
|
|
|
|
sect = addsection(&Segdata, ".data", 06)
|
2016-04-19 08:59:56 -04:00
|
|
|
sect.Align = dataMaxAlign[obj.SDATA]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, "runtime.data", 0).Sect = sect
|
|
|
|
|
Linklookup(ctxt, "runtime.edata", 0).Sect = sect
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
var gc GCProg
|
2016-08-19 22:40:38 -04:00
|
|
|
gc.Init(ctxt, "runtime.gcdata")
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[obj.SDATA] {
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Sect = sect
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called "GC programs", but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
gc.AddSym(s)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.SDATA)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called "GC programs", but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
gc.End(int64(sect.Length))
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
/* bss */
|
|
|
|
|
sect = addsection(&Segdata, ".bss", 06)
|
2016-04-19 08:59:56 -04:00
|
|
|
sect.Align = dataMaxAlign[obj.SBSS]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, "runtime.bss", 0).Sect = sect
|
|
|
|
|
Linklookup(ctxt, "runtime.ebss", 0).Sect = sect
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called "GC programs", but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
gc = GCProg{}
|
2016-08-19 22:40:38 -04:00
|
|
|
gc.Init(ctxt, "runtime.gcbss")
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[obj.SBSS] {
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Sect = sect
|
|
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called "GC programs", but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
gc.AddSym(s)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.SBSS)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called "GC programs", but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
gc.End(int64(sect.Length))
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
/* pointer-free bss */
|
|
|
|
|
sect = addsection(&Segdata, ".noptrbss", 06)
|
2016-04-19 08:59:56 -04:00
|
|
|
sect.Align = dataMaxAlign[obj.SNOPTRBSS]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, "runtime.noptrbss", 0).Sect = sect
|
|
|
|
|
Linklookup(ctxt, "runtime.enoptrbss", 0).Sect = sect
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[obj.SNOPTRBSS] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, "runtime.end", 0).Sect = sect
|
|
|
|
|
checkdatsize(ctxt, datsize, obj.SNOPTRBSS)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
if len(data[obj.STLSBSS]) > 0 {
|
|
|
|
|
var sect *Section
|
2015-08-11 12:29:00 +12:00
|
|
|
if Iself && (Linkmode == LinkExternal || Debug['d'] == 0) && HEADTYPE != obj.Hopenbsd {
|
|
|
|
|
sect = addsection(&Segdata, ".tbss", 06)
|
2016-04-06 12:01:40 -07:00
|
|
|
sect.Align = int32(SysArch.PtrSize)
|
2015-08-11 12:29:00 +12:00
|
|
|
sect.Vaddr = 0
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = 0
|
2015-08-11 12:29:00 +12:00
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[obj.STLSBSS] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
2015-08-11 12:29:00 +12:00
|
|
|
s.Value = datsize
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.STLSBSS)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2015-08-11 12:29:00 +12:00
|
|
|
if sect != nil {
|
|
|
|
|
sect.Length = uint64(datsize)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We finished data, begin read-only data.
|
|
|
|
|
* Not all systems support a separate read-only non-executable data section.
|
|
|
|
|
* ELF systems do.
|
|
|
|
|
* OS X and Plan 9 do not.
|
|
|
|
|
* Windows PE may, but if so we have not implemented it.
|
|
|
|
|
* And if we're using external linking mode, the point is moot,
|
|
|
|
|
* since it's not our decision; that code expects the sections in
|
|
|
|
|
* segtext.
|
|
|
|
|
*/
|
2015-03-02 12:35:15 -05:00
|
|
|
var segro *Segment
|
2015-02-27 22:57:28 -05:00
|
|
|
if Iself && Linkmode == LinkInternal {
|
|
|
|
|
segro = &Segrodata
|
|
|
|
|
} else {
|
|
|
|
|
segro = &Segtext
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
datsize = 0
|
|
|
|
|
|
|
|
|
|
/* read-only executable ELF, Mach-O sections */
|
2016-04-18 14:50:14 -04:00
|
|
|
if len(data[obj.STEXT]) != 0 {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("dodata found an STEXT symbol: %s", data[obj.STEXT][0].Name)
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
for _, s := range data[obj.SELFRXSECT] {
|
|
|
|
|
sect := addsection(&Segtext, s.Name, 04)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Align = symalign(s)
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
|
|
|
|
s.Sect = sect
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SRODATA
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.SELFRXSECT)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* read-only data */
|
|
|
|
|
sect = addsection(segro, ".rodata", 04)
|
|
|
|
|
|
|
|
|
|
sect.Vaddr = 0
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, "runtime.rodata", 0).Sect = sect
|
|
|
|
|
Linklookup(ctxt, "runtime.erodata", 0).Sect = sect
|
2016-03-27 10:21:48 -04:00
|
|
|
if !UseRelro() {
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, "runtime.types", 0).Sect = sect
|
|
|
|
|
Linklookup(ctxt, "runtime.etypes", 0).Sect = sect
|
2016-03-27 10:21:48 -04:00
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
roSects := []int{
|
|
|
|
|
obj.STYPE,
|
|
|
|
|
obj.SSTRING,
|
|
|
|
|
obj.SGOSTRING,
|
|
|
|
|
obj.SGOSTRINGHDR,
|
|
|
|
|
obj.SGOFUNC,
|
|
|
|
|
obj.SGCBITS,
|
|
|
|
|
obj.SRODATA,
|
|
|
|
|
obj.SFUNCTAB,
|
|
|
|
|
}
|
|
|
|
|
for _, symn := range roSects {
|
2016-04-19 08:59:56 -04:00
|
|
|
align := dataMaxAlign[symn]
|
2016-04-18 14:50:14 -04:00
|
|
|
if sect.Align < align {
|
|
|
|
|
sect.Align = align
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
for _, symn := range roSects {
|
|
|
|
|
for _, s := range data[symn] {
|
|
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
|
|
|
|
s.Type = obj.SRODATA
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, symn)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
|
2015-05-21 13:07:19 +12:00
|
|
|
// There is some data that are conceptually read-only but are written to by
|
|
|
|
|
// relocations. On GNU systems, we can arrange for the dynamic linker to
|
|
|
|
|
// mprotect sections after relocations are applied by giving them write
|
|
|
|
|
// permissions in the object file and calling them ".data.rel.ro.FOO". We
|
|
|
|
|
// divide the .rodata section between actual .rodata and .data.rel.ro.rodata,
|
|
|
|
|
// but for the other sections that this applies to, we just write a read-only
|
|
|
|
|
// .FOO section or a read-write .data.rel.ro.FOO section depending on the
|
|
|
|
|
// situation.
|
|
|
|
|
// TODO(mwhudson): It would make sense to do this more widely, but it makes
|
|
|
|
|
// the system linker segfault on darwin.
|
|
|
|
|
relro_perms := 04
|
|
|
|
|
relro_prefix := ""
|
|
|
|
|
|
|
|
|
|
if UseRelro() {
|
|
|
|
|
relro_perms = 06
|
|
|
|
|
relro_prefix = ".data.rel.ro"
|
|
|
|
|
/* data only written by relocations */
|
|
|
|
|
sect = addsection(segro, ".data.rel.ro", 06)
|
|
|
|
|
|
|
|
|
|
sect.Vaddr = 0
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, "runtime.types", 0).Sect = sect
|
|
|
|
|
Linklookup(ctxt, "runtime.etypes", 0).Sect = sect
|
2016-04-18 14:50:14 -04:00
|
|
|
relroSects := []int{
|
|
|
|
|
obj.STYPERELRO,
|
|
|
|
|
obj.SSTRINGRELRO,
|
|
|
|
|
obj.SGOSTRINGRELRO,
|
|
|
|
|
obj.SGOSTRINGHDRRELRO,
|
|
|
|
|
obj.SGOFUNCRELRO,
|
|
|
|
|
obj.SGCBITSRELRO,
|
|
|
|
|
obj.SRODATARELRO,
|
|
|
|
|
obj.SFUNCTABRELRO,
|
|
|
|
|
}
|
|
|
|
|
for _, symn := range relroSects {
|
2016-04-19 08:59:56 -04:00
|
|
|
align := dataMaxAlign[symn]
|
2016-04-18 14:50:14 -04:00
|
|
|
if sect.Align < align {
|
|
|
|
|
sect.Align = align
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
for _, symn := range relroSects {
|
|
|
|
|
for _, s := range data[symn] {
|
|
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
if s.Outer != nil && s.Outer.Sect != nil && s.Outer.Sect != sect {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("s.Outer (%s) in different section from s (%s)", s.Outer.Name, s.Name)
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
s.Sect = sect
|
|
|
|
|
s.Type = obj.SRODATA
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-05-21 13:07:19 +12:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, symn)
|
2015-05-21 13:07:19 +12:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
2015-02-27 22:57:28 -05:00
|
|
|
/* typelink */
|
2015-05-21 13:07:19 +12:00
|
|
|
sect = addsection(segro, relro_prefix+".typelink", relro_perms)
|
2016-04-19 08:59:56 -04:00
|
|
|
sect.Align = dataMaxAlign[obj.STYPELINK]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, "runtime.typelink", 0).Sect = sect
|
|
|
|
|
Linklookup(ctxt, "runtime.etypelink", 0).Sect = sect
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[obj.STYPELINK] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SRODATA
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.STYPELINK)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
|
2016-03-17 07:00:33 -07:00
|
|
|
/* itablink */
|
|
|
|
|
sect = addsection(segro, relro_prefix+".itablink", relro_perms)
|
2016-04-19 08:59:56 -04:00
|
|
|
sect.Align = dataMaxAlign[obj.SITABLINK]
|
2016-03-17 07:00:33 -07:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, "runtime.itablink", 0).Sect = sect
|
|
|
|
|
Linklookup(ctxt, "runtime.eitablink", 0).Sect = sect
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[obj.SITABLINK] {
|
2016-03-17 07:00:33 -07:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
|
|
|
|
s.Type = obj.SRODATA
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2016-03-17 07:00:33 -07:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.SITABLINK)
|
2016-03-17 07:00:33 -07:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
|
2015-02-27 22:57:28 -05:00
|
|
|
/* gosymtab */
|
2015-05-21 13:07:19 +12:00
|
|
|
sect = addsection(segro, relro_prefix+".gosymtab", relro_perms)
|
2016-04-19 08:59:56 -04:00
|
|
|
sect.Align = dataMaxAlign[obj.SSYMTAB]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, "runtime.symtab", 0).Sect = sect
|
|
|
|
|
Linklookup(ctxt, "runtime.esymtab", 0).Sect = sect
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[obj.SSYMTAB] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SRODATA
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.SSYMTAB)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
|
|
|
|
|
/* gopclntab */
|
2015-05-21 13:07:19 +12:00
|
|
|
sect = addsection(segro, relro_prefix+".gopclntab", relro_perms)
|
2016-04-19 08:59:56 -04:00
|
|
|
sect.Align = dataMaxAlign[obj.SPCLNTAB]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, "runtime.pclntab", 0).Sect = sect
|
|
|
|
|
Linklookup(ctxt, "runtime.epclntab", 0).Sect = sect
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[obj.SPCLNTAB] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SRODATA
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.SRODATA)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
|
|
|
|
|
/* read-only ELF, Mach-O sections */
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[obj.SELFROSECT] {
|
|
|
|
|
sect = addsection(segro, s.Name, 04)
|
|
|
|
|
sect.Align = symalign(s)
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
|
|
|
|
s.Sect = sect
|
|
|
|
|
s.Type = obj.SRODATA
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2016-04-18 14:50:14 -04:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.SELFROSECT)
|
2016-04-18 14:50:14 -04:00
|
|
|
|
|
|
|
|
for _, s := range data[obj.SMACHOPLT] {
|
2015-02-27 22:57:28 -05:00
|
|
|
sect = addsection(segro, s.Name, 04)
|
|
|
|
|
sect.Align = symalign(s)
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
|
|
|
|
s.Sect = sect
|
2015-04-19 19:33:58 -07:00
|
|
|
s.Type = obj.SRODATA
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.SMACHOPLT)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
// 6g uses 4-byte relocation offsets, so the entire segment must fit in 32 bits.
|
|
|
|
|
if datsize != int64(uint32(datsize)) {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("read-only data segment too large")
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
for symn := obj.SELFRXSECT; symn < obj.SXREF; symn++ {
|
|
|
|
|
datap = append(datap, data[symn]...)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
dwarfgeneratedebugsyms(ctxt)
|
2016-03-14 09:23:04 -07:00
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
var s *Symbol
|
2016-04-22 10:31:14 +12:00
|
|
|
var i int
|
|
|
|
|
for i, s = range dwarfp {
|
|
|
|
|
if s.Type != obj.SDWARFSECT {
|
|
|
|
|
break
|
|
|
|
|
}
|
2016-03-14 09:23:04 -07:00
|
|
|
sect = addsection(&Segdwarf, s.Name, 04)
|
|
|
|
|
sect.Align = 1
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
|
|
|
|
s.Sect = sect
|
|
|
|
|
s.Type = obj.SRODATA
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2016-03-14 09:23:04 -07:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.SDWARFSECT)
|
2016-03-14 09:23:04 -07:00
|
|
|
|
2016-04-22 10:31:14 +12:00
|
|
|
if i < len(dwarfp) {
|
2016-03-14 09:23:04 -07:00
|
|
|
sect = addsection(&Segdwarf, ".debug_info", 04)
|
|
|
|
|
sect.Align = 1
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2016-04-22 10:31:14 +12:00
|
|
|
for _, s := range dwarfp[i:] {
|
|
|
|
|
if s.Type != obj.SDWARFINFO {
|
|
|
|
|
break
|
|
|
|
|
}
|
2016-03-14 09:23:04 -07:00
|
|
|
s.Sect = sect
|
|
|
|
|
s.Type = obj.SRODATA
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
|
|
|
|
s.Attr |= AttrLocal
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
|
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, obj.SDWARFINFO)
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
|
|
|
|
|
2015-02-27 22:57:28 -05:00
|
|
|
/* number the sections */
|
2015-03-02 12:35:15 -05:00
|
|
|
n := int32(1)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
for sect := Segtext.Sect; sect != nil; sect = sect.Next {
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Extnum = int16(n)
|
|
|
|
|
n++
|
|
|
|
|
}
|
2015-03-02 12:35:15 -05:00
|
|
|
for sect := Segrodata.Sect; sect != nil; sect = sect.Next {
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Extnum = int16(n)
|
|
|
|
|
n++
|
|
|
|
|
}
|
2015-03-02 12:35:15 -05:00
|
|
|
for sect := Segdata.Sect; sect != nil; sect = sect.Next {
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Extnum = int16(n)
|
|
|
|
|
n++
|
|
|
|
|
}
|
2016-03-14 09:23:04 -07:00
|
|
|
for sect := Segdwarf.Sect; sect != nil; sect = sect.Next {
|
|
|
|
|
sect.Extnum = int16(n)
|
|
|
|
|
n++
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2015-06-04 15:15:48 -04:00
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func dodataSect(ctxt *Link, symn int, syms []*Symbol) (result []*Symbol, maxAlign int32) {
|
2016-04-18 14:50:14 -04:00
|
|
|
if HEADTYPE == obj.Hdarwin {
|
|
|
|
|
// Some symbols may no longer belong in syms
|
|
|
|
|
// due to movement in machosymorder.
|
2016-08-19 11:35:54 -04:00
|
|
|
newSyms := make([]*Symbol, 0, len(syms))
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range syms {
|
|
|
|
|
if int(s.Type) == symn {
|
|
|
|
|
newSyms = append(newSyms, s)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
syms = newSyms
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
symsSort := make([]dataSortKey, len(syms))
|
|
|
|
|
for i, s := range syms {
|
|
|
|
|
if s.Attr.OnList() {
|
|
|
|
|
log.Fatalf("symbol %s listed multiple times", s.Name)
|
|
|
|
|
}
|
|
|
|
|
s.Attr |= AttrOnList
|
2016-04-19 08:59:56 -04:00
|
|
|
switch {
|
|
|
|
|
case s.Size < int64(len(s.P)):
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("%s: initialize bounds (%d < %d)", s.Name, s.Size, len(s.P))
|
2016-04-19 08:59:56 -04:00
|
|
|
case s.Size < 0:
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("%s: negative size (%d bytes)", s.Name, s.Size)
|
2016-04-19 08:59:56 -04:00
|
|
|
case s.Size > cutoff:
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Diag("%s: symbol too large (%d bytes)", s.Name, s.Size)
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
symsSort[i] = dataSortKey{
|
|
|
|
|
size: s.Size,
|
|
|
|
|
name: s.Name,
|
2016-08-22 10:27:20 +12:00
|
|
|
sym: s,
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
switch s.Type {
|
|
|
|
|
case obj.SELFGOT:
|
|
|
|
|
// For ppc64, we want to interleave the .got and .toc sections
|
|
|
|
|
// from input files. Both are type SELFGOT, so in that case
|
|
|
|
|
// we skip size comparison and fall through to the name
|
|
|
|
|
// comparison (conveniently, .got sorts before .toc).
|
|
|
|
|
symsSort[i].size = 0
|
|
|
|
|
case obj.STYPELINK:
|
|
|
|
|
// Sort typelinks by the rtype.string field so the reflect
|
|
|
|
|
// package can binary search type links.
|
2016-04-07 16:29:16 -04:00
|
|
|
symsSort[i].name = string(decodetype_str(s.R[0].Sym))
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sort.Sort(bySizeAndName(symsSort))
|
|
|
|
|
|
|
|
|
|
for i, symSort := range symsSort {
|
2016-08-22 10:27:20 +12:00
|
|
|
syms[i] = symSort.sym
|
|
|
|
|
align := symalign(symSort.sym)
|
2016-04-19 08:59:56 -04:00
|
|
|
if maxAlign < align {
|
|
|
|
|
maxAlign = align
|
|
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if Iself && symn == obj.SELFROSECT {
|
|
|
|
|
// Make .rela and .rela.plt contiguous, the ELF ABI requires this
|
|
|
|
|
// and Solaris actually cares.
|
|
|
|
|
reli, plti := -1, -1
|
|
|
|
|
for i, s := range syms {
|
|
|
|
|
switch s.Name {
|
|
|
|
|
case ".rel.plt", ".rela.plt":
|
|
|
|
|
plti = i
|
|
|
|
|
case ".rel", ".rela":
|
|
|
|
|
reli = i
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if reli >= 0 && plti >= 0 && plti != reli+1 {
|
2016-04-20 19:10:20 -04:00
|
|
|
var first, second int
|
|
|
|
|
if plti > reli {
|
|
|
|
|
first, second = reli, plti
|
|
|
|
|
} else {
|
|
|
|
|
first, second = plti, reli
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
2016-04-20 19:10:20 -04:00
|
|
|
rel, plt := syms[reli], syms[plti]
|
|
|
|
|
copy(syms[first+2:], syms[first+1:second])
|
|
|
|
|
syms[first+0] = rel
|
|
|
|
|
syms[first+1] = plt
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-19 08:59:56 -04:00
|
|
|
return syms, maxAlign
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
|
2015-06-04 15:15:48 -04:00
|
|
|
// Add buildid to beginning of text segment, on non-ELF systems.
|
|
|
|
|
// Non-ELF binary formats are not always flexible enough to
|
|
|
|
|
// give us a place to put the Go build ID. On those systems, we put it
|
|
|
|
|
// at the very beginning of the text segment.
|
|
|
|
|
// This ``header'' is read by cmd/go.
|
2016-08-19 22:40:38 -04:00
|
|
|
func (ctxt *Link) textbuildid() {
|
2015-06-04 15:15:48 -04:00
|
|
|
if Iself || buildid == "" {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
sym := Linklookup(ctxt, "go.buildid", 0)
|
2016-03-02 07:59:49 -05:00
|
|
|
sym.Attr |= AttrReachable
|
2015-06-04 15:15:48 -04:00
|
|
|
// The \xff is invalid UTF-8, meant to make it less likely
|
|
|
|
|
// to find one of these accidentally.
|
|
|
|
|
data := "\xff Go build ID: " + strconv.Quote(buildid) + "\n \xff"
|
|
|
|
|
sym.Type = obj.STEXT
|
|
|
|
|
sym.P = []byte(data)
|
|
|
|
|
sym.Size = int64(len(sym.P))
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Textp = append(ctxt.Textp, nil)
|
|
|
|
|
copy(ctxt.Textp[1:], ctxt.Textp)
|
|
|
|
|
ctxt.Textp[0] = sym
|
2015-06-04 15:15:48 -04:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
// assign addresses to text
|
2016-08-19 22:40:38 -04:00
|
|
|
func (ctxt *Link) textaddress() {
|
2015-02-27 22:57:28 -05:00
|
|
|
addsection(&Segtext, ".text", 05)
|
|
|
|
|
|
|
|
|
|
// Assign PCs in text segment.
|
|
|
|
|
// Could parallelize, by assigning to text
|
|
|
|
|
// and then letting threads copy down, but probably not worth it.
|
2015-03-02 12:35:15 -05:00
|
|
|
sect := Segtext.Sect
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
sect.Align = int32(Funcalign)
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, "runtime.text", 0).Sect = sect
|
|
|
|
|
Linklookup(ctxt, "runtime.etext", 0).Sect = sect
|
2015-12-13 08:02:29 -05:00
|
|
|
if HEADTYPE == obj.Hwindows {
|
2016-08-19 22:40:38 -04:00
|
|
|
Linklookup(ctxt, ".text", 0).Sect = sect
|
2015-12-13 08:02:29 -05:00
|
|
|
}
|
2015-03-02 12:35:15 -05:00
|
|
|
va := uint64(INITTEXT)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Vaddr = va
|
2016-08-19 22:40:38 -04:00
|
|
|
for _, sym := range ctxt.Textp {
|
2015-02-27 22:57:28 -05:00
|
|
|
sym.Sect = sect
|
2015-04-19 19:33:58 -07:00
|
|
|
if sym.Type&obj.SSUB != 0 {
|
2015-02-27 22:57:28 -05:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if sym.Align != 0 {
|
|
|
|
|
va = uint64(Rnd(int64(va), int64(sym.Align)))
|
|
|
|
|
} else {
|
|
|
|
|
va = uint64(Rnd(int64(va), int64(Funcalign)))
|
|
|
|
|
}
|
|
|
|
|
sym.Value = 0
|
2016-04-18 14:50:14 -04:00
|
|
|
for sub := sym; sub != nil; sub = sub.Sub {
|
2015-02-27 22:57:28 -05:00
|
|
|
sub.Value += int64(va)
|
|
|
|
|
}
|
|
|
|
|
if sym.Size == 0 && sym.Sub != nil {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Cursym = sym
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
if sym.Size < MINFUNC {
|
|
|
|
|
va += MINFUNC // spacing required for findfunctab
|
|
|
|
|
} else {
|
|
|
|
|
va += uint64(sym.Size)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sect.Length = va - sect.Vaddr
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// assign addresses
|
2016-08-19 22:40:38 -04:00
|
|
|
func (ctxt *Link) address() {
|
2015-03-02 12:35:15 -05:00
|
|
|
va := uint64(INITTEXT)
|
2015-02-27 22:57:28 -05:00
|
|
|
Segtext.Rwx = 05
|
|
|
|
|
Segtext.Vaddr = va
|
|
|
|
|
Segtext.Fileoff = uint64(HEADR)
|
2015-03-02 12:35:15 -05:00
|
|
|
for s := Segtext.Sect; s != nil; s = s.Next {
|
2015-02-27 22:57:28 -05:00
|
|
|
va = uint64(Rnd(int64(va), int64(s.Align)))
|
|
|
|
|
s.Vaddr = va
|
|
|
|
|
va += s.Length
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Segtext.Length = va - uint64(INITTEXT)
|
|
|
|
|
Segtext.Filelen = Segtext.Length
|
2015-04-19 19:33:58 -07:00
|
|
|
if HEADTYPE == obj.Hnacl {
|
2015-02-27 22:57:28 -05:00
|
|
|
va += 32 // room for the "halt sled"
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if Segrodata.Sect != nil {
|
|
|
|
|
// align to page boundary so as not to mix
|
|
|
|
|
// rodata and executable text.
|
|
|
|
|
va = uint64(Rnd(int64(va), int64(INITRND)))
|
|
|
|
|
|
|
|
|
|
Segrodata.Rwx = 04
|
|
|
|
|
Segrodata.Vaddr = va
|
|
|
|
|
Segrodata.Fileoff = va - Segtext.Vaddr + Segtext.Fileoff
|
|
|
|
|
Segrodata.Filelen = 0
|
2015-03-02 12:35:15 -05:00
|
|
|
for s := Segrodata.Sect; s != nil; s = s.Next {
|
2015-02-27 22:57:28 -05:00
|
|
|
va = uint64(Rnd(int64(va), int64(s.Align)))
|
|
|
|
|
s.Vaddr = va
|
|
|
|
|
va += s.Length
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Segrodata.Length = va - Segrodata.Vaddr
|
|
|
|
|
Segrodata.Filelen = Segrodata.Length
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
va = uint64(Rnd(int64(va), int64(INITRND)))
|
|
|
|
|
Segdata.Rwx = 06
|
|
|
|
|
Segdata.Vaddr = va
|
|
|
|
|
Segdata.Fileoff = va - Segtext.Vaddr + Segtext.Fileoff
|
|
|
|
|
Segdata.Filelen = 0
|
2015-04-19 19:33:58 -07:00
|
|
|
if HEADTYPE == obj.Hwindows {
|
2015-02-27 22:57:28 -05:00
|
|
|
Segdata.Fileoff = Segtext.Fileoff + uint64(Rnd(int64(Segtext.Length), PEFILEALIGN))
|
|
|
|
|
}
|
2015-04-19 19:33:58 -07:00
|
|
|
if HEADTYPE == obj.Hplan9 {
|
2015-02-27 22:57:28 -05:00
|
|
|
Segdata.Fileoff = Segtext.Fileoff + Segtext.Filelen
|
|
|
|
|
}
|
2015-03-02 14:22:05 -05:00
|
|
|
var data *Section
|
|
|
|
|
var noptr *Section
|
|
|
|
|
var bss *Section
|
|
|
|
|
var noptrbss *Section
|
2015-03-02 12:35:15 -05:00
|
|
|
var vlen int64
|
|
|
|
|
for s := Segdata.Sect; s != nil; s = s.Next {
|
2015-08-11 12:29:00 +12:00
|
|
|
if Iself && s.Name == ".tbss" {
|
|
|
|
|
continue
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
vlen = int64(s.Length)
|
2015-08-11 12:29:00 +12:00
|
|
|
if s.Next != nil && !(Iself && s.Next.Name == ".tbss") {
|
2015-02-27 22:57:28 -05:00
|
|
|
vlen = int64(s.Next.Vaddr - s.Vaddr)
|
|
|
|
|
}
|
|
|
|
|
s.Vaddr = va
|
|
|
|
|
va += uint64(vlen)
|
|
|
|
|
Segdata.Length = va - Segdata.Vaddr
|
|
|
|
|
if s.Name == ".data" {
|
|
|
|
|
data = s
|
|
|
|
|
}
|
|
|
|
|
if s.Name == ".noptrdata" {
|
|
|
|
|
noptr = s
|
|
|
|
|
}
|
|
|
|
|
if s.Name == ".bss" {
|
|
|
|
|
bss = s
|
|
|
|
|
}
|
|
|
|
|
if s.Name == ".noptrbss" {
|
|
|
|
|
noptrbss = s
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Segdata.Filelen = bss.Vaddr - Segdata.Vaddr
|
|
|
|
|
|
2016-03-14 09:23:04 -07:00
|
|
|
va = uint64(Rnd(int64(va), int64(INITRND)))
|
|
|
|
|
Segdwarf.Rwx = 06
|
|
|
|
|
Segdwarf.Vaddr = va
|
|
|
|
|
Segdwarf.Fileoff = Segdata.Fileoff + uint64(Rnd(int64(Segdata.Filelen), int64(INITRND)))
|
|
|
|
|
Segdwarf.Filelen = 0
|
|
|
|
|
if HEADTYPE == obj.Hwindows {
|
|
|
|
|
Segdwarf.Fileoff = Segdata.Fileoff + uint64(Rnd(int64(Segdata.Filelen), int64(PEFILEALIGN)))
|
|
|
|
|
}
|
|
|
|
|
for s := Segdwarf.Sect; s != nil; s = s.Next {
|
|
|
|
|
vlen = int64(s.Length)
|
|
|
|
|
if s.Next != nil {
|
|
|
|
|
vlen = int64(s.Next.Vaddr - s.Vaddr)
|
|
|
|
|
}
|
|
|
|
|
s.Vaddr = va
|
|
|
|
|
va += uint64(vlen)
|
|
|
|
|
if HEADTYPE == obj.Hwindows {
|
|
|
|
|
va = uint64(Rnd(int64(va), PEFILEALIGN))
|
|
|
|
|
}
|
|
|
|
|
Segdwarf.Length = va - Segdwarf.Vaddr
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Segdwarf.Filelen = va - Segdwarf.Vaddr
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
text := Segtext.Sect
|
|
|
|
|
var rodata *Section
|
2015-02-27 22:57:28 -05:00
|
|
|
if Segrodata.Sect != nil {
|
|
|
|
|
rodata = Segrodata.Sect
|
|
|
|
|
} else {
|
|
|
|
|
rodata = text.Next
|
|
|
|
|
}
|
2016-03-27 10:21:48 -04:00
|
|
|
var relrodata *Section
|
2015-03-02 12:35:15 -05:00
|
|
|
typelink := rodata.Next
|
2015-05-21 13:07:19 +12:00
|
|
|
if UseRelro() {
|
|
|
|
|
// There is another section (.data.rel.ro) when building a shared
|
|
|
|
|
// object on elf systems.
|
2016-03-27 10:21:48 -04:00
|
|
|
relrodata = typelink
|
2015-05-21 13:07:19 +12:00
|
|
|
typelink = typelink.Next
|
|
|
|
|
}
|
2016-03-17 07:00:33 -07:00
|
|
|
itablink := typelink.Next
|
|
|
|
|
symtab := itablink.Next
|
2015-03-02 12:35:15 -05:00
|
|
|
pclntab := symtab.Next
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range datap {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Cursym = s
|
2016-04-18 14:50:14 -04:00
|
|
|
if s.Sect != nil {
|
|
|
|
|
s.Value += int64(s.Sect.Vaddr)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
for sub := s.Sub; sub != nil; sub = sub.Sub {
|
|
|
|
|
sub.Value += s.Value
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
|
2016-04-22 10:31:14 +12:00
|
|
|
for _, sym := range dwarfp {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Cursym = sym
|
2016-03-14 09:23:04 -07:00
|
|
|
if sym.Sect != nil {
|
|
|
|
|
sym.Value += int64(sym.Sect.Vaddr)
|
|
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
for sub := sym.Sub; sub != nil; sub = sub.Sub {
|
2016-03-14 09:23:04 -07:00
|
|
|
sub.Value += sym.Value
|
|
|
|
|
}
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2015-05-25 13:59:08 +12:00
|
|
|
if Buildmode == BuildmodeShared {
|
2016-08-19 22:40:38 -04:00
|
|
|
s := Linklookup(ctxt, "go.link.abihashbytes", 0)
|
|
|
|
|
sectSym := Linklookup(ctxt, ".note.go.abihash", 0)
|
2015-05-25 13:59:08 +12:00
|
|
|
s.Sect = sectSym.Sect
|
|
|
|
|
s.Value = int64(sectSym.Sect.Vaddr + 16)
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-27 10:21:48 -04:00
|
|
|
types := relrodata
|
|
|
|
|
if types == nil {
|
|
|
|
|
types = rodata
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.xdefine("runtime.text", obj.STEXT, int64(text.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.etext", obj.STEXT, int64(text.Vaddr+text.Length))
|
2015-12-13 08:02:29 -05:00
|
|
|
if HEADTYPE == obj.Hwindows {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.xdefine(".text", obj.STEXT, int64(text.Vaddr))
|
|
|
|
|
}
|
|
|
|
|
ctxt.xdefine("runtime.rodata", obj.SRODATA, int64(rodata.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.erodata", obj.SRODATA, int64(rodata.Vaddr+rodata.Length))
|
|
|
|
|
ctxt.xdefine("runtime.types", obj.SRODATA, int64(types.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.etypes", obj.SRODATA, int64(types.Vaddr+types.Length))
|
|
|
|
|
ctxt.xdefine("runtime.typelink", obj.SRODATA, int64(typelink.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.etypelink", obj.SRODATA, int64(typelink.Vaddr+typelink.Length))
|
|
|
|
|
ctxt.xdefine("runtime.itablink", obj.SRODATA, int64(itablink.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.eitablink", obj.SRODATA, int64(itablink.Vaddr+itablink.Length))
|
|
|
|
|
|
|
|
|
|
sym := Linklookup(ctxt, "runtime.gcdata", 0)
|
2016-03-02 07:59:49 -05:00
|
|
|
sym.Attr |= AttrLocal
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.xdefine("runtime.egcdata", obj.SRODATA, Symaddr(ctxt, sym)+sym.Size)
|
|
|
|
|
Linklookup(ctxt, "runtime.egcdata", 0).Sect = sym.Sect
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
sym = Linklookup(ctxt, "runtime.gcbss", 0)
|
2016-03-02 07:59:49 -05:00
|
|
|
sym.Attr |= AttrLocal
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.xdefine("runtime.egcbss", obj.SRODATA, Symaddr(ctxt, sym)+sym.Size)
|
|
|
|
|
Linklookup(ctxt, "runtime.egcbss", 0).Sect = sym.Sect
|
|
|
|
|
|
|
|
|
|
ctxt.xdefine("runtime.symtab", obj.SRODATA, int64(symtab.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.esymtab", obj.SRODATA, int64(symtab.Vaddr+symtab.Length))
|
|
|
|
|
ctxt.xdefine("runtime.pclntab", obj.SRODATA, int64(pclntab.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.epclntab", obj.SRODATA, int64(pclntab.Vaddr+pclntab.Length))
|
|
|
|
|
ctxt.xdefine("runtime.noptrdata", obj.SNOPTRDATA, int64(noptr.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.enoptrdata", obj.SNOPTRDATA, int64(noptr.Vaddr+noptr.Length))
|
|
|
|
|
ctxt.xdefine("runtime.bss", obj.SBSS, int64(bss.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.ebss", obj.SBSS, int64(bss.Vaddr+bss.Length))
|
|
|
|
|
ctxt.xdefine("runtime.data", obj.SDATA, int64(data.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.edata", obj.SDATA, int64(data.Vaddr+data.Length))
|
|
|
|
|
ctxt.xdefine("runtime.noptrbss", obj.SNOPTRBSS, int64(noptrbss.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.enoptrbss", obj.SNOPTRBSS, int64(noptrbss.Vaddr+noptrbss.Length))
|
|
|
|
|
ctxt.xdefine("runtime.end", obj.SBSS, int64(Segdata.Vaddr+Segdata.Length))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|