2015-02-27 22:57:28 -05:00
|
|
|
// Derived from Inferno utils/6l/obj.c and utils/6l/span.c
|
2016-08-28 17:04:46 -07:00
|
|
|
// https://bitbucket.org/inferno-os/inferno-os/src/default/utils/6l/obj.c
|
|
|
|
|
// https://bitbucket.org/inferno-os/inferno-os/src/default/utils/6l/span.c
|
2015-02-27 22:57:28 -05:00
|
|
|
//
|
|
|
|
|
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
|
|
|
|
|
// Portions Copyright © 1995-1997 C H Forsyth (forsyth@terzarima.net)
|
|
|
|
|
// Portions Copyright © 1997-1999 Vita Nuova Limited
|
|
|
|
|
// Portions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com)
|
|
|
|
|
// Portions Copyright © 2004,2006 Bruce Ellis
|
|
|
|
|
// Portions Copyright © 2005-2007 C H Forsyth (forsyth@terzarima.net)
|
|
|
|
|
// Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
|
2016-04-10 14:32:26 -07:00
|
|
|
// Portions Copyright © 2009 The Go Authors. All rights reserved.
|
2015-02-27 22:57:28 -05:00
|
|
|
//
|
|
|
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
|
|
|
// in the Software without restriction, including without limitation the rights
|
|
|
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
|
|
|
// furnished to do so, subject to the following conditions:
|
|
|
|
|
//
|
|
|
|
|
// The above copyright notice and this permission notice shall be included in
|
|
|
|
|
// all copies or substantial portions of the Software.
|
|
|
|
|
//
|
|
|
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
|
// THE SOFTWARE.
|
|
|
|
|
|
|
|
|
|
package ld
|
|
|
|
|
|
|
|
|
|
import (
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
"cmd/internal/gcprog"
|
2017-04-18 12:53:25 -07:00
|
|
|
"cmd/internal/objabi"
|
2016-04-06 12:01:40 -07:00
|
|
|
"cmd/internal/sys"
|
2015-02-27 22:57:28 -05:00
|
|
|
"fmt"
|
|
|
|
|
"log"
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
"os"
|
2016-03-09 16:23:25 +02:00
|
|
|
"sort"
|
2015-06-04 15:15:48 -04:00
|
|
|
"strconv"
|
2015-02-27 22:57:28 -05:00
|
|
|
"strings"
|
2016-04-18 14:50:14 -04:00
|
|
|
"sync"
|
2015-02-27 22:57:28 -05:00
|
|
|
)
|
|
|
|
|
|
2016-09-20 15:31:26 +12:00
|
|
|
func Symgrow(s *Symbol, siz int64) {
|
2015-02-27 22:57:28 -05:00
|
|
|
if int64(int(siz)) != siz {
|
|
|
|
|
log.Fatalf("symgrow size %d too long", siz)
|
|
|
|
|
}
|
|
|
|
|
if int64(len(s.P)) >= siz {
|
|
|
|
|
return
|
|
|
|
|
}
|
2016-04-07 18:00:57 +03:00
|
|
|
if cap(s.P) < int(siz) {
|
|
|
|
|
p := make([]byte, 2*(siz+1))
|
|
|
|
|
s.P = append(p[:0], s.P...)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
s.P = s.P[:siz]
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Addrel(s *Symbol) *Reloc {
|
2015-02-27 22:57:28 -05:00
|
|
|
s.R = append(s.R, Reloc{})
|
|
|
|
|
return &s.R[len(s.R)-1]
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func setuintxx(ctxt *Link, s *Symbol, off int64, v uint64, wid int64) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Size < off+wid {
|
|
|
|
|
s.Size = off + wid
|
2016-09-20 15:31:26 +12:00
|
|
|
Symgrow(s, s.Size)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
switch wid {
|
|
|
|
|
case 1:
|
|
|
|
|
s.P[off] = uint8(v)
|
|
|
|
|
case 2:
|
|
|
|
|
ctxt.Arch.ByteOrder.PutUint16(s.P[off:], uint16(v))
|
|
|
|
|
case 4:
|
|
|
|
|
ctxt.Arch.ByteOrder.PutUint32(s.P[off:], uint32(v))
|
|
|
|
|
case 8:
|
2016-04-14 19:04:45 -07:00
|
|
|
ctxt.Arch.ByteOrder.PutUint64(s.P[off:], v)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return off + wid
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-20 15:31:26 +12:00
|
|
|
func Addbytes(s *Symbol, bytes []byte) int64 {
|
2016-03-14 09:23:04 -07:00
|
|
|
if s.Type == 0 {
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SDATA
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
|
|
|
|
s.Attr |= AttrReachable
|
|
|
|
|
s.P = append(s.P, bytes...)
|
2016-04-07 18:00:57 +03:00
|
|
|
s.Size = int64(len(s.P))
|
2016-03-14 09:23:04 -07:00
|
|
|
|
|
|
|
|
return s.Size
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func adduintxx(ctxt *Link, s *Symbol, v uint64, wid int) int64 {
|
2015-03-02 12:35:15 -05:00
|
|
|
off := s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
setuintxx(ctxt, s, off, v, int64(wid))
|
|
|
|
|
return off
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Adduint8(ctxt *Link, s *Symbol, v uint8) int64 {
|
2016-04-07 18:00:57 +03:00
|
|
|
off := s.Size
|
|
|
|
|
if s.Type == 0 {
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SDATA
|
2016-04-07 18:00:57 +03:00
|
|
|
}
|
|
|
|
|
s.Attr |= AttrReachable
|
|
|
|
|
s.Size++
|
|
|
|
|
s.P = append(s.P, v)
|
|
|
|
|
|
|
|
|
|
return off
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Adduint16(ctxt *Link, s *Symbol, v uint16) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return adduintxx(ctxt, s, uint64(v), 2)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Adduint32(ctxt *Link, s *Symbol, v uint32) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return adduintxx(ctxt, s, uint64(v), 4)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Adduint64(ctxt *Link, s *Symbol, v uint64) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return adduintxx(ctxt, s, v, 8)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func adduint(ctxt *Link, s *Symbol, v uint64) int64 {
|
2017-04-21 18:44:34 -07:00
|
|
|
return adduintxx(ctxt, s, v, SysArch.PtrSize)
|
2015-03-16 11:53:08 +13:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func setuint8(ctxt *Link, s *Symbol, r int64, v uint8) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return setuintxx(ctxt, s, r, uint64(v), 1)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func setuint32(ctxt *Link, s *Symbol, r int64, v uint32) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return setuintxx(ctxt, s, r, uint64(v), 4)
|
|
|
|
|
}
|
|
|
|
|
|
2017-04-21 19:05:38 -07:00
|
|
|
func setuint(ctxt *Link, s *Symbol, r int64, v uint64) int64 {
|
|
|
|
|
return setuintxx(ctxt, s, r, v, int64(SysArch.PtrSize))
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Addaddrplus(ctxt *Link, s *Symbol, t *Symbol, add int64) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2015-03-02 12:35:15 -05:00
|
|
|
i := s.Size
|
2016-04-06 12:01:40 -07:00
|
|
|
s.Size += int64(ctxt.Arch.PtrSize)
|
2016-09-20 15:31:26 +12:00
|
|
|
Symgrow(s, s.Size)
|
2015-03-02 12:35:15 -05:00
|
|
|
r := Addrel(s)
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Sym = t
|
|
|
|
|
r.Off = int32(i)
|
2016-04-06 12:01:40 -07:00
|
|
|
r.Siz = uint8(ctxt.Arch.PtrSize)
|
2017-04-18 12:53:25 -07:00
|
|
|
r.Type = objabi.R_ADDR
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Add = add
|
|
|
|
|
return i + int64(r.Siz)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Addpcrelplus(ctxt *Link, s *Symbol, t *Symbol, add int64) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2015-03-02 12:35:15 -05:00
|
|
|
i := s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Size += 4
|
2016-09-20 15:31:26 +12:00
|
|
|
Symgrow(s, s.Size)
|
2015-03-02 12:35:15 -05:00
|
|
|
r := Addrel(s)
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Sym = t
|
|
|
|
|
r.Off = int32(i)
|
|
|
|
|
r.Add = add
|
2017-04-18 12:53:25 -07:00
|
|
|
r.Type = objabi.R_PCREL
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Siz = 4
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family == sys.S390X {
|
2016-03-18 16:57:54 -04:00
|
|
|
r.Variant = RV_390_DBL
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
return i + int64(r.Siz)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func Addaddr(ctxt *Link, s *Symbol, t *Symbol) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return Addaddrplus(ctxt, s, t, 0)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func setaddrplus(ctxt *Link, s *Symbol, off int64, t *Symbol, add int64) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2016-04-06 12:01:40 -07:00
|
|
|
if off+int64(ctxt.Arch.PtrSize) > s.Size {
|
|
|
|
|
s.Size = off + int64(ctxt.Arch.PtrSize)
|
2016-09-20 15:31:26 +12:00
|
|
|
Symgrow(s, s.Size)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
r := Addrel(s)
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Sym = t
|
|
|
|
|
r.Off = int32(off)
|
2016-04-06 12:01:40 -07:00
|
|
|
r.Siz = uint8(ctxt.Arch.PtrSize)
|
2017-04-18 12:53:25 -07:00
|
|
|
r.Type = objabi.R_ADDR
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Add = add
|
|
|
|
|
return off + int64(r.Siz)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func setaddr(ctxt *Link, s *Symbol, off int64, t *Symbol) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return setaddrplus(ctxt, s, off, t, 0)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func addsize(ctxt *Link, s *Symbol, t *Symbol) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2015-03-02 12:35:15 -05:00
|
|
|
i := s.Size
|
2016-04-06 12:01:40 -07:00
|
|
|
s.Size += int64(ctxt.Arch.PtrSize)
|
2016-09-20 15:31:26 +12:00
|
|
|
Symgrow(s, s.Size)
|
2015-03-02 12:35:15 -05:00
|
|
|
r := Addrel(s)
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Sym = t
|
|
|
|
|
r.Off = int32(i)
|
2016-04-06 12:01:40 -07:00
|
|
|
r.Siz = uint8(ctxt.Arch.PtrSize)
|
2017-04-18 12:53:25 -07:00
|
|
|
r.Type = objabi.R_SIZE
|
2015-02-27 22:57:28 -05:00
|
|
|
return i + int64(r.Siz)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func addaddrplus4(ctxt *Link, s *Symbol, t *Symbol, add int64) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2015-03-02 12:35:15 -05:00
|
|
|
i := s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Size += 4
|
2016-09-20 15:31:26 +12:00
|
|
|
Symgrow(s, s.Size)
|
2015-03-02 12:35:15 -05:00
|
|
|
r := Addrel(s)
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Sym = t
|
|
|
|
|
r.Off = int32(i)
|
|
|
|
|
r.Siz = 4
|
2017-04-18 12:53:25 -07:00
|
|
|
r.Type = objabi.R_ADDR
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Add = add
|
|
|
|
|
return i + int64(r.Siz)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
2016-08-22 10:53:05 +12:00
|
|
|
* divide-and-conquer list-link (by Sub) sort of Symbol* by Value.
|
|
|
|
|
* Used for sub-symbols when loading host objects (see e.g. ldelf.go).
|
2015-02-27 22:57:28 -05:00
|
|
|
*/
|
|
|
|
|
|
2016-08-22 10:53:05 +12:00
|
|
|
func listsort(l *Symbol) *Symbol {
|
|
|
|
|
if l == nil || l.Sub == nil {
|
2015-02-27 22:57:28 -05:00
|
|
|
return l
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
l1 := l
|
|
|
|
|
l2 := l
|
2015-02-27 22:57:28 -05:00
|
|
|
for {
|
2016-08-22 10:53:05 +12:00
|
|
|
l2 = l2.Sub
|
2015-02-27 22:57:28 -05:00
|
|
|
if l2 == nil {
|
|
|
|
|
break
|
|
|
|
|
}
|
2016-08-22 10:53:05 +12:00
|
|
|
l2 = l2.Sub
|
2015-02-27 22:57:28 -05:00
|
|
|
if l2 == nil {
|
|
|
|
|
break
|
|
|
|
|
}
|
2016-08-22 10:53:05 +12:00
|
|
|
l1 = l1.Sub
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-22 10:53:05 +12:00
|
|
|
l2 = l1.Sub
|
|
|
|
|
l1.Sub = nil
|
|
|
|
|
l1 = listsort(l)
|
|
|
|
|
l2 = listsort(l2)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
/* set up lead element */
|
2016-08-22 10:53:05 +12:00
|
|
|
if l1.Value < l2.Value {
|
2015-02-27 22:57:28 -05:00
|
|
|
l = l1
|
2016-08-22 10:53:05 +12:00
|
|
|
l1 = l1.Sub
|
2015-02-27 22:57:28 -05:00
|
|
|
} else {
|
|
|
|
|
l = l2
|
2016-08-22 10:53:05 +12:00
|
|
|
l2 = l2.Sub
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
le := l
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
for {
|
|
|
|
|
if l1 == nil {
|
|
|
|
|
for l2 != nil {
|
2016-08-22 10:53:05 +12:00
|
|
|
le.Sub = l2
|
2015-02-27 22:57:28 -05:00
|
|
|
le = l2
|
2016-08-22 10:53:05 +12:00
|
|
|
l2 = l2.Sub
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-22 10:53:05 +12:00
|
|
|
le.Sub = nil
|
2015-02-27 22:57:28 -05:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if l2 == nil {
|
|
|
|
|
for l1 != nil {
|
2016-08-22 10:53:05 +12:00
|
|
|
le.Sub = l1
|
2015-02-27 22:57:28 -05:00
|
|
|
le = l1
|
2016-08-22 10:53:05 +12:00
|
|
|
l1 = l1.Sub
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-22 10:53:05 +12:00
|
|
|
if l1.Value < l2.Value {
|
|
|
|
|
le.Sub = l1
|
2015-02-27 22:57:28 -05:00
|
|
|
le = l1
|
2016-08-22 10:53:05 +12:00
|
|
|
l1 = l1.Sub
|
2015-02-27 22:57:28 -05:00
|
|
|
} else {
|
2016-08-22 10:53:05 +12:00
|
|
|
le.Sub = l2
|
2015-02-27 22:57:28 -05:00
|
|
|
le = l2
|
2016-08-22 10:53:05 +12:00
|
|
|
l2 = l2.Sub
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-22 10:53:05 +12:00
|
|
|
le.Sub = nil
|
2015-02-27 22:57:28 -05:00
|
|
|
return l
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-14 14:47:12 -04:00
|
|
|
// isRuntimeDepPkg returns whether pkg is the runtime package or its dependency
|
|
|
|
|
func isRuntimeDepPkg(pkg string) bool {
|
|
|
|
|
switch pkg {
|
|
|
|
|
case "runtime",
|
|
|
|
|
"sync/atomic": // runtime may call to sync/atomic, due to go:linkname
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
return strings.HasPrefix(pkg, "runtime/internal/") && !strings.HasSuffix(pkg, "_test")
|
|
|
|
|
}
|
|
|
|
|
|
2017-06-08 08:26:19 -04:00
|
|
|
// Estimate the max size needed to hold any new trampolines created for this function. This
|
|
|
|
|
// is used to determine when the section can be split if it becomes too large, to ensure that
|
|
|
|
|
// the trampolines are in the same section as the function that uses them.
|
|
|
|
|
func maxSizeTrampolinesPPC64(s *Symbol, isTramp bool) uint64 {
|
|
|
|
|
// If Thearch.Trampoline is nil, then trampoline support is not available on this arch.
|
|
|
|
|
// A trampoline does not need any dependent trampolines.
|
|
|
|
|
if Thearch.Trampoline == nil || isTramp {
|
|
|
|
|
return 0
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
n := uint64(0)
|
|
|
|
|
for ri := range s.R {
|
|
|
|
|
r := &s.R[ri]
|
|
|
|
|
if r.Type.IsDirectJump() {
|
|
|
|
|
n++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// Trampolines in ppc64 are 4 instructions.
|
|
|
|
|
return n * 16
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-14 14:47:12 -04:00
|
|
|
// detect too-far jumps in function s, and add trampolines if necessary
|
2017-06-08 08:26:19 -04:00
|
|
|
// ARM, PPC64 & PPC64LE support trampoline insertion for internal and external linking
|
|
|
|
|
// On PPC64 & PPC64LE the text sections might be split but will still insert trampolines
|
|
|
|
|
// where necessary.
|
2016-09-14 14:47:12 -04:00
|
|
|
func trampoline(ctxt *Link, s *Symbol) {
|
|
|
|
|
if Thearch.Trampoline == nil {
|
|
|
|
|
return // no need or no support of trampolines on this arch
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for ri := range s.R {
|
|
|
|
|
r := &s.R[ri]
|
|
|
|
|
if !r.Type.IsDirectJump() {
|
|
|
|
|
continue
|
|
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
if Symaddr(r.Sym) == 0 && r.Sym.Type != SDYNIMPORT {
|
2016-09-14 14:47:12 -04:00
|
|
|
if r.Sym.File != s.File {
|
|
|
|
|
if !isRuntimeDepPkg(s.File) || !isRuntimeDepPkg(r.Sym.File) {
|
|
|
|
|
Errorf(s, "unresolved inter-package jump to %s(%s)", r.Sym, r.Sym.File)
|
|
|
|
|
}
|
|
|
|
|
// runtime and its dependent packages may call to each other.
|
|
|
|
|
// they are fine, as they will be laid down together.
|
|
|
|
|
}
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Thearch.Trampoline(ctxt, r, s)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// resolve relocations in s.
|
2016-08-19 22:40:38 -04:00
|
|
|
func relocsym(ctxt *Link, s *Symbol) {
|
2015-02-27 22:57:28 -05:00
|
|
|
var r *Reloc
|
2016-08-19 11:35:54 -04:00
|
|
|
var rs *Symbol
|
2015-02-27 22:57:28 -05:00
|
|
|
var i16 int16
|
|
|
|
|
var off int32
|
|
|
|
|
var siz int32
|
|
|
|
|
var fl int32
|
|
|
|
|
var o int64
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
for ri := int32(0); ri < int32(len(s.R)); ri++ {
|
2015-02-27 22:57:28 -05:00
|
|
|
r = &s.R[ri]
|
2016-09-14 14:47:12 -04:00
|
|
|
|
2017-08-27 22:00:00 +09:00
|
|
|
r.Done = true
|
2015-02-27 22:57:28 -05:00
|
|
|
off = r.Off
|
|
|
|
|
siz = int32(r.Siz)
|
|
|
|
|
if off < 0 || off+siz > int32(len(s.P)) {
|
2016-09-17 09:39:33 -04:00
|
|
|
rname := ""
|
|
|
|
|
if r.Sym != nil {
|
|
|
|
|
rname = r.Sym.Name
|
|
|
|
|
}
|
|
|
|
|
Errorf(s, "invalid relocation %s: %d+%d not in [%d,%d)", rname, off, siz, 0, len(s.P))
|
2015-02-27 22:57:28 -05:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
2017-05-01 00:56:09 +00:00
|
|
|
if r.Sym != nil && (r.Sym.Type&(SMASK|SHIDDEN) == 0 || r.Sym.Type&SMASK == SXREF) {
|
2015-03-30 02:59:10 +00:00
|
|
|
// When putting the runtime but not main into a shared library
|
|
|
|
|
// these symbols are undefined and that's OK.
|
2016-07-28 13:04:41 -04:00
|
|
|
if Buildmode == BuildmodeShared {
|
|
|
|
|
if r.Sym.Name == "main.main" || r.Sym.Name == "main.init" {
|
2017-04-19 15:15:35 +12:00
|
|
|
r.Sym.Type = SDYNIMPORT
|
2016-07-28 13:04:41 -04:00
|
|
|
} else if strings.HasPrefix(r.Sym.Name, "go.info.") {
|
|
|
|
|
// Skip go.info symbols. They are only needed to communicate
|
|
|
|
|
// DWARF info between the compiler and linker.
|
|
|
|
|
continue
|
|
|
|
|
}
|
2015-03-30 02:59:10 +00:00
|
|
|
} else {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "relocation target %s not defined", r.Sym.Name)
|
2015-03-30 02:59:10 +00:00
|
|
|
continue
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if r.Type >= 256 {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if r.Siz == 0 { // informational relocation - no work to do
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-30 02:59:10 +00:00
|
|
|
// We need to be able to reference dynimport symbols when linking against
|
|
|
|
|
// shared libraries, and Solaris needs it always
|
2017-04-19 15:15:35 +12:00
|
|
|
if Headtype != objabi.Hsolaris && r.Sym != nil && r.Sym.Type == SDYNIMPORT && !ctxt.DynlinkingGo() {
|
2016-04-06 12:01:40 -07:00
|
|
|
if !(SysArch.Family == sys.PPC64 && Linkmode == LinkExternal && r.Sym.Name == ".TOC.") {
|
2017-08-17 09:45:20 +09:00
|
|
|
Errorf(s, "unhandled relocation for %s (type %d (%s) rtype %d (%s))", r.Sym.Name, r.Sym.Type, r.Sym.Type, r.Type, RelocName(r.Type))
|
cmd/compile, cmd/link, runtime: on ppc64x, maintain the TOC pointer in R2 when compiling PIC
The PowerPC ISA does not have a PC-relative load instruction, which poses
obvious challenges when generating position-independent code. The way the ELFv2
ABI addresses this is to specify that r2 points to a per "module" (shared
library or executable) TOC pointer. Maintaining this pointer requires
cooperation between codegen and the system linker:
* Non-leaf functions leave space on the stack at r1+24 to save the TOC pointer.
* A call to a function that *might* have to go via a PLT stub must be followed
by a nop instruction that the system linker can replace with "ld r1, 24(r1)"
to restore the TOC pointer (only when dynamically linking Go code).
* When calling a function via a function pointer, the address of the function
must be in r12, and the first couple of instructions (the "global entry
point") of the called function use this to derive the address of the TOC
for the module it is in.
* When calling a function that is implemented in the same module, the system
linker adjusts the call to skip over the instructions mentioned above (the
"local entry point"), assuming that r2 is already correctly set.
So this changeset adds the global entry point instructions, sets the metadata so
the system linker knows where the local entry point is, inserts code to save the
TOC pointer at 24(r1), adds a nop after any call not known to be local and copes
with the odd non-local code transfer in the runtime (e.g. the stuff around
jmpdefer). It does not actually compile PIC yet.
Change-Id: I7522e22bdfd2f891745a900c60254fe9e372c854
Reviewed-on: https://go-review.googlesource.com/15967
Reviewed-by: Russ Cox <rsc@golang.org>
2015-10-16 15:42:09 +13:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
if r.Sym != nil && r.Sym.Type != STLSBSS && r.Type != objabi.R_WEAKADDROFF && !r.Sym.Attr.Reachable() {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "unreachable sym in relocation: %s", r.Sym.Name)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-03-18 16:57:54 -04:00
|
|
|
// TODO(mundaym): remove this special case - see issue 14218.
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family == sys.S390X {
|
2016-03-18 16:57:54 -04:00
|
|
|
switch r.Type {
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_PCRELDBL:
|
|
|
|
|
r.Type = objabi.R_PCREL
|
2016-03-18 16:57:54 -04:00
|
|
|
r.Variant = RV_390_DBL
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_CALL:
|
2016-03-18 16:57:54 -04:00
|
|
|
r.Variant = RV_390_DBL
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-02-27 22:57:28 -05:00
|
|
|
switch r.Type {
|
|
|
|
|
default:
|
2015-08-03 14:08:17 +12:00
|
|
|
switch siz {
|
|
|
|
|
default:
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "bad reloc size %#x for %s", uint32(siz), r.Sym.Name)
|
2015-08-03 14:08:17 +12:00
|
|
|
case 1:
|
|
|
|
|
o = int64(s.P[off])
|
|
|
|
|
case 2:
|
2016-08-19 22:40:38 -04:00
|
|
|
o = int64(ctxt.Arch.ByteOrder.Uint16(s.P[off:]))
|
2015-08-03 14:08:17 +12:00
|
|
|
case 4:
|
2016-08-19 22:40:38 -04:00
|
|
|
o = int64(ctxt.Arch.ByteOrder.Uint32(s.P[off:]))
|
2015-08-03 14:08:17 +12:00
|
|
|
case 8:
|
2016-08-19 22:40:38 -04:00
|
|
|
o = int64(ctxt.Arch.ByteOrder.Uint64(s.P[off:]))
|
2015-08-03 14:08:17 +12:00
|
|
|
}
|
2017-08-27 22:00:00 +09:00
|
|
|
if !Thearch.Archreloc(ctxt, r, s, &o) {
|
2017-08-17 09:45:20 +09:00
|
|
|
Errorf(s, "unknown reloc to %v: %d (%s)", r.Sym.Name, r.Type, RelocName(r.Type))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_TLS_LE:
|
|
|
|
|
isAndroidX86 := objabi.GOOS == "android" && (SysArch.InFamily(sys.AMD64, sys.I386))
|
2015-10-16 14:04:29 -04:00
|
|
|
|
2017-04-12 01:10:01 +10:00
|
|
|
if Linkmode == LinkExternal && Iself && !isAndroidX86 {
|
2017-08-27 22:00:00 +09:00
|
|
|
r.Done = false
|
2015-09-02 10:35:54 +12:00
|
|
|
if r.Sym == nil {
|
2016-08-19 22:40:38 -04:00
|
|
|
r.Sym = ctxt.Tlsg
|
2015-09-02 10:35:54 +12:00
|
|
|
}
|
|
|
|
|
r.Xsym = r.Sym
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Xadd = r.Add
|
|
|
|
|
o = 0
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family != sys.AMD64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
o = r.Add
|
|
|
|
|
}
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-06 12:01:40 -07:00
|
|
|
if Iself && SysArch.Family == sys.ARM {
|
2015-09-02 10:35:54 +12:00
|
|
|
// On ELF ARM, the thread pointer is 8 bytes before
|
|
|
|
|
// the start of the thread-local data block, so add 8
|
|
|
|
|
// to the actual TLS offset (r->sym->value).
|
|
|
|
|
// This 8 seems to be a fundamental constant of
|
|
|
|
|
// ELF on ARM (or maybe Glibc on ARM); it is not
|
|
|
|
|
// related to the fact that our own TLS storage happens
|
|
|
|
|
// to take up 8 bytes.
|
|
|
|
|
o = 8 + r.Sym.Value
|
2017-04-18 12:53:25 -07:00
|
|
|
} else if Iself || Headtype == objabi.Hplan9 || Headtype == objabi.Hdarwin || isAndroidX86 {
|
2016-08-19 22:40:38 -04:00
|
|
|
o = int64(ctxt.Tlsoffset) + r.Add
|
2017-04-18 12:53:25 -07:00
|
|
|
} else if Headtype == objabi.Hwindows {
|
2015-04-23 21:53:48 +12:00
|
|
|
o = r.Add
|
|
|
|
|
} else {
|
2016-09-10 14:05:31 -07:00
|
|
|
log.Fatalf("unexpected R_TLS_LE relocation for %v", Headtype)
|
2015-04-23 21:53:48 +12:00
|
|
|
}
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_TLS_IE:
|
|
|
|
|
isAndroidX86 := objabi.GOOS == "android" && (SysArch.InFamily(sys.AMD64, sys.I386))
|
2015-10-16 14:04:29 -04:00
|
|
|
|
2017-04-12 01:10:01 +10:00
|
|
|
if Linkmode == LinkExternal && Iself && !isAndroidX86 {
|
2017-08-27 22:00:00 +09:00
|
|
|
r.Done = false
|
2015-09-02 10:35:54 +12:00
|
|
|
if r.Sym == nil {
|
2016-08-19 22:40:38 -04:00
|
|
|
r.Sym = ctxt.Tlsg
|
2015-09-02 10:35:54 +12:00
|
|
|
}
|
|
|
|
|
r.Xsym = r.Sym
|
2015-02-27 22:57:28 -05:00
|
|
|
r.Xadd = r.Add
|
|
|
|
|
o = 0
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family != sys.AMD64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
o = r.Add
|
|
|
|
|
}
|
|
|
|
|
break
|
|
|
|
|
}
|
2016-09-06 07:46:59 -04:00
|
|
|
if Buildmode == BuildmodePIE && Iself {
|
|
|
|
|
// We are linking the final executable, so we
|
|
|
|
|
// can optimize any TLS IE relocation to LE.
|
|
|
|
|
if Thearch.TLSIEtoLE == nil {
|
2016-09-14 11:35:29 -07:00
|
|
|
log.Fatalf("internal linking of TLS IE not supported on %v", SysArch.Family)
|
2016-09-06 07:46:59 -04:00
|
|
|
}
|
|
|
|
|
Thearch.TLSIEtoLE(s, int(off), int(r.Siz))
|
|
|
|
|
o = int64(ctxt.Tlsoffset)
|
2016-09-06 07:46:59 -04:00
|
|
|
// TODO: o += r.Add when SysArch.Family != sys.AMD64?
|
|
|
|
|
// Why do we treat r.Add differently on AMD64?
|
|
|
|
|
// Is the external linker using Xadd at all?
|
2016-09-06 07:46:59 -04:00
|
|
|
} else {
|
|
|
|
|
log.Fatalf("cannot handle R_TLS_IE (sym %s) when linking internally", s.Name)
|
|
|
|
|
}
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_ADDR:
|
2017-04-19 15:15:35 +12:00
|
|
|
if Linkmode == LinkExternal && r.Sym.Type != SCONST {
|
2017-08-27 22:00:00 +09:00
|
|
|
r.Done = false
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
// set up addend for eventual relocation via outer symbol.
|
|
|
|
|
rs = r.Sym
|
|
|
|
|
|
|
|
|
|
r.Xadd = r.Add
|
|
|
|
|
for rs.Outer != nil {
|
2016-09-17 09:39:33 -04:00
|
|
|
r.Xadd += Symaddr(rs) - Symaddr(rs.Outer)
|
2015-02-27 22:57:28 -05:00
|
|
|
rs = rs.Outer
|
|
|
|
|
}
|
|
|
|
|
|
2017-04-19 15:15:35 +12:00
|
|
|
if rs.Type != SHOSTOBJ && rs.Type != SDYNIMPORT && rs.Sect == nil {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "missing section for relocation target %s", rs.Name)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
r.Xsym = rs
|
|
|
|
|
|
|
|
|
|
o = r.Xadd
|
|
|
|
|
if Iself {
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family == sys.AMD64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
o = 0
|
|
|
|
|
}
|
2017-04-18 12:53:25 -07:00
|
|
|
} else if Headtype == objabi.Hdarwin {
|
2015-04-10 21:28:09 -04:00
|
|
|
// ld64 for arm64 has a bug where if the address pointed to by o exists in the
|
|
|
|
|
// symbol table (dynid >= 0), or is inside a symbol that exists in the symbol
|
|
|
|
|
// table, then it will add o twice into the relocated value.
|
|
|
|
|
// The workaround is that on arm64 don't ever add symaddr to o and always use
|
|
|
|
|
// extern relocation by requiring rs->dynid >= 0.
|
2017-04-19 15:15:35 +12:00
|
|
|
if rs.Type != SHOSTOBJ {
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family == sys.ARM64 && rs.Dynid < 0 {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "R_ADDR reloc to %s+%d is not supported on darwin/arm64", rs.Name, o)
|
2015-04-10 21:28:09 -04:00
|
|
|
}
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family != sys.ARM64 {
|
2016-09-17 09:39:33 -04:00
|
|
|
o += Symaddr(rs)
|
2015-04-10 21:28:09 -04:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-04-18 12:53:25 -07:00
|
|
|
} else if Headtype == objabi.Hwindows {
|
2015-03-09 03:05:40 -04:00
|
|
|
// nothing to do
|
2015-02-27 22:57:28 -05:00
|
|
|
} else {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "unhandled pcrel relocation to %s on %v", rs.Name, Headtype)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-17 09:39:33 -04:00
|
|
|
o = Symaddr(r.Sym) + r.Add
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
// On amd64, 4-byte offsets will be sign-extended, so it is impossible to
|
|
|
|
|
// access more than 2GB of static data; fail at link time is better than
|
2015-07-10 17:17:11 -06:00
|
|
|
// fail at runtime. See https://golang.org/issue/7980.
|
2015-02-27 22:57:28 -05:00
|
|
|
// Instead of special casing only amd64, we treat this as an error on all
|
|
|
|
|
// 64-bit architectures so as to be future-proof.
|
2016-04-06 12:01:40 -07:00
|
|
|
if int32(o) < 0 && SysArch.PtrSize > 4 && siz == 4 {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "non-pc-relative relocation address for %s is too big: %#x (%#x + %#x)", r.Sym.Name, uint64(o), Symaddr(r.Sym), r.Add)
|
2015-04-09 07:37:17 -04:00
|
|
|
errorexit()
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_DWARFREF:
|
2017-05-22 20:17:31 -04:00
|
|
|
if r.Sym.Sect == nil {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "missing DWARF section for relocation target %s", r.Sym.Name)
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
2017-05-02 16:46:01 +02:00
|
|
|
|
2016-03-14 09:23:04 -07:00
|
|
|
if Linkmode == LinkExternal {
|
2017-08-27 22:00:00 +09:00
|
|
|
r.Done = false
|
2017-02-08 12:30:30 +11:00
|
|
|
// PE code emits IMAGE_REL_I386_SECREL and IMAGE_REL_AMD64_SECREL
|
|
|
|
|
// for R_DWARFREF relocations, while R_ADDR is replaced with
|
|
|
|
|
// IMAGE_REL_I386_DIR32, IMAGE_REL_AMD64_ADDR64 and IMAGE_REL_AMD64_ADDR32.
|
|
|
|
|
// Do not replace R_DWARFREF with R_ADDR for windows -
|
|
|
|
|
// let PE code emit correct relocations.
|
2017-04-18 12:53:25 -07:00
|
|
|
if Headtype != objabi.Hwindows {
|
|
|
|
|
r.Type = objabi.R_ADDR
|
2017-02-08 12:30:30 +11:00
|
|
|
}
|
2016-03-14 09:23:04 -07:00
|
|
|
|
2017-05-22 20:17:31 -04:00
|
|
|
r.Xsym = ctxt.Syms.ROLookup(r.Sym.Sect.Name, 0)
|
|
|
|
|
r.Xadd = r.Add + Symaddr(r.Sym) - int64(r.Sym.Sect.Vaddr)
|
2017-05-02 16:46:01 +02:00
|
|
|
|
2016-03-14 09:23:04 -07:00
|
|
|
o = r.Xadd
|
|
|
|
|
rs = r.Xsym
|
2016-04-06 12:01:40 -07:00
|
|
|
if Iself && SysArch.Family == sys.AMD64 {
|
2016-03-14 09:23:04 -07:00
|
|
|
o = 0
|
|
|
|
|
}
|
|
|
|
|
break
|
|
|
|
|
}
|
2017-05-22 20:17:31 -04:00
|
|
|
o = Symaddr(r.Sym) + r.Add - int64(r.Sym.Sect.Vaddr)
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_WEAKADDROFF:
|
2016-11-21 16:58:55 -05:00
|
|
|
if !r.Sym.Attr.Reachable() {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
fallthrough
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_ADDROFF:
|
2016-08-25 11:07:33 -05:00
|
|
|
// The method offset tables using this relocation expect the offset to be relative
|
|
|
|
|
// to the start of the first text section, even if there are multiple.
|
|
|
|
|
|
|
|
|
|
if r.Sym.Sect.Name == ".text" {
|
2017-04-18 21:52:06 +12:00
|
|
|
o = Symaddr(r.Sym) - int64(Segtext.Sections[0].Vaddr) + r.Add
|
2016-08-25 11:07:33 -05:00
|
|
|
} else {
|
|
|
|
|
o = Symaddr(r.Sym) - int64(r.Sym.Sect.Vaddr) + r.Add
|
|
|
|
|
}
|
2016-03-27 10:21:48 -04:00
|
|
|
|
2015-02-27 22:57:28 -05:00
|
|
|
// r->sym can be null when CALL $(constant) is transformed from absolute PC to relative PC call.
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_GOTPCREL:
|
2017-04-19 15:15:35 +12:00
|
|
|
if ctxt.DynlinkingGo() && Headtype == objabi.Hdarwin && r.Sym != nil && r.Sym.Type != SCONST {
|
2017-08-27 22:00:00 +09:00
|
|
|
r.Done = false
|
2016-11-13 21:20:58 -05:00
|
|
|
r.Xadd = r.Add
|
|
|
|
|
r.Xadd -= int64(r.Siz) // relative to address after the relocated chunk
|
|
|
|
|
r.Xsym = r.Sym
|
|
|
|
|
|
|
|
|
|
o = r.Xadd
|
|
|
|
|
o += int64(r.Siz)
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
fallthrough
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_CALL, objabi.R_PCREL:
|
2017-04-19 15:15:35 +12:00
|
|
|
if Linkmode == LinkExternal && r.Sym != nil && r.Sym.Type != SCONST && (r.Sym.Sect != s.Sect || r.Type == objabi.R_GOTPCREL) {
|
2017-08-27 22:00:00 +09:00
|
|
|
r.Done = false
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
// set up addend for eventual relocation via outer symbol.
|
|
|
|
|
rs = r.Sym
|
|
|
|
|
|
|
|
|
|
r.Xadd = r.Add
|
|
|
|
|
for rs.Outer != nil {
|
2016-09-17 09:39:33 -04:00
|
|
|
r.Xadd += Symaddr(rs) - Symaddr(rs.Outer)
|
2015-02-27 22:57:28 -05:00
|
|
|
rs = rs.Outer
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
r.Xadd -= int64(r.Siz) // relative to address after the relocated chunk
|
2017-04-19 15:15:35 +12:00
|
|
|
if rs.Type != SHOSTOBJ && rs.Type != SDYNIMPORT && rs.Sect == nil {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "missing section for relocation target %s", rs.Name)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
r.Xsym = rs
|
|
|
|
|
|
|
|
|
|
o = r.Xadd
|
|
|
|
|
if Iself {
|
2016-04-06 12:01:40 -07:00
|
|
|
if SysArch.Family == sys.AMD64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
o = 0
|
|
|
|
|
}
|
2017-04-18 12:53:25 -07:00
|
|
|
} else if Headtype == objabi.Hdarwin {
|
|
|
|
|
if r.Type == objabi.R_CALL {
|
2017-04-19 15:15:35 +12:00
|
|
|
if rs.Type != SHOSTOBJ {
|
2016-09-17 09:39:33 -04:00
|
|
|
o += int64(uint64(Symaddr(rs)) - rs.Sect.Vaddr)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
o -= int64(r.Off) // relative to section offset, not symbol
|
2016-04-26 15:17:56 -04:00
|
|
|
} else if SysArch.Family == sys.ARM {
|
|
|
|
|
// see ../arm/asm.go:/machoreloc1
|
2016-09-17 10:01:17 -04:00
|
|
|
o += Symaddr(rs) - int64(s.Value) - int64(r.Off)
|
2015-02-27 22:57:28 -05:00
|
|
|
} else {
|
|
|
|
|
o += int64(r.Siz)
|
|
|
|
|
}
|
2017-04-18 12:53:25 -07:00
|
|
|
} else if Headtype == objabi.Hwindows && SysArch.Family == sys.AMD64 { // only amd64 needs PCREL
|
2015-03-13 22:10:48 -04:00
|
|
|
// PE/COFF's PC32 relocation uses the address after the relocated
|
|
|
|
|
// bytes as the base. Compensate by skewing the addend.
|
|
|
|
|
o += int64(r.Siz)
|
2015-02-27 22:57:28 -05:00
|
|
|
} else {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "unhandled pcrel relocation to %s on %v", rs.Name, Headtype)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
o = 0
|
|
|
|
|
if r.Sym != nil {
|
2016-09-17 09:39:33 -04:00
|
|
|
o += Symaddr(r.Sym)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-09-06 12:33:36 -04:00
|
|
|
o += r.Add - (s.Value + int64(r.Off) + int64(r.Siz))
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_SIZE:
|
2015-02-27 22:57:28 -05:00
|
|
|
o = r.Sym.Size + r.Add
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if r.Variant != RV_NONE {
|
2016-08-21 13:52:23 -04:00
|
|
|
o = Thearch.Archrelocvariant(ctxt, r, s, o)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2015-03-01 13:32:49 -05:00
|
|
|
if false {
|
|
|
|
|
nam := "<nil>"
|
|
|
|
|
if r.Sym != nil {
|
|
|
|
|
nam = r.Sym.Name
|
|
|
|
|
}
|
2017-08-17 09:45:20 +09:00
|
|
|
fmt.Printf("relocate %s %#x (%#x+%#x, size %d) => %s %#x +%#x [type %d (%s)/%d, %x]\n", s.Name, s.Value+int64(off), s.Value, r.Off, r.Siz, nam, Symaddr(r.Sym), r.Add, r.Type, RelocName(r.Type), r.Variant, o)
|
2015-03-01 13:32:49 -05:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
switch siz {
|
|
|
|
|
default:
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "bad reloc size %#x for %s", uint32(siz), r.Sym.Name)
|
2015-02-27 22:57:28 -05:00
|
|
|
fallthrough
|
|
|
|
|
|
|
|
|
|
// TODO(rsc): Remove.
|
|
|
|
|
case 1:
|
|
|
|
|
s.P[off] = byte(int8(o))
|
|
|
|
|
case 2:
|
|
|
|
|
if o != int64(int16(o)) {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "relocation address for %s is too big: %#x", r.Sym.Name, o)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
i16 = int16(o)
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Arch.ByteOrder.PutUint16(s.P[off:], uint16(i16))
|
2015-02-27 22:57:28 -05:00
|
|
|
case 4:
|
2017-04-18 12:53:25 -07:00
|
|
|
if r.Type == objabi.R_PCREL || r.Type == objabi.R_CALL {
|
2015-02-27 22:57:28 -05:00
|
|
|
if o != int64(int32(o)) {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "pc-relative relocation address for %s is too big: %#x", r.Sym.Name, o)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
if o != int64(int32(o)) && o != int64(uint32(o)) {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "non-pc-relative relocation address for %s is too big: %#x", r.Sym.Name, uint64(o))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fl = int32(o)
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Arch.ByteOrder.PutUint32(s.P[off:], uint32(fl))
|
2015-02-27 22:57:28 -05:00
|
|
|
case 8:
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Arch.ByteOrder.PutUint64(s.P[off:], uint64(o))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func (ctxt *Link) reloc() {
|
2016-08-21 18:25:28 -04:00
|
|
|
if ctxt.Debugvlog != 0 {
|
2017-04-18 12:53:25 -07:00
|
|
|
ctxt.Logf("%5.2f reloc\n", Cputime())
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
for _, s := range ctxt.Textp {
|
|
|
|
|
relocsym(ctxt, s)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, sym := range datap {
|
2016-08-19 22:40:38 -04:00
|
|
|
relocsym(ctxt, sym)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-22 10:31:14 +12:00
|
|
|
for _, s := range dwarfp {
|
2016-08-19 22:40:38 -04:00
|
|
|
relocsym(ctxt, s)
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2017-09-09 11:23:29 +09:00
|
|
|
func windynrelocsym(ctxt *Link, s *Symbol) {
|
|
|
|
|
rel := ctxt.Syms.Lookup(".rel", 0)
|
|
|
|
|
if s == rel {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
for ri := 0; ri < len(s.R); ri++ {
|
|
|
|
|
r := &s.R[ri]
|
|
|
|
|
targ := r.Sym
|
|
|
|
|
if targ == nil {
|
|
|
|
|
continue
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-09-09 11:23:29 +09:00
|
|
|
if !targ.Attr.Reachable() {
|
|
|
|
|
if r.Type == objabi.R_WEAKADDROFF {
|
2015-02-27 22:57:28 -05:00
|
|
|
continue
|
|
|
|
|
}
|
2017-09-09 11:23:29 +09:00
|
|
|
Errorf(s, "dynamic relocation to unreachable symbol %s", targ.Name)
|
|
|
|
|
}
|
|
|
|
|
if r.Sym.Plt == -2 && r.Sym.Got != -2 { // make dynimport JMP table for PE object files.
|
|
|
|
|
targ.Plt = int32(rel.Size)
|
|
|
|
|
r.Sym = rel
|
|
|
|
|
r.Add = int64(targ.Plt)
|
|
|
|
|
|
|
|
|
|
// jmp *addr
|
|
|
|
|
if SysArch.Family == sys.I386 {
|
|
|
|
|
Adduint8(ctxt, rel, 0xff)
|
|
|
|
|
Adduint8(ctxt, rel, 0x25)
|
|
|
|
|
Addaddr(ctxt, rel, targ)
|
|
|
|
|
Adduint8(ctxt, rel, 0x90)
|
|
|
|
|
Adduint8(ctxt, rel, 0x90)
|
|
|
|
|
} else {
|
|
|
|
|
Adduint8(ctxt, rel, 0xff)
|
|
|
|
|
Adduint8(ctxt, rel, 0x24)
|
|
|
|
|
Adduint8(ctxt, rel, 0x25)
|
|
|
|
|
addaddrplus4(ctxt, rel, targ, 0)
|
|
|
|
|
Adduint8(ctxt, rel, 0x90)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-09-09 11:23:29 +09:00
|
|
|
} else if r.Sym.Plt >= 0 {
|
|
|
|
|
r.Sym = rel
|
|
|
|
|
r.Add = int64(targ.Plt)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-09-09 11:23:29 +09:00
|
|
|
}
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2017-09-09 11:23:29 +09:00
|
|
|
func dynrelocsym(ctxt *Link, s *Symbol) {
|
2017-09-10 09:01:30 +09:00
|
|
|
if Headtype == objabi.Hwindows {
|
|
|
|
|
if Linkmode == LinkInternal {
|
|
|
|
|
windynrelocsym(ctxt, s)
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
for ri := 0; ri < len(s.R); ri++ {
|
2016-04-20 10:36:49 -04:00
|
|
|
r := &s.R[ri]
|
2016-09-05 23:49:53 -04:00
|
|
|
if Buildmode == BuildmodePIE && Linkmode == LinkInternal {
|
|
|
|
|
// It's expected that some relocations will be done
|
|
|
|
|
// later by relocsym (R_TLS_LE, R_ADDROFF), so
|
|
|
|
|
// don't worry if Adddynrel returns false.
|
|
|
|
|
Thearch.Adddynrel(ctxt, s, r)
|
|
|
|
|
continue
|
|
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
if r.Sym != nil && r.Sym.Type == SDYNIMPORT || r.Type >= 256 {
|
2016-03-02 07:59:49 -05:00
|
|
|
if r.Sym != nil && !r.Sym.Attr.Reachable() {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "dynamic relocation to unreachable symbol %s", r.Sym.Name)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-09-05 23:49:53 -04:00
|
|
|
if !Thearch.Adddynrel(ctxt, s, r) {
|
2017-08-17 09:45:20 +09:00
|
|
|
Errorf(s, "unsupported dynamic relocation for symbol %s (type=%d (%s) stype=%d (%s))", r.Sym.Name, r.Type, RelocName(r.Type), r.Sym.Type, r.Sym.Type)
|
2016-09-05 23:49:53 -04:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2017-04-19 15:15:35 +12:00
|
|
|
func dynreloc(ctxt *Link, data *[SXREF][]*Symbol) {
|
2015-02-27 22:57:28 -05:00
|
|
|
// -d suppresses dynamic loader format, so we may as well not
|
|
|
|
|
// compute these sections or mark their symbols as reachable.
|
2017-04-18 12:53:25 -07:00
|
|
|
if *FlagD && Headtype != objabi.Hwindows {
|
2015-02-27 22:57:28 -05:00
|
|
|
return
|
|
|
|
|
}
|
2016-08-21 18:25:28 -04:00
|
|
|
if ctxt.Debugvlog != 0 {
|
2017-07-22 09:11:35 +09:00
|
|
|
ctxt.Logf("%5.2f dynreloc\n", Cputime())
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
for _, s := range ctxt.Textp {
|
|
|
|
|
dynrelocsym(ctxt, s)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, syms := range data {
|
|
|
|
|
for _, sym := range syms {
|
2016-08-19 22:40:38 -04:00
|
|
|
dynrelocsym(ctxt, sym)
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
if Iself {
|
2016-08-19 22:40:38 -04:00
|
|
|
elfdynhash(ctxt)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func Codeblk(ctxt *Link, addr int64, size int64) {
|
|
|
|
|
CodeblkPad(ctxt, addr, size, zeros[:])
|
2016-06-11 17:12:28 -07:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
func CodeblkPad(ctxt *Link, addr int64, size int64, pad []byte) {
|
2016-08-21 18:34:24 -04:00
|
|
|
if *flagA {
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("codeblk [%#x,%#x) at offset %#x\n", addr, addr+size, coutbuf.Offset())
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-04-22 10:31:14 +12:00
|
|
|
blk(ctxt, ctxt.Textp, addr, size, pad)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
/* again for printing */
|
2016-08-21 18:34:24 -04:00
|
|
|
if !*flagA {
|
2015-02-27 22:57:28 -05:00
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
syms := ctxt.Textp
|
2016-04-22 09:38:41 +12:00
|
|
|
for i, sym := range syms {
|
2016-03-02 07:59:49 -05:00
|
|
|
if !sym.Attr.Reachable() {
|
2015-02-27 22:57:28 -05:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if sym.Value >= addr {
|
2016-04-22 09:38:41 +12:00
|
|
|
syms = syms[i:]
|
2015-02-27 22:57:28 -05:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
eaddr := addr + size
|
|
|
|
|
var q []byte
|
2016-04-22 09:38:41 +12:00
|
|
|
for _, sym := range syms {
|
2016-03-02 07:59:49 -05:00
|
|
|
if !sym.Attr.Reachable() {
|
2015-02-27 22:57:28 -05:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if sym.Value >= eaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if addr < sym.Value {
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("%-20s %.8x|", "_", uint64(addr))
|
2015-02-27 22:57:28 -05:00
|
|
|
for ; addr < sym.Value; addr++ {
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf(" %.2x", 0)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("\n")
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("%.6x\t%-20s\n", uint64(addr), sym.Name)
|
2015-02-27 22:57:28 -05:00
|
|
|
q = sym.P
|
|
|
|
|
|
2015-08-11 10:59:24 -04:00
|
|
|
for len(q) >= 16 {
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("%.6x\t% x\n", uint64(addr), q[:16])
|
2015-02-27 22:57:28 -05:00
|
|
|
addr += 16
|
|
|
|
|
q = q[16:]
|
|
|
|
|
}
|
|
|
|
|
|
2015-08-11 10:59:24 -04:00
|
|
|
if len(q) > 0 {
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("%.6x\t% x\n", uint64(addr), q)
|
2015-08-11 10:59:24 -04:00
|
|
|
addr += int64(len(q))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if addr < eaddr {
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("%-20s %.8x|", "_", uint64(addr))
|
2015-02-27 22:57:28 -05:00
|
|
|
for ; addr < eaddr; addr++ {
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf(" %.2x", 0)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-22 10:31:14 +12:00
|
|
|
func blk(ctxt *Link, syms []*Symbol, addr, size int64, pad []byte) {
|
2016-04-18 14:50:14 -04:00
|
|
|
for i, s := range syms {
|
2017-04-19 15:15:35 +12:00
|
|
|
if s.Type&SSUB == 0 && s.Value >= addr {
|
2016-04-18 14:50:14 -04:00
|
|
|
syms = syms[i:]
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
eaddr := addr + size
|
|
|
|
|
for _, s := range syms {
|
2017-04-19 15:15:35 +12:00
|
|
|
if s.Type&SSUB != 0 {
|
2016-04-18 14:50:14 -04:00
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if s.Value >= eaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
if s.Value < addr {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "phase error: addr=%#x but sym=%#x type=%d", addr, s.Value, s.Type)
|
2016-04-18 14:50:14 -04:00
|
|
|
errorexit()
|
|
|
|
|
}
|
|
|
|
|
if addr < s.Value {
|
2016-06-11 17:12:28 -07:00
|
|
|
strnputPad("", int(s.Value-addr), pad)
|
2016-04-18 14:50:14 -04:00
|
|
|
addr = s.Value
|
|
|
|
|
}
|
|
|
|
|
Cwrite(s.P)
|
|
|
|
|
addr += int64(len(s.P))
|
|
|
|
|
if addr < s.Value+s.Size {
|
2016-06-11 17:12:28 -07:00
|
|
|
strnputPad("", int(s.Value+s.Size-addr), pad)
|
2016-04-18 14:50:14 -04:00
|
|
|
addr = s.Value + s.Size
|
|
|
|
|
}
|
|
|
|
|
if addr != s.Value+s.Size {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "phase error: addr=%#x value+size=%#x", addr, s.Value+s.Size)
|
2016-04-18 14:50:14 -04:00
|
|
|
errorexit()
|
|
|
|
|
}
|
|
|
|
|
if s.Value+s.Size >= eaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if addr < eaddr {
|
2016-06-11 17:12:28 -07:00
|
|
|
strnputPad("", int(eaddr-addr), pad)
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
Cflush()
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func Datblk(ctxt *Link, addr int64, size int64) {
|
2016-08-21 18:34:24 -04:00
|
|
|
if *flagA {
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("datblk [%#x,%#x) at offset %#x\n", addr, addr+size, coutbuf.Offset())
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-04-22 10:31:14 +12:00
|
|
|
blk(ctxt, datap, addr, size, zeros[:])
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
/* again for printing */
|
2016-08-21 18:34:24 -04:00
|
|
|
if !*flagA {
|
2015-02-27 22:57:28 -05:00
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
syms := datap
|
|
|
|
|
for i, sym := range syms {
|
2015-02-27 22:57:28 -05:00
|
|
|
if sym.Value >= addr {
|
2016-04-18 14:50:14 -04:00
|
|
|
syms = syms[i:]
|
2015-02-27 22:57:28 -05:00
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-02 12:35:15 -05:00
|
|
|
eaddr := addr + size
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, sym := range syms {
|
2015-02-27 22:57:28 -05:00
|
|
|
if sym.Value >= eaddr {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
if addr < sym.Value {
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("\t%.8x| 00 ...\n", uint64(addr))
|
2015-02-27 22:57:28 -05:00
|
|
|
addr = sym.Value
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("%s\n\t%.8x|", sym.Name, uint64(addr))
|
2016-04-20 10:36:49 -04:00
|
|
|
for i, b := range sym.P {
|
|
|
|
|
if i > 0 && i%16 == 0 {
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("\n\t%.8x|", uint64(addr)+uint64(i))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf(" %.2x", b)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
addr += int64(len(sym.P))
|
|
|
|
|
for ; addr < sym.Value+sym.Size; addr++ {
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf(" %.2x", 0)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("\n")
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-04-20 10:36:49 -04:00
|
|
|
if Linkmode != LinkExternal {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
for _, r := range sym.R {
|
|
|
|
|
rsname := ""
|
|
|
|
|
if r.Sym != nil {
|
|
|
|
|
rsname = r.Sym.Name
|
|
|
|
|
}
|
|
|
|
|
typ := "?"
|
|
|
|
|
switch r.Type {
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_ADDR:
|
2016-04-20 10:36:49 -04:00
|
|
|
typ = "addr"
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_PCREL:
|
2016-04-20 10:36:49 -04:00
|
|
|
typ = "pcrel"
|
2017-04-18 12:53:25 -07:00
|
|
|
case objabi.R_CALL:
|
2016-04-20 10:36:49 -04:00
|
|
|
typ = "call"
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("\treloc %.8x/%d %s %s+%#x [%#x]\n", uint(sym.Value+int64(r.Off)), r.Siz, typ, rsname, r.Add, r.Sym.Value+r.Add)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if addr < eaddr {
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("\t%.8x| 00 ...\n", uint(addr))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("\t%.8x|\n", uint(eaddr))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func Dwarfblk(ctxt *Link, addr int64, size int64) {
|
2016-08-21 18:34:24 -04:00
|
|
|
if *flagA {
|
2016-08-25 12:32:42 +10:00
|
|
|
ctxt.Logf("dwarfblk [%#x,%#x) at offset %#x\n", addr, addr+size, coutbuf.Offset())
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
|
|
|
|
|
2016-04-22 10:31:14 +12:00
|
|
|
blk(ctxt, dwarfp, addr, size, zeros[:])
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
|
|
|
|
|
2016-02-29 11:49:49 +02:00
|
|
|
var zeros [512]byte
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-02-29 11:49:49 +02:00
|
|
|
// strnput writes the first n bytes of s.
|
2016-06-11 17:12:28 -07:00
|
|
|
// If n is larger than len(s),
|
2016-02-29 11:49:49 +02:00
|
|
|
// it is padded with NUL bytes.
|
|
|
|
|
func strnput(s string, n int) {
|
2016-06-11 17:12:28 -07:00
|
|
|
strnputPad(s, n, zeros[:])
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// strnput writes the first n bytes of s.
|
|
|
|
|
// If n is larger than len(s),
|
|
|
|
|
// it is padded with the bytes in pad (repeated as needed).
|
|
|
|
|
func strnputPad(s string, n int, pad []byte) {
|
2016-02-29 11:49:49 +02:00
|
|
|
if len(s) >= n {
|
|
|
|
|
Cwritestring(s[:n])
|
|
|
|
|
} else {
|
|
|
|
|
Cwritestring(s)
|
|
|
|
|
n -= len(s)
|
2016-06-11 17:12:28 -07:00
|
|
|
for n > len(pad) {
|
|
|
|
|
Cwrite(pad)
|
|
|
|
|
n -= len(pad)
|
|
|
|
|
|
2016-02-29 11:49:49 +02:00
|
|
|
}
|
2016-06-11 17:12:28 -07:00
|
|
|
Cwrite(pad[:n])
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
var strdata []*Symbol
|
2015-06-29 13:03:11 -04:00
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func addstrdata1(ctxt *Link, arg string) {
|
2017-09-21 19:01:27 +02:00
|
|
|
eq := strings.IndexByte(arg, '=')
|
2016-10-25 08:59:35 -04:00
|
|
|
dot := strings.LastIndex(arg[:eq+1], ".")
|
|
|
|
|
if eq < 0 || dot < 0 {
|
2015-05-21 14:35:02 -04:00
|
|
|
Exitf("-X flag requires argument of the form importpath.name=value")
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-04-17 18:46:09 -05:00
|
|
|
addstrdata(ctxt, objabi.PathToPrefix(arg[:dot])+arg[dot:eq], arg[eq+1:])
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func addstrdata(ctxt *Link, name string, value string) {
|
2015-03-02 12:35:15 -05:00
|
|
|
p := fmt.Sprintf("%s.str", name)
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
sp := ctxt.Syms.Lookup(p, 0)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-09-20 15:31:26 +12:00
|
|
|
Addstring(sp, value)
|
2017-04-19 15:15:35 +12:00
|
|
|
sp.Type = SRODATA
|
2015-02-27 22:57:28 -05:00
|
|
|
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
s := ctxt.Syms.Lookup(name, 0)
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Size = 0
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrDuplicateOK
|
|
|
|
|
reachable := s.Attr.Reachable()
|
2016-08-19 22:40:38 -04:00
|
|
|
Addaddr(ctxt, s, sp)
|
|
|
|
|
adduintxx(ctxt, s, uint64(len(value)), SysArch.PtrSize)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
// addstring, addaddr, etc., mark the symbols as reachable.
|
|
|
|
|
// In this case that is not necessarily true, so stick to what
|
|
|
|
|
// we know before entering this function.
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr.Set(AttrReachable, reachable)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2015-06-29 13:03:11 -04:00
|
|
|
strdata = append(strdata, s)
|
|
|
|
|
|
2016-03-02 07:59:49 -05:00
|
|
|
sp.Attr.Set(AttrReachable, reachable)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func (ctxt *Link) checkstrdata() {
|
2015-06-29 13:03:11 -04:00
|
|
|
for _, s := range strdata {
|
2017-04-19 15:15:35 +12:00
|
|
|
if s.Type == STEXT {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "cannot use -X with text symbol")
|
2015-06-29 13:03:11 -04:00
|
|
|
} else if s.Gotype != nil && s.Gotype.Name != "type.string" {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "cannot use -X with non-string symbol")
|
2015-06-29 13:03:11 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-20 15:31:26 +12:00
|
|
|
func Addstring(s *Symbol, str string) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Type == 0 {
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SNOPTRDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
s.Attr |= AttrReachable
|
2016-04-07 18:00:57 +03:00
|
|
|
r := s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
if s.Name == ".shstrtab" {
|
2016-09-17 09:39:33 -04:00
|
|
|
elfsetstring(s, str, int(r))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-07 18:00:57 +03:00
|
|
|
s.P = append(s.P, str...)
|
|
|
|
|
s.P = append(s.P, 0)
|
|
|
|
|
s.Size = int64(len(s.P))
|
|
|
|
|
return r
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2015-04-11 12:05:21 +08:00
|
|
|
// addgostring adds str, as a Go string value, to s. symname is the name of the
|
|
|
|
|
// symbol used to define the string data and must be unique per linked object.
|
2016-08-19 22:40:38 -04:00
|
|
|
func addgostring(ctxt *Link, s *Symbol, symname, str string) {
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
sym := ctxt.Syms.Lookup(symname, 0)
|
2017-04-19 15:15:35 +12:00
|
|
|
if sym.Type != Sxxx {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "duplicate symname in addgostring: %s", symname)
|
2015-04-11 12:05:21 +08:00
|
|
|
}
|
2016-03-02 07:59:49 -05:00
|
|
|
sym.Attr |= AttrReachable
|
|
|
|
|
sym.Attr |= AttrLocal
|
2017-04-19 15:15:35 +12:00
|
|
|
sym.Type = SRODATA
|
2015-04-11 12:05:21 +08:00
|
|
|
sym.Size = int64(len(str))
|
|
|
|
|
sym.P = []byte(str)
|
2016-08-19 22:40:38 -04:00
|
|
|
Addaddr(ctxt, s, sym)
|
|
|
|
|
adduint(ctxt, s, uint64(len(str)))
|
2015-04-11 12:05:21 +08:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func addinitarrdata(ctxt *Link, s *Symbol) {
|
2015-03-17 09:47:01 -07:00
|
|
|
p := s.Name + ".ptr"
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
sp := ctxt.Syms.Lookup(p, 0)
|
2017-04-19 15:15:35 +12:00
|
|
|
sp.Type = SINITARR
|
2015-03-17 09:47:01 -07:00
|
|
|
sp.Size = 0
|
2016-03-02 07:59:49 -05:00
|
|
|
sp.Attr |= AttrDuplicateOK
|
2016-08-19 22:40:38 -04:00
|
|
|
Addaddr(ctxt, sp, s)
|
2015-03-17 09:47:01 -07:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func dosymtype(ctxt *Link) {
|
cmd/internal/obj: fix LSym.Type during compilation, not linking
Prior to this CL, the compiler and assembler
were sloppy about the LSym.Type for LSyms
containing static data.
The linker then fixed this up, converting
Sxxx and SBSS to SDATA, and SNOPTRBSS to SNOPTRDATA
if it noticed that the symbol had associated data.
It is preferable to just get this right in cmd/compile
and cmd/asm, because it removes an unnecessary traversal
of the symbol table from the linker (see #14624).
Do this by touching up the LSym.Type fixes in
LSym.prepwrite and Link.Globl.
I have confirmed by instrumenting the linker
that the now-eliminated code paths were unreached.
And an additional check in the object file writing code
will help preserve that invariant.
There was a case in the Windows linker,
with internal linking and cgo,
where we were generating SNOPTRBSS symbols with data.
For now, convert those at the site at which they occur
into SNOPTRDATA, just like they were.
Does not pass toolstash-check,
but does generate identical linked binaries.
No compiler performance changes.
Change-Id: I77b071ab103685ff8e042cee9abb864385488872
Reviewed-on: https://go-review.googlesource.com/40864
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Alex Brainman <alex.brainman@gmail.com>
Reviewed-by: Michael Hudson-Doyle <michael.hudson@canonical.com>
2017-04-16 08:22:44 -07:00
|
|
|
switch Buildmode {
|
|
|
|
|
case BuildmodeCArchive, BuildmodeCShared:
|
|
|
|
|
for _, s := range ctxt.Syms.Allsym {
|
|
|
|
|
// Create a new entry in the .init_array section that points to the
|
|
|
|
|
// library initializer function.
|
2017-07-09 09:01:47 +09:00
|
|
|
if s.Name == *flagEntrySymbol {
|
|
|
|
|
addinitarrdata(ctxt, s)
|
2015-04-09 10:44:05 -04:00
|
|
|
}
|
2015-03-17 09:47:01 -07:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-02 21:24:04 -05:00
|
|
|
// symalign returns the required alignment for the given symbol s.
|
2016-08-19 11:35:54 -04:00
|
|
|
func symalign(s *Symbol) int32 {
|
2016-03-02 21:24:04 -05:00
|
|
|
min := int32(Thearch.Minalign)
|
|
|
|
|
if s.Align >= min {
|
2015-02-27 22:57:28 -05:00
|
|
|
return s.Align
|
2016-03-02 21:24:04 -05:00
|
|
|
} else if s.Align != 0 {
|
|
|
|
|
return min
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-10-13 22:31:46 +03:00
|
|
|
if strings.HasPrefix(s.Name, "go.string.") || strings.HasPrefix(s.Name, "type..namedata.") {
|
2016-03-07 22:48:07 -05:00
|
|
|
// String data is just bytes.
|
|
|
|
|
// If we align it, we waste a lot of space to padding.
|
2016-03-18 16:57:54 -04:00
|
|
|
return min
|
2016-03-07 22:48:07 -05:00
|
|
|
}
|
2015-03-02 12:35:15 -05:00
|
|
|
align := int32(Thearch.Maxalign)
|
2016-03-02 21:24:04 -05:00
|
|
|
for int64(align) > s.Size && align > min {
|
2015-02-27 22:57:28 -05:00
|
|
|
align >>= 1
|
|
|
|
|
}
|
|
|
|
|
return align
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func aligndatsize(datsize int64, s *Symbol) int64 {
|
2015-02-27 22:57:28 -05:00
|
|
|
return Rnd(datsize, int64(symalign(s)))
|
|
|
|
|
}
|
|
|
|
|
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
const debugGCProg = false
|
2015-02-27 22:57:28 -05:00
|
|
|
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
type GCProg struct {
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt *Link
|
|
|
|
|
sym *Symbol
|
|
|
|
|
w gcprog.Writer
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func (p *GCProg) Init(ctxt *Link, name string) {
|
|
|
|
|
p.ctxt = ctxt
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
p.sym = ctxt.Syms.Lookup(name, 0)
|
2016-08-19 22:40:38 -04:00
|
|
|
p.w.Init(p.writeByte(ctxt))
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
if debugGCProg {
|
|
|
|
|
fmt.Fprintf(os.Stderr, "ld: start GCProg %s\n", name)
|
|
|
|
|
p.w.Debug(os.Stderr)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func (p *GCProg) writeByte(ctxt *Link) func(x byte) {
|
|
|
|
|
return func(x byte) {
|
|
|
|
|
Adduint8(ctxt, p.sym, x)
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
func (p *GCProg) End(size int64) {
|
2016-04-06 12:01:40 -07:00
|
|
|
p.w.ZeroUntil(size / int64(SysArch.PtrSize))
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
p.w.End()
|
|
|
|
|
if debugGCProg {
|
|
|
|
|
fmt.Fprintf(os.Stderr, "ld: end GCProg\n")
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
func (p *GCProg) AddSym(s *Symbol) {
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
typ := s.Gotype
|
|
|
|
|
// Things without pointers should be in SNOPTRDATA or SNOPTRBSS;
|
|
|
|
|
// everything we see should have pointers and should therefore have a type.
|
|
|
|
|
if typ == nil {
|
2016-09-19 14:13:07 -04:00
|
|
|
switch s.Name {
|
|
|
|
|
case "runtime.data", "runtime.edata", "runtime.bss", "runtime.ebss":
|
|
|
|
|
// Ignore special symbols that are sometimes laid out
|
|
|
|
|
// as real symbols. See comment about dyld on darwin in
|
|
|
|
|
// the address function.
|
|
|
|
|
return
|
|
|
|
|
}
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "missing Go type information for global symbol: size %d", s.Size)
|
2015-02-27 22:57:28 -05:00
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-06 12:01:40 -07:00
|
|
|
ptrsize := int64(SysArch.PtrSize)
|
2016-08-22 10:33:13 -04:00
|
|
|
nptr := decodetypePtrdata(p.ctxt.Arch, typ) / ptrsize
|
2015-02-27 22:57:28 -05:00
|
|
|
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
if debugGCProg {
|
|
|
|
|
fmt.Fprintf(os.Stderr, "gcprog sym: %s at %d (ptr=%d+%d)\n", s.Name, s.Value, s.Value/ptrsize, nptr)
|
2015-04-28 00:28:47 -04:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-08-22 10:33:13 -04:00
|
|
|
if decodetypeUsegcprog(typ) == 0 {
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
// Copy pointers from mask into program.
|
2016-08-22 10:33:13 -04:00
|
|
|
mask := decodetypeGcmask(p.ctxt, typ)
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
for i := int64(0); i < nptr; i++ {
|
|
|
|
|
if (mask[i/8]>>uint(i%8))&1 != 0 {
|
|
|
|
|
p.w.Ptr(s.Value/ptrsize + i)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
return
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
|
|
|
|
|
// Copy program.
|
2016-08-22 10:33:13 -04:00
|
|
|
prog := decodetypeGcprog(p.ctxt, typ)
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
p.w.ZeroUntil(s.Value / ptrsize)
|
2015-05-25 16:13:50 +12:00
|
|
|
p.w.Append(prog[4:], nptr)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-08-22 10:27:20 +12:00
|
|
|
// dataSortKey is used to sort a slice of data symbol *Symbol pointers.
|
2016-12-08 22:15:40 +00:00
|
|
|
// The sort keys are kept inline to improve cache behavior while sorting.
|
2016-03-09 16:23:25 +02:00
|
|
|
type dataSortKey struct {
|
2016-04-18 14:50:14 -04:00
|
|
|
size int64
|
|
|
|
|
name string
|
2016-08-22 10:27:20 +12:00
|
|
|
sym *Symbol
|
2016-03-09 16:23:25 +02:00
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
type bySizeAndName []dataSortKey
|
2016-03-09 16:23:25 +02:00
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
func (d bySizeAndName) Len() int { return len(d) }
|
|
|
|
|
func (d bySizeAndName) Swap(i, j int) { d[i], d[j] = d[j], d[i] }
|
|
|
|
|
func (d bySizeAndName) Less(i, j int) bool {
|
|
|
|
|
s1, s2 := d[i], d[j]
|
|
|
|
|
if s1.size != s2.size {
|
|
|
|
|
return s1.size < s2.size
|
2016-04-04 13:07:24 -04:00
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
return s1.name < s2.name
|
2016-03-09 16:23:25 +02:00
|
|
|
}
|
|
|
|
|
|
2016-04-19 08:59:56 -04:00
|
|
|
const cutoff int64 = 2e9 // 2 GB (or so; looks better in errors than 2^31)
|
|
|
|
|
|
2017-04-19 15:15:35 +12:00
|
|
|
func checkdatsize(ctxt *Link, datsize int64, symn SymKind) {
|
2016-04-19 08:59:56 -04:00
|
|
|
if datsize > cutoff {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(nil, "too much data in section %v (over %d bytes)", symn, cutoff)
|
2016-04-19 08:59:56 -04:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// datap is a collection of reachable data symbols in address order.
|
|
|
|
|
// Generated by dodata.
|
2016-08-19 11:35:54 -04:00
|
|
|
var datap []*Symbol
|
2016-03-09 16:23:25 +02:00
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
func (ctxt *Link) dodata() {
|
2016-08-21 18:25:28 -04:00
|
|
|
if ctxt.Debugvlog != 0 {
|
2017-04-18 12:53:25 -07:00
|
|
|
ctxt.Logf("%5.2f dodata\n", Cputime())
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2017-04-18 12:53:25 -07:00
|
|
|
if ctxt.DynlinkingGo() && Headtype == objabi.Hdarwin {
|
2016-09-19 14:13:07 -04:00
|
|
|
// The values in moduledata are filled out by relocations
|
|
|
|
|
// pointing to the addresses of these special symbols.
|
|
|
|
|
// Typically these symbols have no size and are not laid
|
|
|
|
|
// out with their matching section.
|
|
|
|
|
//
|
|
|
|
|
// However on darwin, dyld will find the special symbol
|
|
|
|
|
// in the first loaded module, even though it is local.
|
|
|
|
|
//
|
|
|
|
|
// (An hypothesis, formed without looking in the dyld sources:
|
|
|
|
|
// these special symbols have no size, so their address
|
|
|
|
|
// matches a real symbol. The dynamic linker assumes we
|
|
|
|
|
// want the normal symbol with the same address and finds
|
|
|
|
|
// it in the other module.)
|
|
|
|
|
//
|
|
|
|
|
// To work around this we lay out the symbls whose
|
|
|
|
|
// addresses are vital for multi-module programs to work
|
|
|
|
|
// as normal symbols, and give them a little size.
|
|
|
|
|
bss := ctxt.Syms.Lookup("runtime.bss", 0)
|
|
|
|
|
bss.Size = 8
|
|
|
|
|
bss.Attr.Set(AttrSpecial, false)
|
|
|
|
|
|
|
|
|
|
ctxt.Syms.Lookup("runtime.ebss", 0).Attr.Set(AttrSpecial, false)
|
|
|
|
|
|
|
|
|
|
data := ctxt.Syms.Lookup("runtime.data", 0)
|
|
|
|
|
data.Size = 8
|
|
|
|
|
data.Attr.Set(AttrSpecial, false)
|
|
|
|
|
|
|
|
|
|
ctxt.Syms.Lookup("runtime.edata", 0).Attr.Set(AttrSpecial, false)
|
|
|
|
|
|
|
|
|
|
types := ctxt.Syms.Lookup("runtime.types", 0)
|
2017-04-19 15:15:35 +12:00
|
|
|
types.Type = STYPE
|
2016-09-19 14:13:07 -04:00
|
|
|
types.Size = 8
|
|
|
|
|
types.Attr.Set(AttrSpecial, false)
|
|
|
|
|
|
|
|
|
|
etypes := ctxt.Syms.Lookup("runtime.etypes", 0)
|
2017-04-19 15:15:35 +12:00
|
|
|
etypes.Type = SFUNCTAB
|
2016-09-19 14:13:07 -04:00
|
|
|
etypes.Attr.Set(AttrSpecial, false)
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// Collect data symbols by type into data.
|
2017-04-19 15:15:35 +12:00
|
|
|
var data [SXREF][]*Symbol
|
2016-09-20 14:59:39 +12:00
|
|
|
for _, s := range ctxt.Syms.Allsym {
|
2016-03-02 07:59:49 -05:00
|
|
|
if !s.Attr.Reachable() || s.Attr.Special() {
|
2015-02-27 22:57:28 -05:00
|
|
|
continue
|
|
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
if s.Type <= STEXT || s.Type >= SXREF {
|
2016-04-18 14:50:14 -04:00
|
|
|
continue
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
data[s.Type] = append(data[s.Type], s)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// Now that we have the data symbols, but before we start
|
|
|
|
|
// to assign addresses, record all the necessary
|
|
|
|
|
// dynamic relocations. These will grow the relocation
|
|
|
|
|
// symbol, which is itself data.
|
|
|
|
|
//
|
|
|
|
|
// On darwin, we need the symbol table numbers for dynreloc.
|
2017-04-18 12:53:25 -07:00
|
|
|
if Headtype == objabi.Hdarwin {
|
2016-08-19 22:40:38 -04:00
|
|
|
machosymorder(ctxt)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
dynreloc(ctxt, &data)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2015-05-21 13:07:19 +12:00
|
|
|
if UseRelro() {
|
|
|
|
|
// "read only" data with relocations needs to go in its own section
|
|
|
|
|
// when building a shared library. We do this by boosting objects of
|
|
|
|
|
// type SXXX with relocations to type SXXXRELRO.
|
2017-04-28 08:11:21 +12:00
|
|
|
for _, symnro := range readOnly {
|
|
|
|
|
symnrelro := relROMap[symnro]
|
2016-04-18 14:50:14 -04:00
|
|
|
|
2016-08-19 11:35:54 -04:00
|
|
|
ro := []*Symbol{}
|
2016-04-18 14:50:14 -04:00
|
|
|
relro := data[symnrelro]
|
|
|
|
|
|
|
|
|
|
for _, s := range data[symnro] {
|
|
|
|
|
isRelro := len(s.R) > 0
|
|
|
|
|
switch s.Type {
|
2017-04-19 15:15:35 +12:00
|
|
|
case STYPE, STYPERELRO, SGOFUNCRELRO:
|
2016-04-18 14:50:14 -04:00
|
|
|
// Symbols are not sorted yet, so it is possible
|
|
|
|
|
// that an Outer symbol has been changed to a
|
|
|
|
|
// relro Type before it reaches here.
|
|
|
|
|
isRelro = true
|
|
|
|
|
}
|
|
|
|
|
if isRelro {
|
|
|
|
|
s.Type = symnrelro
|
|
|
|
|
if s.Outer != nil {
|
|
|
|
|
s.Outer.Type = s.Type
|
|
|
|
|
}
|
|
|
|
|
relro = append(relro, s)
|
|
|
|
|
} else {
|
|
|
|
|
ro = append(ro, s)
|
2015-05-21 13:07:19 +12:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// Check that we haven't made two symbols with the same .Outer into
|
|
|
|
|
// different types (because references two symbols with non-nil Outer
|
|
|
|
|
// become references to the outer symbol + offset it's vital that the
|
|
|
|
|
// symbol and the outer end up in the same section).
|
|
|
|
|
for _, s := range relro {
|
|
|
|
|
if s.Outer != nil && s.Outer.Type != s.Type {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "inconsistent types for symbol and its Outer %s (%v != %v)",
|
|
|
|
|
s.Outer.Name, s.Type, s.Outer.Type)
|
2015-03-30 15:45:33 +02:00
|
|
|
}
|
|
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
|
|
|
|
|
data[symnro] = ro
|
|
|
|
|
data[symnrelro] = relro
|
2015-03-30 15:45:33 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// Sort symbols.
|
2017-04-19 15:15:35 +12:00
|
|
|
var dataMaxAlign [SXREF]int32
|
2016-04-18 14:50:14 -04:00
|
|
|
var wg sync.WaitGroup
|
|
|
|
|
for symn := range data {
|
2017-04-19 15:15:35 +12:00
|
|
|
symn := SymKind(symn)
|
2016-04-18 14:50:14 -04:00
|
|
|
wg.Add(1)
|
|
|
|
|
go func() {
|
2016-08-19 22:40:38 -04:00
|
|
|
data[symn], dataMaxAlign[symn] = dodataSect(ctxt, symn, data[symn])
|
2016-04-18 14:50:14 -04:00
|
|
|
wg.Done()
|
|
|
|
|
}()
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
wg.Wait()
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// Allocate sections.
|
|
|
|
|
// Data is processed before segtext, because we need
|
|
|
|
|
// to see all symbols in the .data and .bss sections in order
|
|
|
|
|
// to generate garbage collection information.
|
2015-03-02 12:35:15 -05:00
|
|
|
datsize := int64(0)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-09-07 14:45:27 -04:00
|
|
|
// Writable data sections that do not need any specialized handling.
|
2017-04-19 15:15:35 +12:00
|
|
|
writable := []SymKind{
|
|
|
|
|
SELFSECT,
|
|
|
|
|
SMACHO,
|
|
|
|
|
SMACHOGOT,
|
|
|
|
|
SWINDOWS,
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
2016-09-07 14:45:27 -04:00
|
|
|
for _, symn := range writable {
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[symn] {
|
|
|
|
|
sect := addsection(&Segdata, s.Name, 06)
|
|
|
|
|
sect.Align = symalign(s)
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
|
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SDATA
|
2016-04-18 14:50:14 -04:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2016-04-18 14:50:14 -04:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, symn)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
// .got (and .toc on ppc64)
|
2017-04-19 15:15:35 +12:00
|
|
|
if len(data[SELFGOT]) > 0 {
|
2015-03-02 12:35:15 -05:00
|
|
|
sect := addsection(&Segdata, ".got", 06)
|
2017-04-19 15:15:35 +12:00
|
|
|
sect.Align = dataMaxAlign[SELFGOT]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2016-08-19 11:35:54 -04:00
|
|
|
var toc *Symbol
|
2017-04-19 15:15:35 +12:00
|
|
|
for _, s := range data[SELFGOT] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
|
|
|
|
|
|
|
|
|
// Resolve .TOC. symbol for this object file (ppc64)
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
toc = ctxt.Syms.ROLookup(".TOC.", int(s.Version))
|
2015-02-27 22:57:28 -05:00
|
|
|
if toc != nil {
|
|
|
|
|
toc.Sect = sect
|
|
|
|
|
toc.Outer = s
|
|
|
|
|
toc.Sub = s.Sub
|
|
|
|
|
s.Sub = toc
|
|
|
|
|
|
|
|
|
|
toc.Value = 0x8000
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, SELFGOT)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* pointer-free data */
|
2016-04-18 14:50:14 -04:00
|
|
|
sect := addsection(&Segdata, ".noptrdata", 06)
|
2017-04-19 15:15:35 +12:00
|
|
|
sect.Align = dataMaxAlign[SNOPTRDATA]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
ctxt.Syms.Lookup("runtime.noptrdata", 0).Sect = sect
|
|
|
|
|
ctxt.Syms.Lookup("runtime.enoptrdata", 0).Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
for _, s := range data[SNOPTRDATA] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, SNOPTRDATA)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
|
2016-08-21 18:34:24 -04:00
|
|
|
hasinitarr := *FlagLinkshared
|
2015-04-01 14:17:43 +13:00
|
|
|
|
2015-02-27 22:57:28 -05:00
|
|
|
/* shared library initializer */
|
2015-04-09 10:44:05 -04:00
|
|
|
switch Buildmode {
|
2016-08-25 21:58:45 -04:00
|
|
|
case BuildmodeCArchive, BuildmodeCShared, BuildmodeShared, BuildmodePlugin:
|
2015-04-01 14:17:43 +13:00
|
|
|
hasinitarr = true
|
|
|
|
|
}
|
|
|
|
|
if hasinitarr {
|
2015-03-02 12:35:15 -05:00
|
|
|
sect := addsection(&Segdata, ".init_array", 06)
|
2017-04-19 15:15:35 +12:00
|
|
|
sect.Align = dataMaxAlign[SINITARR]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2017-04-19 15:15:35 +12:00
|
|
|
for _, s := range data[SINITARR] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, SINITARR)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* data */
|
|
|
|
|
sect = addsection(&Segdata, ".data", 06)
|
2017-04-19 15:15:35 +12:00
|
|
|
sect.Align = dataMaxAlign[SDATA]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
ctxt.Syms.Lookup("runtime.data", 0).Sect = sect
|
|
|
|
|
ctxt.Syms.Lookup("runtime.edata", 0).Sect = sect
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
var gc GCProg
|
2016-08-19 22:40:38 -04:00
|
|
|
gc.Init(ctxt, "runtime.gcdata")
|
2017-04-19 15:15:35 +12:00
|
|
|
for _, s := range data[SDATA] {
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SDATA
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
gc.AddSym(s)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, SDATA)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
gc.End(int64(sect.Length))
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
/* bss */
|
|
|
|
|
sect = addsection(&Segdata, ".bss", 06)
|
2017-04-19 15:15:35 +12:00
|
|
|
sect.Align = dataMaxAlign[SBSS]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
ctxt.Syms.Lookup("runtime.bss", 0).Sect = sect
|
|
|
|
|
ctxt.Syms.Lookup("runtime.ebss", 0).Sect = sect
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
gc = GCProg{}
|
2016-08-19 22:40:38 -04:00
|
|
|
gc.Init(ctxt, "runtime.gcbss")
|
2017-04-19 15:15:35 +12:00
|
|
|
for _, s := range data[SBSS] {
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Sect = sect
|
|
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
gc.AddSym(s)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, SBSS)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
runtime: replace GC programs with simpler encoding, faster decoder
Small types record the location of pointers in their memory layout
by using a simple bitmap. In Go 1.4 the bitmap held 4-bit entries,
and in Go 1.5 the bitmap holds 1-bit entries, but in both cases using
a bitmap for a large type containing arrays does not make sense:
if someone refers to the type [1<<28]*byte in a program in such
a way that the type information makes it into the binary, it would be
a waste of space to write a 128 MB (for 4-bit entries) or even 32 MB
(for 1-bit entries) bitmap full of 1s into the binary or even to keep
one in memory during the execution of the program.
For large types containing arrays, it is much more compact to describe
the locations of pointers using a notation that can express repetition
than to lay out a bitmap of pointers. Go 1.4 included such a notation,
called ``GC programs'' but it was complex, required recursion during
decoding, and was generally slow. Dmitriy measured the execution of
these programs writing directly to the heap bitmap as being 7x slower
than copying from a preunrolled 4-bit mask (and frankly that code was
not terribly fast either). For some tests, unrollgcprog1 was seen costing
as much as 3x more than the rest of malloc combined.
This CL introduces a different form for the GC programs. They use a
simple Lempel-Ziv-style encoding of the 1-bit pointer information,
in which the only operations are (1) emit the following n bits
and (2) repeat the last n bits c more times. This encoding can be
generated directly from the Go type information (using repetition
only for arrays or large runs of non-pointer data) and it can be decoded
very efficiently. In particular the decoding requires little state and
no recursion, so that the entire decoding can run without any memory
accesses other than the reads of the encoding and the writes of the
decoded form to the heap bitmap. For recursive types like arrays of
arrays of arrays, the inner instructions are only executed once, not
n times, so that large repetitions run at full speed. (In contrast, large
repetitions in the old programs repeated the individual bit-level layout
of the inner data over and over.) The result is as much as 25x faster
decoding compared to the old form.
Because the old decoder was so slow, Go 1.4 had three (or so) cases
for how to set the heap bitmap bits for an allocation of a given type:
(1) If the type had an even number of words up to 32 words, then
the 4-bit pointer mask for the type fit in no more than 16 bytes;
store the 4-bit pointer mask directly in the binary and copy from it.
(1b) If the type had an odd number of words up to 15 words, then
the 4-bit pointer mask for the type, doubled to end on a byte boundary,
fit in no more than 16 bytes; store that doubled mask directly in the
binary and copy from it.
(2) If the type had an even number of words up to 128 words,
or an odd number of words up to 63 words (again due to doubling),
then the 4-bit pointer mask would fit in a 64-byte unrolled mask.
Store a GC program in the binary, but leave space in the BSS for
the unrolled mask. Execute the GC program to construct the mask the
first time it is needed, and thereafter copy from the mask.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
(This is the case that was 7x slower than the other two.)
Because the new pointer masks store 1-bit entries instead of 4-bit
entries and because using the decoder no longer carries a significant
overhead, after this CL (that is, for Go 1.5) there are only two cases:
(1) If the type is 128 words or less (no condition about odd or even),
store the 1-bit pointer mask directly in the binary and use it to
initialize the heap bitmap during malloc. (Implemented in CL 9702.)
(2) There is no case 2 anymore.
(3) Otherwise, store a GC program and execute it to write directly to
the heap bitmap each time an object of that type is allocated.
Executing the GC program directly into the heap bitmap (case (3) above)
was disabled for the Go 1.5 dev cycle, both to avoid needing to use
GC programs for typedmemmove and to avoid updating that code as
the heap bitmap format changed. Typedmemmove no longer uses this
type information; as of CL 9886 it uses the heap bitmap directly.
Now that the heap bitmap format is stable, we reintroduce GC programs
and their space savings.
Benchmarks for heapBitsSetType, before this CL vs this CL:
name old mean new mean delta
SetTypePtr 7.59ns × (0.99,1.02) 5.16ns × (1.00,1.00) -32.05% (p=0.000)
SetTypePtr8 21.0ns × (0.98,1.05) 21.4ns × (1.00,1.00) ~ (p=0.179)
SetTypePtr16 24.1ns × (0.99,1.01) 24.6ns × (1.00,1.00) +2.41% (p=0.001)
SetTypePtr32 31.2ns × (0.99,1.01) 32.4ns × (0.99,1.02) +3.72% (p=0.001)
SetTypePtr64 45.2ns × (1.00,1.00) 47.2ns × (1.00,1.00) +4.42% (p=0.000)
SetTypePtr126 75.8ns × (0.99,1.01) 79.1ns × (1.00,1.00) +4.25% (p=0.000)
SetTypePtr128 74.3ns × (0.99,1.01) 77.6ns × (1.00,1.01) +4.55% (p=0.000)
SetTypePtrSlice 726ns × (1.00,1.01) 712ns × (1.00,1.00) -1.95% (p=0.001)
SetTypeNode1 20.0ns × (0.99,1.01) 20.7ns × (1.00,1.00) +3.71% (p=0.000)
SetTypeNode1Slice 112ns × (1.00,1.00) 113ns × (0.99,1.00) ~ (p=0.070)
SetTypeNode8 23.9ns × (1.00,1.00) 24.7ns × (1.00,1.01) +3.18% (p=0.000)
SetTypeNode8Slice 294ns × (0.99,1.02) 287ns × (0.99,1.01) -2.38% (p=0.015)
SetTypeNode64 52.8ns × (0.99,1.03) 51.8ns × (0.99,1.01) ~ (p=0.069)
SetTypeNode64Slice 1.13µs × (0.99,1.05) 1.14µs × (0.99,1.00) ~ (p=0.767)
SetTypeNode64Dead 36.0ns × (1.00,1.01) 32.5ns × (0.99,1.00) -9.67% (p=0.000)
SetTypeNode64DeadSlice 1.43µs × (0.99,1.01) 1.40µs × (1.00,1.00) -2.39% (p=0.001)
SetTypeNode124 75.7ns × (1.00,1.01) 79.0ns × (1.00,1.00) +4.44% (p=0.000)
SetTypeNode124Slice 1.94µs × (1.00,1.01) 2.04µs × (0.99,1.01) +4.98% (p=0.000)
SetTypeNode126 75.4ns × (1.00,1.01) 77.7ns × (0.99,1.01) +3.11% (p=0.000)
SetTypeNode126Slice 1.95µs × (0.99,1.01) 2.03µs × (1.00,1.00) +3.74% (p=0.000)
SetTypeNode128 85.4ns × (0.99,1.01) 122.0ns × (1.00,1.00) +42.89% (p=0.000)
SetTypeNode128Slice 2.20µs × (1.00,1.01) 2.36µs × (0.98,1.02) +7.48% (p=0.001)
SetTypeNode130 83.3ns × (1.00,1.00) 123.0ns × (1.00,1.00) +47.61% (p=0.000)
SetTypeNode130Slice 2.30µs × (0.99,1.01) 2.40µs × (0.98,1.01) +4.37% (p=0.000)
SetTypeNode1024 498ns × (1.00,1.00) 537ns × (1.00,1.00) +7.96% (p=0.000)
SetTypeNode1024Slice 15.5µs × (0.99,1.01) 17.8µs × (1.00,1.00) +15.27% (p=0.000)
The above compares always using a cached pointer mask (and the
corresponding waste of memory) against using the programs directly.
Some slowdown is expected, in exchange for having a better general algorithm.
The GC programs kick in for SetTypeNode128, SetTypeNode130, SetTypeNode1024,
along with the slice variants of those.
It is possible that the cutoff of 128 words (bits) should be raised
in a followup CL, but even with this low cutoff the GC programs are
faster than Go 1.4's "fast path" non-GC program case.
Benchmarks for heapBitsSetType, Go 1.4 vs this CL:
name old mean new mean delta
SetTypePtr 6.89ns × (1.00,1.00) 5.17ns × (1.00,1.00) -25.02% (p=0.000)
SetTypePtr8 25.8ns × (0.97,1.05) 21.5ns × (1.00,1.00) -16.70% (p=0.000)
SetTypePtr16 39.8ns × (0.97,1.02) 24.7ns × (0.99,1.01) -37.81% (p=0.000)
SetTypePtr32 68.8ns × (0.98,1.01) 32.2ns × (1.00,1.01) -53.18% (p=0.000)
SetTypePtr64 130ns × (1.00,1.00) 47ns × (1.00,1.00) -63.67% (p=0.000)
SetTypePtr126 241ns × (0.99,1.01) 79ns × (1.00,1.01) -67.25% (p=0.000)
SetTypePtr128 2.07µs × (1.00,1.00) 0.08µs × (1.00,1.00) -96.27% (p=0.000)
SetTypePtrSlice 1.05µs × (0.99,1.01) 0.72µs × (0.99,1.02) -31.70% (p=0.000)
SetTypeNode1 16.0ns × (0.99,1.01) 20.8ns × (0.99,1.03) +29.91% (p=0.000)
SetTypeNode1Slice 184ns × (0.99,1.01) 112ns × (0.99,1.01) -39.26% (p=0.000)
SetTypeNode8 29.5ns × (0.97,1.02) 24.6ns × (1.00,1.00) -16.50% (p=0.000)
SetTypeNode8Slice 624ns × (0.98,1.02) 285ns × (1.00,1.00) -54.31% (p=0.000)
SetTypeNode64 135ns × (0.96,1.08) 52ns × (0.99,1.02) -61.32% (p=0.000)
SetTypeNode64Slice 3.83µs × (1.00,1.00) 1.14µs × (0.99,1.01) -70.16% (p=0.000)
SetTypeNode64Dead 134ns × (0.99,1.01) 32ns × (1.00,1.01) -75.74% (p=0.000)
SetTypeNode64DeadSlice 3.83µs × (0.99,1.00) 1.40µs × (1.00,1.01) -63.42% (p=0.000)
SetTypeNode124 240ns × (0.99,1.01) 79ns × (1.00,1.01) -67.05% (p=0.000)
SetTypeNode124Slice 7.27µs × (1.00,1.00) 2.04µs × (1.00,1.00) -71.95% (p=0.000)
SetTypeNode126 2.06µs × (0.99,1.01) 0.08µs × (0.99,1.01) -96.23% (p=0.000)
SetTypeNode126Slice 64.4µs × (1.00,1.00) 2.0µs × (1.00,1.00) -96.85% (p=0.000)
SetTypeNode128 2.09µs × (1.00,1.01) 0.12µs × (1.00,1.00) -94.15% (p=0.000)
SetTypeNode128Slice 65.4µs × (1.00,1.00) 2.4µs × (0.99,1.03) -96.39% (p=0.000)
SetTypeNode130 2.11µs × (1.00,1.00) 0.12µs × (1.00,1.00) -94.18% (p=0.000)
SetTypeNode130Slice 66.3µs × (1.00,1.00) 2.4µs × (0.97,1.08) -96.34% (p=0.000)
SetTypeNode1024 16.0µs × (1.00,1.01) 0.5µs × (1.00,1.00) -96.65% (p=0.000)
SetTypeNode1024Slice 512µs × (1.00,1.00) 18µs × (0.98,1.04) -96.45% (p=0.000)
SetTypeNode124 uses a 124 data + 2 ptr = 126-word allocation.
Both Go 1.4 and this CL are using pointer bitmaps for this case,
so that's an overall 3x speedup for using pointer bitmaps.
SetTypeNode128 uses a 128 data + 2 ptr = 130-word allocation.
Both Go 1.4 and this CL are running the GC program for this case,
so that's an overall 17x speedup when using GC programs (and
I've seen >20x on other systems).
Comparing Go 1.4's SetTypeNode124 (pointer bitmap) against
this CL's SetTypeNode128 (GC program), the slow path in the
code in this CL is 2x faster than the fast path in Go 1.4.
The Go 1 benchmarks are basically unaffected compared to just before this CL.
Go 1 benchmarks, before this CL vs this CL:
name old mean new mean delta
BinaryTree17 5.87s × (0.97,1.04) 5.91s × (0.96,1.04) ~ (p=0.306)
Fannkuch11 4.38s × (1.00,1.00) 4.37s × (1.00,1.01) -0.22% (p=0.006)
FmtFprintfEmpty 90.7ns × (0.97,1.10) 89.3ns × (0.96,1.09) ~ (p=0.280)
FmtFprintfString 282ns × (0.98,1.04) 287ns × (0.98,1.07) +1.72% (p=0.039)
FmtFprintfInt 269ns × (0.99,1.03) 282ns × (0.97,1.04) +4.87% (p=0.000)
FmtFprintfIntInt 478ns × (0.99,1.02) 481ns × (0.99,1.02) +0.61% (p=0.048)
FmtFprintfPrefixedInt 399ns × (0.98,1.03) 400ns × (0.98,1.05) ~ (p=0.533)
FmtFprintfFloat 563ns × (0.99,1.01) 570ns × (1.00,1.01) +1.37% (p=0.000)
FmtManyArgs 1.89µs × (0.99,1.01) 1.92µs × (0.99,1.02) +1.88% (p=0.000)
GobDecode 15.2ms × (0.99,1.01) 15.2ms × (0.98,1.05) ~ (p=0.609)
GobEncode 11.6ms × (0.98,1.03) 11.9ms × (0.98,1.04) +2.17% (p=0.000)
Gzip 648ms × (0.99,1.01) 648ms × (1.00,1.01) ~ (p=0.835)
Gunzip 142ms × (1.00,1.00) 143ms × (1.00,1.01) ~ (p=0.169)
HTTPClientServer 90.5µs × (0.98,1.03) 91.5µs × (0.98,1.04) +1.04% (p=0.045)
JSONEncode 31.5ms × (0.98,1.03) 31.4ms × (0.98,1.03) ~ (p=0.549)
JSONDecode 111ms × (0.99,1.01) 107ms × (0.99,1.01) -3.21% (p=0.000)
Mandelbrot200 6.01ms × (1.00,1.00) 6.01ms × (1.00,1.00) ~ (p=0.878)
GoParse 6.54ms × (0.99,1.02) 6.61ms × (0.99,1.03) +1.08% (p=0.004)
RegexpMatchEasy0_32 160ns × (1.00,1.01) 161ns × (1.00,1.00) +0.40% (p=0.000)
RegexpMatchEasy0_1K 560ns × (0.99,1.01) 559ns × (0.99,1.01) ~ (p=0.088)
RegexpMatchEasy1_32 138ns × (0.99,1.01) 138ns × (1.00,1.00) ~ (p=0.380)
RegexpMatchEasy1_1K 877ns × (1.00,1.00) 878ns × (1.00,1.00) ~ (p=0.157)
RegexpMatchMedium_32 251ns × (0.99,1.00) 251ns × (1.00,1.01) +0.28% (p=0.021)
RegexpMatchMedium_1K 72.6µs × (1.00,1.00) 72.6µs × (1.00,1.00) ~ (p=0.539)
RegexpMatchHard_32 3.84µs × (1.00,1.00) 3.84µs × (1.00,1.00) ~ (p=0.378)
RegexpMatchHard_1K 117µs × (1.00,1.00) 117µs × (1.00,1.00) ~ (p=0.067)
Revcomp 904ms × (0.99,1.02) 904ms × (0.99,1.01) ~ (p=0.943)
Template 125ms × (0.99,1.02) 127ms × (0.99,1.01) +1.79% (p=0.000)
TimeParse 627ns × (0.99,1.01) 622ns × (0.99,1.01) -0.88% (p=0.000)
TimeFormat 655ns × (0.99,1.02) 655ns × (0.99,1.02) ~ (p=0.976)
For the record, Go 1 benchmarks, Go 1.4 vs this CL:
name old mean new mean delta
BinaryTree17 4.61s × (0.97,1.05) 5.91s × (0.98,1.03) +28.35% (p=0.000)
Fannkuch11 4.40s × (0.99,1.03) 4.41s × (0.99,1.01) ~ (p=0.212)
FmtFprintfEmpty 102ns × (0.99,1.01) 84ns × (0.99,1.02) -18.38% (p=0.000)
FmtFprintfString 302ns × (0.98,1.01) 303ns × (0.99,1.02) ~ (p=0.203)
FmtFprintfInt 313ns × (0.97,1.05) 270ns × (0.99,1.01) -13.69% (p=0.000)
FmtFprintfIntInt 524ns × (0.98,1.02) 477ns × (0.99,1.00) -8.87% (p=0.000)
FmtFprintfPrefixedInt 424ns × (0.98,1.02) 386ns × (0.99,1.01) -8.96% (p=0.000)
FmtFprintfFloat 652ns × (0.98,1.02) 594ns × (0.97,1.05) -8.97% (p=0.000)
FmtManyArgs 2.13µs × (0.99,1.02) 1.94µs × (0.99,1.01) -8.92% (p=0.000)
GobDecode 17.1ms × (0.99,1.02) 14.9ms × (0.98,1.03) -13.07% (p=0.000)
GobEncode 13.5ms × (0.98,1.03) 11.5ms × (0.98,1.03) -15.25% (p=0.000)
Gzip 656ms × (0.99,1.02) 647ms × (0.99,1.01) -1.29% (p=0.000)
Gunzip 143ms × (0.99,1.02) 144ms × (0.99,1.01) ~ (p=0.204)
HTTPClientServer 88.2µs × (0.98,1.02) 90.8µs × (0.98,1.01) +2.93% (p=0.000)
JSONEncode 32.2ms × (0.98,1.02) 30.9ms × (0.97,1.04) -4.06% (p=0.001)
JSONDecode 121ms × (0.98,1.02) 110ms × (0.98,1.05) -8.95% (p=0.000)
Mandelbrot200 6.06ms × (0.99,1.01) 6.11ms × (0.98,1.04) ~ (p=0.184)
GoParse 6.76ms × (0.97,1.04) 6.58ms × (0.98,1.05) -2.63% (p=0.003)
RegexpMatchEasy0_32 195ns × (1.00,1.01) 155ns × (0.99,1.01) -20.43% (p=0.000)
RegexpMatchEasy0_1K 479ns × (0.98,1.03) 535ns × (0.99,1.02) +11.59% (p=0.000)
RegexpMatchEasy1_32 169ns × (0.99,1.02) 131ns × (0.99,1.03) -22.44% (p=0.000)
RegexpMatchEasy1_1K 1.53µs × (0.99,1.01) 0.87µs × (0.99,1.02) -43.07% (p=0.000)
RegexpMatchMedium_32 334ns × (0.99,1.01) 242ns × (0.99,1.01) -27.53% (p=0.000)
RegexpMatchMedium_1K 125µs × (1.00,1.01) 72µs × (0.99,1.03) -42.53% (p=0.000)
RegexpMatchHard_32 6.03µs × (0.99,1.01) 3.79µs × (0.99,1.01) -37.12% (p=0.000)
RegexpMatchHard_1K 189µs × (0.99,1.02) 115µs × (0.99,1.01) -39.20% (p=0.000)
Revcomp 935ms × (0.96,1.03) 926ms × (0.98,1.02) ~ (p=0.083)
Template 146ms × (0.97,1.05) 119ms × (0.99,1.01) -18.37% (p=0.000)
TimeParse 660ns × (0.99,1.01) 624ns × (0.99,1.02) -5.43% (p=0.000)
TimeFormat 670ns × (0.98,1.02) 710ns × (1.00,1.01) +5.97% (p=0.000)
This CL is a bit larger than I would like, but the compiler, linker, runtime,
and package reflect all need to be in sync about the format of these programs,
so there is no easy way to split this into independent changes (at least
while keeping the build working at each change).
Fixes #9625.
Fixes #10524.
Change-Id: I9e3e20d6097099d0f8532d1cb5b1af528804989a
Reviewed-on: https://go-review.googlesource.com/9888
Reviewed-by: Austin Clements <austin@google.com>
Run-TryBot: Russ Cox <rsc@golang.org>
2015-05-08 01:43:18 -04:00
|
|
|
gc.End(int64(sect.Length))
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
/* pointer-free bss */
|
|
|
|
|
sect = addsection(&Segdata, ".noptrbss", 06)
|
2017-04-19 15:15:35 +12:00
|
|
|
sect.Align = dataMaxAlign[SNOPTRBSS]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
ctxt.Syms.Lookup("runtime.noptrbss", 0).Sect = sect
|
|
|
|
|
ctxt.Syms.Lookup("runtime.enoptrbss", 0).Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
for _, s := range data[SNOPTRBSS] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
|
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
ctxt.Syms.Lookup("runtime.end", 0).Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, SNOPTRBSS)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2017-04-19 15:15:35 +12:00
|
|
|
if len(data[STLSBSS]) > 0 {
|
2016-04-18 14:50:14 -04:00
|
|
|
var sect *Section
|
2017-04-12 01:10:01 +10:00
|
|
|
if Iself && (Linkmode == LinkExternal || !*FlagD) {
|
2015-08-11 12:29:00 +12:00
|
|
|
sect = addsection(&Segdata, ".tbss", 06)
|
2016-04-06 12:01:40 -07:00
|
|
|
sect.Align = int32(SysArch.PtrSize)
|
2015-08-11 12:29:00 +12:00
|
|
|
sect.Vaddr = 0
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = 0
|
2015-08-11 12:29:00 +12:00
|
|
|
|
2017-04-19 15:15:35 +12:00
|
|
|
for _, s := range data[STLSBSS] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
2015-08-11 12:29:00 +12:00
|
|
|
s.Value = datsize
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, STLSBSS)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2015-08-11 12:29:00 +12:00
|
|
|
if sect != nil {
|
|
|
|
|
sect.Length = uint64(datsize)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We finished data, begin read-only data.
|
|
|
|
|
* Not all systems support a separate read-only non-executable data section.
|
|
|
|
|
* ELF systems do.
|
|
|
|
|
* OS X and Plan 9 do not.
|
|
|
|
|
* Windows PE may, but if so we have not implemented it.
|
|
|
|
|
* And if we're using external linking mode, the point is moot,
|
|
|
|
|
* since it's not our decision; that code expects the sections in
|
|
|
|
|
* segtext.
|
|
|
|
|
*/
|
2015-03-02 12:35:15 -05:00
|
|
|
var segro *Segment
|
2015-02-27 22:57:28 -05:00
|
|
|
if Iself && Linkmode == LinkInternal {
|
|
|
|
|
segro = &Segrodata
|
|
|
|
|
} else {
|
|
|
|
|
segro = &Segtext
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
datsize = 0
|
|
|
|
|
|
|
|
|
|
/* read-only executable ELF, Mach-O sections */
|
2017-04-19 15:15:35 +12:00
|
|
|
if len(data[STEXT]) != 0 {
|
|
|
|
|
Errorf(nil, "dodata found an STEXT symbol: %s", data[STEXT][0].Name)
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
for _, s := range data[SELFRXSECT] {
|
2016-04-18 14:50:14 -04:00
|
|
|
sect := addsection(&Segtext, s.Name, 04)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Align = symalign(s)
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
|
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SRODATA
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, SELFRXSECT)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* read-only data */
|
|
|
|
|
sect = addsection(segro, ".rodata", 04)
|
|
|
|
|
|
|
|
|
|
sect.Vaddr = 0
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
ctxt.Syms.Lookup("runtime.rodata", 0).Sect = sect
|
|
|
|
|
ctxt.Syms.Lookup("runtime.erodata", 0).Sect = sect
|
2016-03-27 10:21:48 -04:00
|
|
|
if !UseRelro() {
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
ctxt.Syms.Lookup("runtime.types", 0).Sect = sect
|
|
|
|
|
ctxt.Syms.Lookup("runtime.etypes", 0).Sect = sect
|
2016-03-27 10:21:48 -04:00
|
|
|
}
|
2017-04-28 08:11:21 +12:00
|
|
|
for _, symn := range readOnly {
|
2016-04-19 08:59:56 -04:00
|
|
|
align := dataMaxAlign[symn]
|
2016-04-18 14:50:14 -04:00
|
|
|
if sect.Align < align {
|
|
|
|
|
sect.Align = align
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
2017-04-28 08:11:21 +12:00
|
|
|
for _, symn := range readOnly {
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[symn] {
|
|
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SRODATA
|
2016-04-18 14:50:14 -04:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, symn)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
|
2016-09-05 23:29:16 -04:00
|
|
|
/* read-only ELF, Mach-O sections */
|
2017-04-19 15:15:35 +12:00
|
|
|
for _, s := range data[SELFROSECT] {
|
2016-09-05 23:29:16 -04:00
|
|
|
sect = addsection(segro, s.Name, 04)
|
|
|
|
|
sect.Align = symalign(s)
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
|
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SRODATA
|
2016-09-05 23:29:16 -04:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
|
|
|
|
datsize += s.Size
|
|
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, SELFROSECT)
|
2016-09-05 23:29:16 -04:00
|
|
|
|
2017-04-19 15:15:35 +12:00
|
|
|
for _, s := range data[SMACHOPLT] {
|
2016-09-05 23:29:16 -04:00
|
|
|
sect = addsection(segro, s.Name, 04)
|
|
|
|
|
sect.Align = symalign(s)
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
|
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SRODATA
|
2016-09-05 23:29:16 -04:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
|
|
|
|
datsize += s.Size
|
|
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, SMACHOPLT)
|
2016-09-05 23:29:16 -04:00
|
|
|
|
2015-05-21 13:07:19 +12:00
|
|
|
// There is some data that are conceptually read-only but are written to by
|
|
|
|
|
// relocations. On GNU systems, we can arrange for the dynamic linker to
|
|
|
|
|
// mprotect sections after relocations are applied by giving them write
|
|
|
|
|
// permissions in the object file and calling them ".data.rel.ro.FOO". We
|
|
|
|
|
// divide the .rodata section between actual .rodata and .data.rel.ro.rodata,
|
|
|
|
|
// but for the other sections that this applies to, we just write a read-only
|
|
|
|
|
// .FOO section or a read-write .data.rel.ro.FOO section depending on the
|
|
|
|
|
// situation.
|
|
|
|
|
// TODO(mwhudson): It would make sense to do this more widely, but it makes
|
|
|
|
|
// the system linker segfault on darwin.
|
2016-09-05 23:29:16 -04:00
|
|
|
addrelrosection := func(suffix string) *Section {
|
|
|
|
|
return addsection(segro, suffix, 04)
|
|
|
|
|
}
|
2015-05-21 13:07:19 +12:00
|
|
|
|
|
|
|
|
if UseRelro() {
|
2016-09-05 23:29:16 -04:00
|
|
|
addrelrosection = func(suffix string) *Section {
|
|
|
|
|
seg := &Segrelrodata
|
|
|
|
|
if Linkmode == LinkExternal {
|
|
|
|
|
// Using a separate segment with an external
|
|
|
|
|
// linker results in some programs moving
|
|
|
|
|
// their data sections unexpectedly, which
|
|
|
|
|
// corrupts the moduledata. So we use the
|
|
|
|
|
// rodata segment and let the external linker
|
|
|
|
|
// sort out a rel.ro segment.
|
|
|
|
|
seg = &Segrodata
|
|
|
|
|
}
|
|
|
|
|
return addsection(seg, ".data.rel.ro"+suffix, 06)
|
|
|
|
|
}
|
2015-05-21 13:07:19 +12:00
|
|
|
/* data only written by relocations */
|
2016-09-05 23:29:16 -04:00
|
|
|
sect = addrelrosection("")
|
2015-05-21 13:07:19 +12:00
|
|
|
|
|
|
|
|
sect.Vaddr = 0
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
ctxt.Syms.Lookup("runtime.types", 0).Sect = sect
|
|
|
|
|
ctxt.Syms.Lookup("runtime.etypes", 0).Sect = sect
|
2017-04-28 08:11:21 +12:00
|
|
|
for _, symnro := range readOnly {
|
|
|
|
|
symn := relROMap[symnro]
|
2016-04-19 08:59:56 -04:00
|
|
|
align := dataMaxAlign[symn]
|
2016-04-18 14:50:14 -04:00
|
|
|
if sect.Align < align {
|
|
|
|
|
sect.Align = align
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
2017-04-28 08:11:21 +12:00
|
|
|
for _, symnro := range readOnly {
|
|
|
|
|
symn := relROMap[symnro]
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range data[symn] {
|
|
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
if s.Outer != nil && s.Outer.Sect != nil && s.Outer.Sect != sect {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "s.Outer (%s) in different section from s, %s != %s", s.Outer.Name, s.Outer.Sect.Name, sect.Name)
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SRODATA
|
2016-04-18 14:50:14 -04:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-05-21 13:07:19 +12:00
|
|
|
}
|
2016-08-19 22:40:38 -04:00
|
|
|
checkdatsize(ctxt, datsize, symn)
|
2015-05-21 13:07:19 +12:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
}
|
|
|
|
|
|
2015-02-27 22:57:28 -05:00
|
|
|
/* typelink */
|
2016-09-05 23:29:16 -04:00
|
|
|
sect = addrelrosection(".typelink")
|
2017-04-19 15:15:35 +12:00
|
|
|
sect.Align = dataMaxAlign[STYPELINK]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2016-10-19 07:33:16 +03:00
|
|
|
typelink := ctxt.Syms.Lookup("runtime.typelink", 0)
|
|
|
|
|
typelink.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
typelink.Type = SRODATA
|
2016-10-19 07:33:16 +03:00
|
|
|
datsize += typelink.Size
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, STYPELINK)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
|
2016-03-17 07:00:33 -07:00
|
|
|
/* itablink */
|
2016-09-05 23:29:16 -04:00
|
|
|
sect = addrelrosection(".itablink")
|
2017-04-19 15:15:35 +12:00
|
|
|
sect.Align = dataMaxAlign[SITABLINK]
|
2016-03-17 07:00:33 -07:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
ctxt.Syms.Lookup("runtime.itablink", 0).Sect = sect
|
|
|
|
|
ctxt.Syms.Lookup("runtime.eitablink", 0).Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
for _, s := range data[SITABLINK] {
|
2016-03-17 07:00:33 -07:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SRODATA
|
2016-03-17 07:00:33 -07:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2016-03-17 07:00:33 -07:00
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, SITABLINK)
|
2016-03-17 07:00:33 -07:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
|
2015-02-27 22:57:28 -05:00
|
|
|
/* gosymtab */
|
2016-09-05 23:29:16 -04:00
|
|
|
sect = addrelrosection(".gosymtab")
|
2017-04-19 15:15:35 +12:00
|
|
|
sect.Align = dataMaxAlign[SSYMTAB]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
ctxt.Syms.Lookup("runtime.symtab", 0).Sect = sect
|
|
|
|
|
ctxt.Syms.Lookup("runtime.esymtab", 0).Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
for _, s := range data[SSYMTAB] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SRODATA
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, SSYMTAB)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
|
|
|
|
|
/* gopclntab */
|
2016-09-05 23:29:16 -04:00
|
|
|
sect = addrelrosection(".gopclntab")
|
2017-04-19 15:15:35 +12:00
|
|
|
sect.Align = dataMaxAlign[SPCLNTAB]
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
ctxt.Syms.Lookup("runtime.pclntab", 0).Sect = sect
|
|
|
|
|
ctxt.Syms.Lookup("runtime.epclntab", 0).Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
for _, s := range data[SPCLNTAB] {
|
2015-02-27 22:57:28 -05:00
|
|
|
datsize = aligndatsize(datsize, s)
|
|
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SRODATA
|
2015-02-27 22:57:28 -05:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, SRODATA)
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
|
|
|
|
|
// 6g uses 4-byte relocation offsets, so the entire segment must fit in 32 bits.
|
|
|
|
|
if datsize != int64(uint32(datsize)) {
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(nil, "read-only data segment too large: %d", datsize)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
2017-04-19 15:15:35 +12:00
|
|
|
for symn := SELFRXSECT; symn < SXREF; symn++ {
|
2016-04-18 14:50:14 -04:00
|
|
|
datap = append(datap, data[symn]...)
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
dwarfgeneratedebugsyms(ctxt)
|
2016-03-14 09:23:04 -07:00
|
|
|
|
2016-04-22 10:31:14 +12:00
|
|
|
var i int
|
2017-05-22 20:17:31 -04:00
|
|
|
for ; i < len(dwarfp); i++ {
|
|
|
|
|
s := dwarfp[i]
|
2017-04-19 15:15:35 +12:00
|
|
|
if s.Type != SDWARFSECT {
|
2016-04-22 10:31:14 +12:00
|
|
|
break
|
|
|
|
|
}
|
2017-05-02 16:46:01 +02:00
|
|
|
|
2016-03-14 09:23:04 -07:00
|
|
|
sect = addsection(&Segdwarf, s.Name, 04)
|
|
|
|
|
sect.Align = 1
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
|
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SRODATA
|
2016-03-14 09:23:04 -07:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2016-03-14 09:23:04 -07:00
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
|
|
|
|
}
|
2017-04-19 15:15:35 +12:00
|
|
|
checkdatsize(ctxt, datsize, SDWARFSECT)
|
2016-03-14 09:23:04 -07:00
|
|
|
|
2017-05-22 20:17:31 -04:00
|
|
|
for i < len(dwarfp) {
|
|
|
|
|
curType := dwarfp[i].Type
|
|
|
|
|
var sect *Section
|
|
|
|
|
switch curType {
|
|
|
|
|
case SDWARFINFO:
|
|
|
|
|
sect = addsection(&Segdwarf, ".debug_info", 04)
|
|
|
|
|
case SDWARFRANGE:
|
|
|
|
|
sect = addsection(&Segdwarf, ".debug_ranges", 04)
|
[dev.debug] cmd/compile: better DWARF with optimizations on
Debuggers use DWARF information to find local variables on the
stack and in registers. Prior to this CL, the DWARF information for
functions claimed that all variables were on the stack at all times.
That's incorrect when optimizations are enabled, and results in
debuggers showing data that is out of date or complete gibberish.
After this CL, the compiler is capable of representing variable
locations more accurately, and attempts to do so. Due to limitations of
the SSA backend, it's not possible to be completely correct.
There are a number of problems in the current design. One of the easier
to understand is that variable names currently must be attached to an
SSA value, but not all assignments in the source code actually result
in machine code. For example:
type myint int
var a int
b := myint(int)
and
b := (*uint64)(unsafe.Pointer(a))
don't generate machine code because the underlying representation is the
same, so the correct value of b will not be set when the user would
expect.
Generating the more precise debug information is behind a flag,
dwarflocationlists. Because of the issues described above, setting the
flag may not make the debugging experience much better, and may actually
make it worse in cases where the variable actually is on the stack and
the more complicated analysis doesn't realize it.
A number of changes are included:
- Add a new pseudo-instruction, RegKill, which indicates that the value
in the register has been clobbered.
- Adjust regalloc to emit RegKills in the right places. Significantly,
this means that phis are mixed with StoreReg and RegKills after
regalloc.
- Track variable decomposition in ssa.LocalSlots.
- After the SSA backend is done, analyze the result and build location
lists for each LocalSlot.
- After assembly is done, update the location lists with the assembled
PC offsets, recompose variables, and build DWARF location lists. Emit the
list as a new linker symbol, one per function.
- In the linker, aggregate the location lists into a .debug_loc section.
TODO:
- currently disabled for non-X86/AMD64 because there are no data tables.
go build -toolexec 'toolstash -cmp' -a std succeeds.
With -dwarflocationlists false:
before: f02812195637909ff675782c0b46836a8ff01976
after: 06f61e8112a42ac34fb80e0c818b3cdb84a5e7ec
benchstat -geomean /tmp/220352263 /tmp/621364410
completed 15 of 15, estimated time remaining 0s (eta 3:52PM)
name old time/op new time/op delta
Template 199ms ± 3% 198ms ± 2% ~ (p=0.400 n=15+14)
Unicode 96.6ms ± 5% 96.4ms ± 5% ~ (p=0.838 n=15+15)
GoTypes 653ms ± 2% 647ms ± 2% ~ (p=0.102 n=15+14)
Flate 133ms ± 6% 129ms ± 3% -2.62% (p=0.041 n=15+15)
GoParser 164ms ± 5% 159ms ± 3% -3.05% (p=0.000 n=15+15)
Reflect 428ms ± 4% 422ms ± 3% ~ (p=0.156 n=15+13)
Tar 123ms ±10% 124ms ± 8% ~ (p=0.461 n=15+15)
XML 228ms ± 3% 224ms ± 3% -1.57% (p=0.045 n=15+15)
[Geo mean] 206ms 377ms +82.86%
name old user-time/op new user-time/op delta
Template 292ms ±10% 301ms ±12% ~ (p=0.189 n=15+15)
Unicode 166ms ±37% 158ms ±14% ~ (p=0.418 n=15+14)
GoTypes 962ms ± 6% 963ms ± 7% ~ (p=0.976 n=15+15)
Flate 207ms ±19% 200ms ±14% ~ (p=0.345 n=14+15)
GoParser 246ms ±22% 240ms ±15% ~ (p=0.587 n=15+15)
Reflect 611ms ±13% 587ms ±14% ~ (p=0.085 n=15+13)
Tar 211ms ±12% 217ms ±14% ~ (p=0.355 n=14+15)
XML 335ms ±15% 320ms ±18% ~ (p=0.169 n=15+15)
[Geo mean] 317ms 583ms +83.72%
name old alloc/op new alloc/op delta
Template 40.2MB ± 0% 40.2MB ± 0% -0.15% (p=0.000 n=14+15)
Unicode 29.2MB ± 0% 29.3MB ± 0% ~ (p=0.624 n=15+15)
GoTypes 114MB ± 0% 114MB ± 0% -0.15% (p=0.000 n=15+14)
Flate 25.7MB ± 0% 25.6MB ± 0% -0.18% (p=0.000 n=13+15)
GoParser 32.2MB ± 0% 32.2MB ± 0% -0.14% (p=0.003 n=15+15)
Reflect 77.8MB ± 0% 77.9MB ± 0% ~ (p=0.061 n=15+15)
Tar 27.1MB ± 0% 27.0MB ± 0% -0.11% (p=0.029 n=15+15)
XML 42.7MB ± 0% 42.5MB ± 0% -0.29% (p=0.000 n=15+15)
[Geo mean] 42.1MB 75.0MB +78.05%
name old allocs/op new allocs/op delta
Template 402k ± 1% 398k ± 0% -0.91% (p=0.000 n=15+15)
Unicode 344k ± 1% 344k ± 0% ~ (p=0.715 n=15+14)
GoTypes 1.18M ± 0% 1.17M ± 0% -0.91% (p=0.000 n=15+14)
Flate 243k ± 0% 240k ± 1% -1.05% (p=0.000 n=13+15)
GoParser 327k ± 1% 324k ± 1% -0.96% (p=0.000 n=15+15)
Reflect 984k ± 1% 982k ± 0% ~ (p=0.050 n=15+15)
Tar 261k ± 1% 259k ± 1% -0.77% (p=0.000 n=15+15)
XML 411k ± 0% 404k ± 1% -1.55% (p=0.000 n=15+15)
[Geo mean] 439k 755k +72.01%
name old text-bytes new text-bytes delta
HelloSize 694kB ± 0% 694kB ± 0% -0.00% (p=0.000 n=15+15)
name old data-bytes new data-bytes delta
HelloSize 5.55kB ± 0% 5.55kB ± 0% ~ (all equal)
name old bss-bytes new bss-bytes delta
HelloSize 133kB ± 0% 133kB ± 0% ~ (all equal)
name old exe-bytes new exe-bytes delta
HelloSize 1.04MB ± 0% 1.04MB ± 0% ~ (all equal)
Change-Id: I991fc553ef175db46bb23b2128317bbd48de70d8
Reviewed-on: https://go-review.googlesource.com/41770
Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2017-07-21 18:30:19 -04:00
|
|
|
case SDWARFLOC:
|
|
|
|
|
sect = addsection(&Segdwarf, ".debug_loc", 04)
|
2017-05-22 20:17:31 -04:00
|
|
|
default:
|
|
|
|
|
Errorf(dwarfp[i], "unknown DWARF section %v", curType)
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-14 09:23:04 -07:00
|
|
|
sect.Align = 1
|
|
|
|
|
datsize = Rnd(datsize, int64(sect.Align))
|
|
|
|
|
sect.Vaddr = uint64(datsize)
|
2017-05-22 20:17:31 -04:00
|
|
|
for ; i < len(dwarfp); i++ {
|
|
|
|
|
s := dwarfp[i]
|
|
|
|
|
if s.Type != curType {
|
2016-04-22 10:31:14 +12:00
|
|
|
break
|
|
|
|
|
}
|
2016-03-14 09:23:04 -07:00
|
|
|
s.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = SRODATA
|
2016-03-14 09:23:04 -07:00
|
|
|
s.Value = int64(uint64(datsize) - sect.Vaddr)
|
|
|
|
|
s.Attr |= AttrLocal
|
2016-04-19 08:59:56 -04:00
|
|
|
datsize += s.Size
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
|
|
|
|
sect.Length = uint64(datsize) - sect.Vaddr
|
2017-05-22 20:17:31 -04:00
|
|
|
checkdatsize(ctxt, datsize, curType)
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
|
|
|
|
|
2015-02-27 22:57:28 -05:00
|
|
|
/* number the sections */
|
2015-03-02 12:35:15 -05:00
|
|
|
n := int32(1)
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2017-04-18 21:52:06 +12:00
|
|
|
for _, sect := range Segtext.Sections {
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Extnum = int16(n)
|
|
|
|
|
n++
|
|
|
|
|
}
|
2017-04-18 21:52:06 +12:00
|
|
|
for _, sect := range Segrodata.Sections {
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Extnum = int16(n)
|
|
|
|
|
n++
|
|
|
|
|
}
|
2017-04-18 21:52:06 +12:00
|
|
|
for _, sect := range Segrelrodata.Sections {
|
2016-09-05 23:29:16 -04:00
|
|
|
sect.Extnum = int16(n)
|
|
|
|
|
n++
|
|
|
|
|
}
|
2017-04-18 21:52:06 +12:00
|
|
|
for _, sect := range Segdata.Sections {
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Extnum = int16(n)
|
|
|
|
|
n++
|
|
|
|
|
}
|
2017-04-18 21:52:06 +12:00
|
|
|
for _, sect := range Segdwarf.Sections {
|
2016-03-14 09:23:04 -07:00
|
|
|
sect.Extnum = int16(n)
|
|
|
|
|
n++
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2015-06-04 15:15:48 -04:00
|
|
|
|
2017-04-19 15:15:35 +12:00
|
|
|
func dodataSect(ctxt *Link, symn SymKind, syms []*Symbol) (result []*Symbol, maxAlign int32) {
|
2017-04-18 12:53:25 -07:00
|
|
|
if Headtype == objabi.Hdarwin {
|
2016-04-18 14:50:14 -04:00
|
|
|
// Some symbols may no longer belong in syms
|
|
|
|
|
// due to movement in machosymorder.
|
2016-08-19 11:35:54 -04:00
|
|
|
newSyms := make([]*Symbol, 0, len(syms))
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range syms {
|
2016-09-07 14:45:27 -04:00
|
|
|
if s.Type == symn {
|
2016-04-18 14:50:14 -04:00
|
|
|
newSyms = append(newSyms, s)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
syms = newSyms
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-19 14:13:07 -04:00
|
|
|
var head, tail *Symbol
|
|
|
|
|
symsSort := make([]dataSortKey, 0, len(syms))
|
|
|
|
|
for _, s := range syms {
|
2016-04-18 14:50:14 -04:00
|
|
|
if s.Attr.OnList() {
|
|
|
|
|
log.Fatalf("symbol %s listed multiple times", s.Name)
|
|
|
|
|
}
|
|
|
|
|
s.Attr |= AttrOnList
|
2016-04-19 08:59:56 -04:00
|
|
|
switch {
|
|
|
|
|
case s.Size < int64(len(s.P)):
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "initialize bounds (%d < %d)", s.Size, len(s.P))
|
2016-04-19 08:59:56 -04:00
|
|
|
case s.Size < 0:
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "negative size (%d bytes)", s.Size)
|
2016-04-19 08:59:56 -04:00
|
|
|
case s.Size > cutoff:
|
2016-09-17 09:39:33 -04:00
|
|
|
Errorf(s, "symbol too large (%d bytes)", s.Size)
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
|
2016-09-19 14:13:07 -04:00
|
|
|
// If the usually-special section-marker symbols are being laid
|
|
|
|
|
// out as regular symbols, put them either at the beginning or
|
|
|
|
|
// end of their section.
|
2017-04-18 12:53:25 -07:00
|
|
|
if ctxt.DynlinkingGo() && Headtype == objabi.Hdarwin {
|
2016-09-19 14:13:07 -04:00
|
|
|
switch s.Name {
|
|
|
|
|
case "runtime.text", "runtime.bss", "runtime.data", "runtime.types":
|
|
|
|
|
head = s
|
|
|
|
|
continue
|
|
|
|
|
case "runtime.etext", "runtime.ebss", "runtime.edata", "runtime.etypes":
|
|
|
|
|
tail = s
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
key := dataSortKey{
|
2016-04-18 14:50:14 -04:00
|
|
|
size: s.Size,
|
|
|
|
|
name: s.Name,
|
2016-08-22 10:27:20 +12:00
|
|
|
sym: s,
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
switch s.Type {
|
2017-04-19 15:15:35 +12:00
|
|
|
case SELFGOT:
|
2016-04-18 14:50:14 -04:00
|
|
|
// For ppc64, we want to interleave the .got and .toc sections
|
|
|
|
|
// from input files. Both are type SELFGOT, so in that case
|
|
|
|
|
// we skip size comparison and fall through to the name
|
|
|
|
|
// comparison (conveniently, .got sorts before .toc).
|
2016-09-19 14:13:07 -04:00
|
|
|
key.size = 0
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
2016-09-19 14:13:07 -04:00
|
|
|
|
|
|
|
|
symsSort = append(symsSort, key)
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sort.Sort(bySizeAndName(symsSort))
|
|
|
|
|
|
2016-09-19 14:13:07 -04:00
|
|
|
off := 0
|
|
|
|
|
if head != nil {
|
|
|
|
|
syms[0] = head
|
|
|
|
|
off++
|
|
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
for i, symSort := range symsSort {
|
2016-09-19 14:13:07 -04:00
|
|
|
syms[i+off] = symSort.sym
|
2016-08-22 10:27:20 +12:00
|
|
|
align := symalign(symSort.sym)
|
2016-04-19 08:59:56 -04:00
|
|
|
if maxAlign < align {
|
|
|
|
|
maxAlign = align
|
|
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
2016-09-19 14:13:07 -04:00
|
|
|
if tail != nil {
|
|
|
|
|
syms[len(syms)-1] = tail
|
|
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
|
2017-04-19 15:15:35 +12:00
|
|
|
if Iself && symn == SELFROSECT {
|
2016-04-18 14:50:14 -04:00
|
|
|
// Make .rela and .rela.plt contiguous, the ELF ABI requires this
|
|
|
|
|
// and Solaris actually cares.
|
|
|
|
|
reli, plti := -1, -1
|
|
|
|
|
for i, s := range syms {
|
|
|
|
|
switch s.Name {
|
|
|
|
|
case ".rel.plt", ".rela.plt":
|
|
|
|
|
plti = i
|
|
|
|
|
case ".rel", ".rela":
|
|
|
|
|
reli = i
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if reli >= 0 && plti >= 0 && plti != reli+1 {
|
2016-04-20 19:10:20 -04:00
|
|
|
var first, second int
|
|
|
|
|
if plti > reli {
|
|
|
|
|
first, second = reli, plti
|
|
|
|
|
} else {
|
|
|
|
|
first, second = plti, reli
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
2016-04-20 19:10:20 -04:00
|
|
|
rel, plt := syms[reli], syms[plti]
|
|
|
|
|
copy(syms[first+2:], syms[first+1:second])
|
|
|
|
|
syms[first+0] = rel
|
|
|
|
|
syms[first+1] = plt
|
2016-12-01 22:48:52 -08:00
|
|
|
|
|
|
|
|
// Make sure alignment doesn't introduce a gap.
|
|
|
|
|
// Setting the alignment explicitly prevents
|
|
|
|
|
// symalign from basing it on the size and
|
|
|
|
|
// getting it wrong.
|
|
|
|
|
rel.Align = int32(SysArch.RegSize)
|
|
|
|
|
plt.Align = int32(SysArch.RegSize)
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-19 08:59:56 -04:00
|
|
|
return syms, maxAlign
|
2016-04-18 14:50:14 -04:00
|
|
|
}
|
|
|
|
|
|
2015-06-04 15:15:48 -04:00
|
|
|
// Add buildid to beginning of text segment, on non-ELF systems.
|
|
|
|
|
// Non-ELF binary formats are not always flexible enough to
|
|
|
|
|
// give us a place to put the Go build ID. On those systems, we put it
|
|
|
|
|
// at the very beginning of the text segment.
|
|
|
|
|
// This ``header'' is read by cmd/go.
|
2016-08-19 22:40:38 -04:00
|
|
|
func (ctxt *Link) textbuildid() {
|
2016-09-19 14:13:07 -04:00
|
|
|
if Iself || Buildmode == BuildmodePlugin || *flagBuildid == "" {
|
2015-06-04 15:15:48 -04:00
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
sym := ctxt.Syms.Lookup("go.buildid", 0)
|
2016-03-02 07:59:49 -05:00
|
|
|
sym.Attr |= AttrReachable
|
2015-06-04 15:15:48 -04:00
|
|
|
// The \xff is invalid UTF-8, meant to make it less likely
|
|
|
|
|
// to find one of these accidentally.
|
2016-08-21 18:34:24 -04:00
|
|
|
data := "\xff Go build ID: " + strconv.Quote(*flagBuildid) + "\n \xff"
|
2017-04-19 15:15:35 +12:00
|
|
|
sym.Type = STEXT
|
2015-06-04 15:15:48 -04:00
|
|
|
sym.P = []byte(data)
|
|
|
|
|
sym.Size = int64(len(sym.P))
|
|
|
|
|
|
2016-08-19 22:40:38 -04:00
|
|
|
ctxt.Textp = append(ctxt.Textp, nil)
|
|
|
|
|
copy(ctxt.Textp[1:], ctxt.Textp)
|
|
|
|
|
ctxt.Textp[0] = sym
|
2015-06-04 15:15:48 -04:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
// assign addresses to text
|
2016-08-19 22:40:38 -04:00
|
|
|
func (ctxt *Link) textaddress() {
|
2015-02-27 22:57:28 -05:00
|
|
|
addsection(&Segtext, ".text", 05)
|
|
|
|
|
|
|
|
|
|
// Assign PCs in text segment.
|
|
|
|
|
// Could parallelize, by assigning to text
|
|
|
|
|
// and then letting threads copy down, but probably not worth it.
|
2017-04-18 21:52:06 +12:00
|
|
|
sect := Segtext.Sections[0]
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
sect.Align = int32(Funcalign)
|
2016-09-19 14:13:07 -04:00
|
|
|
|
|
|
|
|
text := ctxt.Syms.Lookup("runtime.text", 0)
|
|
|
|
|
text.Sect = sect
|
|
|
|
|
|
2017-04-18 12:53:25 -07:00
|
|
|
if ctxt.DynlinkingGo() && Headtype == objabi.Hdarwin {
|
2016-09-19 14:13:07 -04:00
|
|
|
etext := ctxt.Syms.Lookup("runtime.etext", 0)
|
|
|
|
|
etext.Sect = sect
|
|
|
|
|
|
|
|
|
|
ctxt.Textp = append(ctxt.Textp, etext, nil)
|
|
|
|
|
copy(ctxt.Textp[1:], ctxt.Textp)
|
|
|
|
|
ctxt.Textp[0] = text
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-21 18:34:24 -04:00
|
|
|
va := uint64(*FlagTextAddr)
|
2016-08-25 11:07:33 -05:00
|
|
|
n := 1
|
2015-02-27 22:57:28 -05:00
|
|
|
sect.Vaddr = va
|
2016-09-14 14:47:12 -04:00
|
|
|
ntramps := 0
|
2016-08-19 22:40:38 -04:00
|
|
|
for _, sym := range ctxt.Textp {
|
2017-06-08 08:26:19 -04:00
|
|
|
sect, n, va = assignAddress(ctxt, sect, n, sym, va, false)
|
2016-09-14 14:47:12 -04:00
|
|
|
|
|
|
|
|
trampoline(ctxt, sym) // resolve jumps, may add trampolines if jump too far
|
|
|
|
|
|
|
|
|
|
// lay down trampolines after each function
|
|
|
|
|
for ; ntramps < len(ctxt.tramps); ntramps++ {
|
|
|
|
|
tramp := ctxt.tramps[ntramps]
|
2017-06-08 08:26:19 -04:00
|
|
|
sect, n, va = assignAddress(ctxt, sect, n, tramp, va, true)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-09-14 14:47:12 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
sect.Length = va - sect.Vaddr
|
|
|
|
|
ctxt.Syms.Lookup("runtime.etext", 0).Sect = sect
|
|
|
|
|
|
|
|
|
|
// merge tramps into Textp, keeping Textp in address order
|
|
|
|
|
if ntramps != 0 {
|
|
|
|
|
newtextp := make([]*Symbol, 0, len(ctxt.Textp)+ntramps)
|
|
|
|
|
i := 0
|
|
|
|
|
for _, sym := range ctxt.Textp {
|
|
|
|
|
for ; i < ntramps && ctxt.tramps[i].Value < sym.Value; i++ {
|
|
|
|
|
newtextp = append(newtextp, ctxt.tramps[i])
|
|
|
|
|
}
|
|
|
|
|
newtextp = append(newtextp, sym)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-09-14 14:47:12 -04:00
|
|
|
newtextp = append(newtextp, ctxt.tramps[i:ntramps]...)
|
|
|
|
|
|
|
|
|
|
ctxt.Textp = newtextp
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// assigns address for a text symbol, returns (possibly new) section, its number, and the address
|
|
|
|
|
// Note: once we have trampoline insertion support for external linking, this function
|
|
|
|
|
// will not need to create new text sections, and so no need to return sect and n.
|
2017-06-08 08:26:19 -04:00
|
|
|
func assignAddress(ctxt *Link, sect *Section, n int, sym *Symbol, va uint64, isTramp bool) (*Section, int, uint64) {
|
2016-09-14 14:47:12 -04:00
|
|
|
sym.Sect = sect
|
2017-04-19 15:15:35 +12:00
|
|
|
if sym.Type&SSUB != 0 {
|
2016-09-14 14:47:12 -04:00
|
|
|
return sect, n, va
|
|
|
|
|
}
|
|
|
|
|
if sym.Align != 0 {
|
|
|
|
|
va = uint64(Rnd(int64(va), int64(sym.Align)))
|
|
|
|
|
} else {
|
|
|
|
|
va = uint64(Rnd(int64(va), int64(Funcalign)))
|
|
|
|
|
}
|
|
|
|
|
sym.Value = 0
|
|
|
|
|
for sub := sym; sub != nil; sub = sub.Sub {
|
|
|
|
|
sub.Value += int64(va)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
funcsize := uint64(MINFUNC) // spacing required for findfunctab
|
|
|
|
|
if sym.Size > MINFUNC {
|
|
|
|
|
funcsize = uint64(sym.Size)
|
|
|
|
|
}
|
2016-08-25 11:07:33 -05:00
|
|
|
|
2016-09-14 14:47:12 -04:00
|
|
|
// On ppc64x a text section should not be larger than 2^26 bytes due to the size of
|
|
|
|
|
// call target offset field in the bl instruction. Splitting into smaller text
|
|
|
|
|
// sections smaller than this limit allows the GNU linker to modify the long calls
|
|
|
|
|
// appropriately. The limit allows for the space needed for tables inserted by the linker.
|
2016-08-25 11:07:33 -05:00
|
|
|
|
2016-09-14 14:47:12 -04:00
|
|
|
// If this function doesn't fit in the current text section, then create a new one.
|
2016-08-25 11:07:33 -05:00
|
|
|
|
2016-09-14 14:47:12 -04:00
|
|
|
// Only break at outermost syms.
|
2016-08-25 11:07:33 -05:00
|
|
|
|
2017-06-08 08:26:19 -04:00
|
|
|
if SysArch.InFamily(sys.PPC64) && sym.Outer == nil && Iself && Linkmode == LinkExternal && va-sect.Vaddr+funcsize+maxSizeTrampolinesPPC64(sym, isTramp) > 0x1c00000 {
|
2016-08-25 11:07:33 -05:00
|
|
|
|
2016-09-14 14:47:12 -04:00
|
|
|
// Set the length for the previous text section
|
|
|
|
|
sect.Length = va - sect.Vaddr
|
2016-08-25 11:07:33 -05:00
|
|
|
|
2016-09-14 14:47:12 -04:00
|
|
|
// Create new section, set the starting Vaddr
|
|
|
|
|
sect = addsection(&Segtext, ".text", 05)
|
|
|
|
|
sect.Vaddr = va
|
2016-12-07 07:55:21 -06:00
|
|
|
sym.Sect = sect
|
2016-08-25 11:07:33 -05:00
|
|
|
|
2016-09-14 14:47:12 -04:00
|
|
|
// Create a symbol for the start of the secondary text sections
|
|
|
|
|
ctxt.Syms.Lookup(fmt.Sprintf("runtime.text.%d", n), 0).Sect = sect
|
|
|
|
|
n++
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-09-14 14:47:12 -04:00
|
|
|
va += funcsize
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-09-14 14:47:12 -04:00
|
|
|
return sect, n, va
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// assign addresses
|
2016-08-19 22:40:38 -04:00
|
|
|
func (ctxt *Link) address() {
|
2016-08-21 18:34:24 -04:00
|
|
|
va := uint64(*FlagTextAddr)
|
2015-02-27 22:57:28 -05:00
|
|
|
Segtext.Rwx = 05
|
|
|
|
|
Segtext.Vaddr = va
|
|
|
|
|
Segtext.Fileoff = uint64(HEADR)
|
2017-04-18 21:52:06 +12:00
|
|
|
for _, s := range Segtext.Sections {
|
2015-02-27 22:57:28 -05:00
|
|
|
va = uint64(Rnd(int64(va), int64(s.Align)))
|
|
|
|
|
s.Vaddr = va
|
|
|
|
|
va += s.Length
|
|
|
|
|
}
|
|
|
|
|
|
2016-08-21 18:34:24 -04:00
|
|
|
Segtext.Length = va - uint64(*FlagTextAddr)
|
2015-02-27 22:57:28 -05:00
|
|
|
Segtext.Filelen = Segtext.Length
|
2017-04-18 12:53:25 -07:00
|
|
|
if Headtype == objabi.Hnacl {
|
2015-02-27 22:57:28 -05:00
|
|
|
va += 32 // room for the "halt sled"
|
|
|
|
|
}
|
|
|
|
|
|
2017-04-18 21:52:06 +12:00
|
|
|
if len(Segrodata.Sections) > 0 {
|
2015-02-27 22:57:28 -05:00
|
|
|
// align to page boundary so as not to mix
|
|
|
|
|
// rodata and executable text.
|
2016-09-05 23:29:16 -04:00
|
|
|
//
|
|
|
|
|
// Note: gold or GNU ld will reduce the size of the executable
|
|
|
|
|
// file by arranging for the relro segment to end at a page
|
|
|
|
|
// boundary, and overlap the end of the text segment with the
|
|
|
|
|
// start of the relro segment in the file. The PT_LOAD segments
|
|
|
|
|
// will be such that the last page of the text segment will be
|
|
|
|
|
// mapped twice, once r-x and once starting out rw- and, after
|
|
|
|
|
// relocation processing, changed to r--.
|
|
|
|
|
//
|
|
|
|
|
// Ideally the last page of the text segment would not be
|
|
|
|
|
// writable even for this short period.
|
2016-08-21 18:34:24 -04:00
|
|
|
va = uint64(Rnd(int64(va), int64(*FlagRound)))
|
2015-02-27 22:57:28 -05:00
|
|
|
|
|
|
|
|
Segrodata.Rwx = 04
|
|
|
|
|
Segrodata.Vaddr = va
|
|
|
|
|
Segrodata.Fileoff = va - Segtext.Vaddr + Segtext.Fileoff
|
|
|
|
|
Segrodata.Filelen = 0
|
2017-04-18 21:52:06 +12:00
|
|
|
for _, s := range Segrodata.Sections {
|
2015-02-27 22:57:28 -05:00
|
|
|
va = uint64(Rnd(int64(va), int64(s.Align)))
|
|
|
|
|
s.Vaddr = va
|
|
|
|
|
va += s.Length
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Segrodata.Length = va - Segrodata.Vaddr
|
|
|
|
|
Segrodata.Filelen = Segrodata.Length
|
|
|
|
|
}
|
2017-04-18 21:52:06 +12:00
|
|
|
if len(Segrelrodata.Sections) > 0 {
|
2016-09-05 23:29:16 -04:00
|
|
|
// align to page boundary so as not to mix
|
|
|
|
|
// rodata, rel-ro data, and executable text.
|
|
|
|
|
va = uint64(Rnd(int64(va), int64(*FlagRound)))
|
|
|
|
|
|
|
|
|
|
Segrelrodata.Rwx = 06
|
|
|
|
|
Segrelrodata.Vaddr = va
|
|
|
|
|
Segrelrodata.Fileoff = va - Segrodata.Vaddr + Segrodata.Fileoff
|
|
|
|
|
Segrelrodata.Filelen = 0
|
2017-04-18 21:52:06 +12:00
|
|
|
for _, s := range Segrelrodata.Sections {
|
2016-09-05 23:29:16 -04:00
|
|
|
va = uint64(Rnd(int64(va), int64(s.Align)))
|
|
|
|
|
s.Vaddr = va
|
|
|
|
|
va += s.Length
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Segrelrodata.Length = va - Segrelrodata.Vaddr
|
|
|
|
|
Segrelrodata.Filelen = Segrelrodata.Length
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-08-21 18:34:24 -04:00
|
|
|
va = uint64(Rnd(int64(va), int64(*FlagRound)))
|
2015-02-27 22:57:28 -05:00
|
|
|
Segdata.Rwx = 06
|
|
|
|
|
Segdata.Vaddr = va
|
|
|
|
|
Segdata.Fileoff = va - Segtext.Vaddr + Segtext.Fileoff
|
|
|
|
|
Segdata.Filelen = 0
|
2017-04-18 12:53:25 -07:00
|
|
|
if Headtype == objabi.Hwindows {
|
2015-02-27 22:57:28 -05:00
|
|
|
Segdata.Fileoff = Segtext.Fileoff + uint64(Rnd(int64(Segtext.Length), PEFILEALIGN))
|
|
|
|
|
}
|
2017-04-18 12:53:25 -07:00
|
|
|
if Headtype == objabi.Hplan9 {
|
2015-02-27 22:57:28 -05:00
|
|
|
Segdata.Fileoff = Segtext.Fileoff + Segtext.Filelen
|
|
|
|
|
}
|
2015-03-02 14:22:05 -05:00
|
|
|
var data *Section
|
|
|
|
|
var noptr *Section
|
|
|
|
|
var bss *Section
|
|
|
|
|
var noptrbss *Section
|
2015-03-02 12:35:15 -05:00
|
|
|
var vlen int64
|
2017-04-18 21:52:06 +12:00
|
|
|
for i, s := range Segdata.Sections {
|
2015-08-11 12:29:00 +12:00
|
|
|
if Iself && s.Name == ".tbss" {
|
|
|
|
|
continue
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
vlen = int64(s.Length)
|
2017-04-18 21:52:06 +12:00
|
|
|
if i+1 < len(Segdata.Sections) && !(Iself && Segdata.Sections[i+1].Name == ".tbss") {
|
|
|
|
|
vlen = int64(Segdata.Sections[i+1].Vaddr - s.Vaddr)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
s.Vaddr = va
|
|
|
|
|
va += uint64(vlen)
|
|
|
|
|
Segdata.Length = va - Segdata.Vaddr
|
|
|
|
|
if s.Name == ".data" {
|
|
|
|
|
data = s
|
|
|
|
|
}
|
|
|
|
|
if s.Name == ".noptrdata" {
|
|
|
|
|
noptr = s
|
|
|
|
|
}
|
|
|
|
|
if s.Name == ".bss" {
|
|
|
|
|
bss = s
|
|
|
|
|
}
|
|
|
|
|
if s.Name == ".noptrbss" {
|
|
|
|
|
noptrbss = s
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Segdata.Filelen = bss.Vaddr - Segdata.Vaddr
|
|
|
|
|
|
2016-08-21 18:34:24 -04:00
|
|
|
va = uint64(Rnd(int64(va), int64(*FlagRound)))
|
2016-03-14 09:23:04 -07:00
|
|
|
Segdwarf.Rwx = 06
|
|
|
|
|
Segdwarf.Vaddr = va
|
2016-08-21 18:34:24 -04:00
|
|
|
Segdwarf.Fileoff = Segdata.Fileoff + uint64(Rnd(int64(Segdata.Filelen), int64(*FlagRound)))
|
2016-03-14 09:23:04 -07:00
|
|
|
Segdwarf.Filelen = 0
|
2017-04-18 12:53:25 -07:00
|
|
|
if Headtype == objabi.Hwindows {
|
2016-03-14 09:23:04 -07:00
|
|
|
Segdwarf.Fileoff = Segdata.Fileoff + uint64(Rnd(int64(Segdata.Filelen), int64(PEFILEALIGN)))
|
|
|
|
|
}
|
2017-04-18 21:52:06 +12:00
|
|
|
for i, s := range Segdwarf.Sections {
|
2016-03-14 09:23:04 -07:00
|
|
|
vlen = int64(s.Length)
|
2017-04-18 21:52:06 +12:00
|
|
|
if i+1 < len(Segdwarf.Sections) {
|
|
|
|
|
vlen = int64(Segdwarf.Sections[i+1].Vaddr - s.Vaddr)
|
2016-03-14 09:23:04 -07:00
|
|
|
}
|
|
|
|
|
s.Vaddr = va
|
|
|
|
|
va += uint64(vlen)
|
2017-04-18 12:53:25 -07:00
|
|
|
if Headtype == objabi.Hwindows {
|
2016-03-14 09:23:04 -07:00
|
|
|
va = uint64(Rnd(int64(va), PEFILEALIGN))
|
|
|
|
|
}
|
|
|
|
|
Segdwarf.Length = va - Segdwarf.Vaddr
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Segdwarf.Filelen = va - Segdwarf.Vaddr
|
|
|
|
|
|
2016-09-05 23:29:16 -04:00
|
|
|
var (
|
2017-04-18 21:52:06 +12:00
|
|
|
text = Segtext.Sections[0]
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
rodata = ctxt.Syms.Lookup("runtime.rodata", 0).Sect
|
|
|
|
|
itablink = ctxt.Syms.Lookup("runtime.itablink", 0).Sect
|
|
|
|
|
symtab = ctxt.Syms.Lookup("runtime.symtab", 0).Sect
|
|
|
|
|
pclntab = ctxt.Syms.Lookup("runtime.pclntab", 0).Sect
|
|
|
|
|
types = ctxt.Syms.Lookup("runtime.types", 0).Sect
|
2016-09-05 23:29:16 -04:00
|
|
|
)
|
2016-08-25 11:07:33 -05:00
|
|
|
lasttext := text
|
|
|
|
|
// Could be multiple .text sections
|
2017-04-18 21:52:06 +12:00
|
|
|
for _, sect := range Segtext.Sections {
|
|
|
|
|
if sect.Name == ".text" {
|
|
|
|
|
lasttext = sect
|
|
|
|
|
}
|
2016-08-25 11:07:33 -05:00
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2016-04-18 14:50:14 -04:00
|
|
|
for _, s := range datap {
|
|
|
|
|
if s.Sect != nil {
|
|
|
|
|
s.Value += int64(s.Sect.Vaddr)
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
for sub := s.Sub; sub != nil; sub = sub.Sub {
|
|
|
|
|
sub.Value += s.Value
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
|
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
|
2016-04-22 10:31:14 +12:00
|
|
|
for _, sym := range dwarfp {
|
2016-03-14 09:23:04 -07:00
|
|
|
if sym.Sect != nil {
|
|
|
|
|
sym.Value += int64(sym.Sect.Vaddr)
|
|
|
|
|
}
|
2016-04-18 14:50:14 -04:00
|
|
|
for sub := sym.Sub; sub != nil; sub = sub.Sub {
|
2016-03-14 09:23:04 -07:00
|
|
|
sub.Value += sym.Value
|
|
|
|
|
}
|
|
|
|
|
}
|
2015-02-27 22:57:28 -05:00
|
|
|
|
2015-05-25 13:59:08 +12:00
|
|
|
if Buildmode == BuildmodeShared {
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
s := ctxt.Syms.Lookup("go.link.abihashbytes", 0)
|
|
|
|
|
sectSym := ctxt.Syms.Lookup(".note.go.abihash", 0)
|
2015-05-25 13:59:08 +12:00
|
|
|
s.Sect = sectSym.Sect
|
|
|
|
|
s.Value = int64(sectSym.Sect.Vaddr + 16)
|
|
|
|
|
}
|
|
|
|
|
|
2017-04-19 15:15:35 +12:00
|
|
|
ctxt.xdefine("runtime.text", STEXT, int64(text.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.etext", STEXT, int64(lasttext.Vaddr+lasttext.Length))
|
2016-08-25 11:07:33 -05:00
|
|
|
|
|
|
|
|
// If there are multiple text sections, create runtime.text.n for
|
|
|
|
|
// their section Vaddr, using n for index
|
|
|
|
|
n := 1
|
2017-04-18 21:52:06 +12:00
|
|
|
for _, sect := range Segtext.Sections[1:] {
|
|
|
|
|
if sect.Name == ".text" {
|
|
|
|
|
symname := fmt.Sprintf("runtime.text.%d", n)
|
2017-04-19 15:15:35 +12:00
|
|
|
ctxt.xdefine(symname, STEXT, int64(sect.Vaddr))
|
2017-04-18 21:52:06 +12:00
|
|
|
n++
|
|
|
|
|
} else {
|
|
|
|
|
break
|
|
|
|
|
}
|
2016-08-25 11:07:33 -05:00
|
|
|
}
|
|
|
|
|
|
2017-04-19 15:15:35 +12:00
|
|
|
ctxt.xdefine("runtime.rodata", SRODATA, int64(rodata.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.erodata", SRODATA, int64(rodata.Vaddr+rodata.Length))
|
|
|
|
|
ctxt.xdefine("runtime.types", SRODATA, int64(types.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.etypes", SRODATA, int64(types.Vaddr+types.Length))
|
|
|
|
|
ctxt.xdefine("runtime.itablink", SRODATA, int64(itablink.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.eitablink", SRODATA, int64(itablink.Vaddr+itablink.Length))
|
2016-08-19 22:40:38 -04:00
|
|
|
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
sym := ctxt.Syms.Lookup("runtime.gcdata", 0)
|
2016-03-02 07:59:49 -05:00
|
|
|
sym.Attr |= AttrLocal
|
2017-04-19 15:15:35 +12:00
|
|
|
ctxt.xdefine("runtime.egcdata", SRODATA, Symaddr(sym)+sym.Size)
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
ctxt.Syms.Lookup("runtime.egcdata", 0).Sect = sym.Sect
|
2015-02-27 22:57:28 -05:00
|
|
|
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
sym = ctxt.Syms.Lookup("runtime.gcbss", 0)
|
2016-03-02 07:59:49 -05:00
|
|
|
sym.Attr |= AttrLocal
|
2017-04-19 15:15:35 +12:00
|
|
|
ctxt.xdefine("runtime.egcbss", SRODATA, Symaddr(sym)+sym.Size)
|
cmd/link: use ctxt.{Lookup,ROLookup} in favour of function versions of same
Done with two eg templates:
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linklookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.Lookup(name, v)
}
package p
import (
"cmd/link/internal/ld"
)
func before(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ld.Linkrlookup(ctxt, name, v)
}
func after(ctxt *ld.Link, name string, v int) *ld.Symbol {
return ctxt.Syms.ROLookup(name, v)
}
Change-Id: I00647dbf62294557bd24c29ad1f108fc786335f1
Reviewed-on: https://go-review.googlesource.com/29343
Run-TryBot: Michael Hudson-Doyle <michael.hudson@canonical.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
2016-09-20 15:06:08 +12:00
|
|
|
ctxt.Syms.Lookup("runtime.egcbss", 0).Sect = sym.Sect
|
2016-08-19 22:40:38 -04:00
|
|
|
|
2017-04-19 15:15:35 +12:00
|
|
|
ctxt.xdefine("runtime.symtab", SRODATA, int64(symtab.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.esymtab", SRODATA, int64(symtab.Vaddr+symtab.Length))
|
|
|
|
|
ctxt.xdefine("runtime.pclntab", SRODATA, int64(pclntab.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.epclntab", SRODATA, int64(pclntab.Vaddr+pclntab.Length))
|
|
|
|
|
ctxt.xdefine("runtime.noptrdata", SNOPTRDATA, int64(noptr.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.enoptrdata", SNOPTRDATA, int64(noptr.Vaddr+noptr.Length))
|
|
|
|
|
ctxt.xdefine("runtime.bss", SBSS, int64(bss.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.ebss", SBSS, int64(bss.Vaddr+bss.Length))
|
|
|
|
|
ctxt.xdefine("runtime.data", SDATA, int64(data.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.edata", SDATA, int64(data.Vaddr+data.Length))
|
|
|
|
|
ctxt.xdefine("runtime.noptrbss", SNOPTRBSS, int64(noptrbss.Vaddr))
|
|
|
|
|
ctxt.xdefine("runtime.enoptrbss", SNOPTRBSS, int64(noptrbss.Vaddr+noptrbss.Length))
|
|
|
|
|
ctxt.xdefine("runtime.end", SBSS, int64(Segdata.Vaddr+Segdata.Length))
|
2015-02-27 22:57:28 -05:00
|
|
|
}
|
2016-09-14 14:47:12 -04:00
|
|
|
|
|
|
|
|
// add a trampoline with symbol s (to be laid down after the current function)
|
|
|
|
|
func (ctxt *Link) AddTramp(s *Symbol) {
|
2017-04-19 15:15:35 +12:00
|
|
|
s.Type = STEXT
|
2016-09-14 14:47:12 -04:00
|
|
|
s.Attr |= AttrReachable
|
|
|
|
|
s.Attr |= AttrOnList
|
|
|
|
|
ctxt.tramps = append(ctxt.tramps, s)
|
|
|
|
|
if *FlagDebugTramp > 0 && ctxt.Debugvlog > 0 {
|
|
|
|
|
ctxt.Logf("trampoline %s inserted\n", s)
|
|
|
|
|
}
|
|
|
|
|
}
|