go/src/cmd/compile/internal/test/inl_test.go

397 lines
10 KiB
Go
Raw Normal View History

// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package test
import (
"bufio"
"internal/goexperiment"
"internal/testenv"
"io"
"math/bits"
"regexp"
"runtime"
"strings"
"testing"
)
// TestIntendedInlining tests that specific functions are inlined.
// This allows refactoring for code clarity and re-use without fear that
// changes to the compiler will cause silent performance regressions.
func TestIntendedInlining(t *testing.T) {
if testing.Short() && testenv.Builder() == "" {
t.Skip("skipping in short mode")
}
testenv.MustHaveGoRun(t)
t.Parallel()
// want is the list of function names (by package) that should
// be inlinable. If they have no callers in their packages, they
cmd/compile/internal/gc: inline autogenerated (*T).M wrappers

Currently all inlining of autogenerated wrappers is disabled,
because it causes build failures, when indexed export format is enabled.
Turns out we can reenable it for common case of (*T).M wrappers.
This fixes most performance degradation of 1.11 vs 1.10.

encoding/binary:
name                    old time/op  new time/op  delta
ReadSlice1000Int32s-6   14.8µs ± 2%  11.5µs ± 2%  -22.01%  (p=0.000 n=10+10)
WriteSlice1000Int32s-6  14.8µs ± 2%  11.7µs ± 2%  -20.95%  (p=0.000 n=10+10)

bufio:
name           old time/op  new time/op  delta
WriterFlush-6  32.4ns ± 1%  28.8ns ± 0%  -11.17%  (p=0.000 n=9+10)

sort:
SearchWrappers-6      231ns ± 1%   231ns ± 0%     ~     (p=0.129 n=9+10)
SortString1K-6        365µs ± 1%   298µs ± 1%  -18.43%  (p=0.000 n=9+10)
SortString1K_Slice-6  274µs ± 2%   276µs ± 1%     ~     (p=0.105 n=10+10)
StableString1K-6      490µs ± 1%   373µs ± 1%  -23.73%  (p=0.000 n=10+10)
SortInt1K-6           210µs ± 1%   142µs ± 1%  -32.69%  (p=0.000 n=10+10)
StableInt1K-6         243µs ± 0%   151µs ± 1%  -37.75%  (p=0.000 n=10+10)
StableInt1K_Slice-6   130µs ± 1%   130µs ± 0%     ~     (p=0.237 n=10+8)
SortInt64K-6          19.9ms ± 1%  13.5ms ± 1%  -32.32%  (p=0.000 n=10+10)
SortInt64K_Slice-6    11.5ms ± 1%  11.5ms ± 1%     ~     (p=0.912 n=10+10)
StableInt64K-6        21.5ms ± 0%  13.5ms ± 1%  -37.30%  (p=0.000 n=9+10)
Sort1e2-6             108µs ± 2%   83µs ± 3%   -23.26%  (p=0.000 n=10+10)
Stable1e2-6           218µs ± 0%   161µs ± 1%  -25.99%  (p=0.000 n=8+9)
Sort1e4-6             22.6ms ± 1%  16.8ms ± 0%  -25.45%  (p=0.000 n=10+7)
Stable1e4-6           67.6ms ± 1%  49.7ms ± 0%  -26.48%  (p=0.000 n=10+10)
Sort1e6-6             3.44s ± 0%   2.55s ± 1%  -26.05%  (p=0.000 n=8+9)
Stable1e6-6           13.7s ± 0%   9.9s ± 1%   -27.68%  (p=0.000 n=8+10)

Fixes #27621
Updates #25338

Change-Id: I6fe633202f63fa829a6ab849c44d7e45f8835dff
Reviewed-on: https://go-review.googlesource.com/c/135697
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2018-09-17 14:08:03 -05:00
// might not actually be inlined anywhere.
want := map[string][]string{
"runtime": {
"add",
"acquirem",
"add1",
"addb",
"adjustpanics",
"adjustpointer",
"alignDown",
"alignUp",
"bucketMask",
"bucketShift",
"chanbuf",
"evacuated",
"fastlog2",
"fastrand",
"float64bits",
"funcspdelta",
"getm",
"getMCache",
"isDirectIface",
"itabHashFunc",
"noescape",
"pcvalueCacheKey",
"readUnaligned32",
"readUnaligned64",
"releasem",
"roundupsize",
"stackmapdata",
"stringStructOf",
"subtract1",
"subtractb",
"tophash",
"(*bmap).keys",
"(*bmap).overflow",
"(*waitq).enqueue",
"funcInfo.entry",
// GC-related ones
"cgoInRange",
"gclinkptr.ptr",
"guintptr.ptr",
"writeHeapBitsForAddr",
"markBits.isMarked",
"muintptr.ptr",
"puintptr.ptr",
"spanOf",
"spanOfUnchecked",
"(*gcWork).putFast",
"(*gcWork).tryGetFast",
"(*guintptr).set",
"(*markBits).advance",
"(*mspan).allocBitsForIndex",
"(*mspan).base",
"(*mspan).markBitsForBase",
"(*mspan).markBitsForIndex",
"(*muintptr).set",
"(*puintptr).set",
"(*wbBuf).get1",
"(*wbBuf).get2",
},
"runtime/internal/sys": {},
"runtime/internal/math": {
"MulUintptr",
},
"bytes": {
"(*Buffer).Bytes",
"(*Buffer).Cap",
"(*Buffer).Len",
"(*Buffer).Grow",
"(*Buffer).Next",
"(*Buffer).Read",
"(*Buffer).ReadByte",
"(*Buffer).Reset",
"(*Buffer).String",
"(*Buffer).UnreadByte",
"(*Buffer).tryGrowByReslice",
},
cmd/compile/internal/gc: inline autogenerated (*T).M wrappers Currently all inlining of autogenerated wrappers is disabled, because it causes build failures, when indexed export format is enabled. Turns out we can reenable it for common case of (*T).M wrappers. This fixes most performance degradation of 1.11 vs 1.10. encoding/binary: name old time/op new time/op delta ReadSlice1000Int32s-6 14.8µs ± 2% 11.5µs ± 2% -22.01% (p=0.000 n=10+10) WriteSlice1000Int32s-6 14.8µs ± 2% 11.7µs ± 2% -20.95% (p=0.000 n=10+10) bufio: name old time/op new time/op delta WriterFlush-6 32.4ns ± 1% 28.8ns ± 0% -11.17% (p=0.000 n=9+10) sort: SearchWrappers-6 231ns ± 1% 231ns ± 0% ~ (p=0.129 n=9+10) SortString1K-6 365µs ± 1% 298µs ± 1% -18.43% (p=0.000 n=9+10) SortString1K_Slice-6 274µs ± 2% 276µs ± 1% ~ (p=0.105 n=10+10) StableString1K-6 490µs ± 1% 373µs ± 1% -23.73% (p=0.000 n=10+10) SortInt1K-6 210µs ± 1% 142µs ± 1% -32.69% (p=0.000 n=10+10) StableInt1K-6 243µs ± 0% 151µs ± 1% -37.75% (p=0.000 n=10+10) StableInt1K_Slice-6 130µs ± 1% 130µs ± 0% ~ (p=0.237 n=10+8) SortInt64K-6 19.9ms ± 1% 13.5ms ± 1% -32.32% (p=0.000 n=10+10) SortInt64K_Slice-6 11.5ms ± 1% 11.5ms ± 1% ~ (p=0.912 n=10+10) StableInt64K-6 21.5ms ± 0% 13.5ms ± 1% -37.30% (p=0.000 n=9+10) Sort1e2-6 108µs ± 2% 83µs ± 3% -23.26% (p=0.000 n=10+10) Stable1e2-6 218µs ± 0% 161µs ± 1% -25.99% (p=0.000 n=8+9) Sort1e4-6 22.6ms ± 1% 16.8ms ± 0% -25.45% (p=0.000 n=10+7) Stable1e4-6 67.6ms ± 1% 49.7ms ± 0% -26.48% (p=0.000 n=10+10) Sort1e6-6 3.44s ± 0% 2.55s ± 1% -26.05% (p=0.000 n=8+9) Stable1e6-6 13.7s ± 0% 9.9s ± 1% -27.68% (p=0.000 n=8+10) Fixes #27621 Updates #25338 Change-Id: I6fe633202f63fa829a6ab849c44d7e45f8835dff Reviewed-on: https://go-review.googlesource.com/c/135697 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2018-09-17 14:08:03 -05:00
"compress/flate": {
"byLiteral.Len",
"byLiteral.Less",
"byLiteral.Swap",
"(*dictDecoder).tryWriteCopy",
cmd/compile/internal/gc: inline autogenerated (*T).M wrappers Currently all inlining of autogenerated wrappers is disabled, because it causes build failures, when indexed export format is enabled. Turns out we can reenable it for common case of (*T).M wrappers. This fixes most performance degradation of 1.11 vs 1.10. encoding/binary: name old time/op new time/op delta ReadSlice1000Int32s-6 14.8µs ± 2% 11.5µs ± 2% -22.01% (p=0.000 n=10+10) WriteSlice1000Int32s-6 14.8µs ± 2% 11.7µs ± 2% -20.95% (p=0.000 n=10+10) bufio: name old time/op new time/op delta WriterFlush-6 32.4ns ± 1% 28.8ns ± 0% -11.17% (p=0.000 n=9+10) sort: SearchWrappers-6 231ns ± 1% 231ns ± 0% ~ (p=0.129 n=9+10) SortString1K-6 365µs ± 1% 298µs ± 1% -18.43% (p=0.000 n=9+10) SortString1K_Slice-6 274µs ± 2% 276µs ± 1% ~ (p=0.105 n=10+10) StableString1K-6 490µs ± 1% 373µs ± 1% -23.73% (p=0.000 n=10+10) SortInt1K-6 210µs ± 1% 142µs ± 1% -32.69% (p=0.000 n=10+10) StableInt1K-6 243µs ± 0% 151µs ± 1% -37.75% (p=0.000 n=10+10) StableInt1K_Slice-6 130µs ± 1% 130µs ± 0% ~ (p=0.237 n=10+8) SortInt64K-6 19.9ms ± 1% 13.5ms ± 1% -32.32% (p=0.000 n=10+10) SortInt64K_Slice-6 11.5ms ± 1% 11.5ms ± 1% ~ (p=0.912 n=10+10) StableInt64K-6 21.5ms ± 0% 13.5ms ± 1% -37.30% (p=0.000 n=9+10) Sort1e2-6 108µs ± 2% 83µs ± 3% -23.26% (p=0.000 n=10+10) Stable1e2-6 218µs ± 0% 161µs ± 1% -25.99% (p=0.000 n=8+9) Sort1e4-6 22.6ms ± 1% 16.8ms ± 0% -25.45% (p=0.000 n=10+7) Stable1e4-6 67.6ms ± 1% 49.7ms ± 0% -26.48% (p=0.000 n=10+10) Sort1e6-6 3.44s ± 0% 2.55s ± 1% -26.05% (p=0.000 n=8+9) Stable1e6-6 13.7s ± 0% 9.9s ± 1% -27.68% (p=0.000 n=8+10) Fixes #27621 Updates #25338 Change-Id: I6fe633202f63fa829a6ab849c44d7e45f8835dff Reviewed-on: https://go-review.googlesource.com/c/135697 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2018-09-17 14:08:03 -05:00
},
encoding/base64: speed up the decoder

Most of the decoding time is spent in the first Decode loop, since the
rest of the function only deals with the few remaining bytes. Any
unnecessary work done in that loop body matters tremendously.

One such unnecessary bottleneck was the use of the enc.decodeMap table.
Since enc is a pointer receiver, and the field is used within the
non-inlineable function decode64, the decoder must perform a nil check
at every iteration.

To fix that, move the enc.decodeMap uses to the parent function, where
we can lift the nil check outside the loop. That gives roughly a 15%
speed-up. The function no longer performs decoding per se, so rename it.
While at it, remove the now unnecessary receivers.

An unfortunate side effect of this change is that the loop now contains
eight bounds checks on src instead of just one. However, not having to
slice src plus the nil check removal well outweigh the added cost.

The other piece that made decode64 slow was that it wasn't inlined, and
had multiple branches. Use a simple bitwise-or trick suggested by Roger
Peppe, and collapse the rest of the bitwise logic into a single
expression. Inlinability and the reduced branching give a further 10%
speed-up.

Finally, add these two functions to TestIntendedInlining, since we want
them to stay inlinable.

Apply the same refactor to decode32 for consistency, and to let 32-bit
architectures see a similar performance gain for large inputs.

name                 old time/op    new time/op    delta
DecodeString/2-8     47.3ns ± 1%    45.8ns ± 0%    -3.28%  (p=0.002 n=6+6)
DecodeString/4-8     55.8ns ± 2%    51.5ns ± 0%    -7.71%  (p=0.004 n=5+6)
DecodeString/8-8     64.9ns ± 0%    61.7ns ± 0%    -4.99%  (p=0.004 n=5+6)
DecodeString/64-8    238ns ± 0%     198ns ± 0%    -16.54%  (p=0.002 n=6+6)
DecodeString/8192-8  19.5µs ± 0%    14.6µs ± 0%   -24.96%  (p=0.004 n=6+5)

name                 old speed      new speed      delta
DecodeString/2-8     84.6MB/s ± 1%  87.4MB/s ± 0%   +3.38%  (p=0.002 n=6+6)
DecodeString/4-8     143MB/s ± 2%   155MB/s ± 0%    +8.41%  (p=0.004 n=5+6)
DecodeString/8-8     185MB/s ± 0%   195MB/s ± 0%    +5.29%  (p=0.004 n=5+6)
DecodeString/64-8    369MB/s ± 0%   442MB/s ± 0%   +19.78%  (p=0.002 n=6+6)
DecodeString/8192-8  560MB/s ± 0%   746MB/s ± 0%   +33.27%  (p=0.004 n=6+5)

Updates #19636.

Change-Id: Ib839577b0e3f5a2bb201f5cae580c61365d92894
Reviewed-on: https://go-review.googlesource.com/c/go/+/151177
Run-TryBot: Daniel Martí <mvdan@mvdan.cc>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Reviewed-by: roger peppe <rogpeppe@gmail.com>
2018-11-25 17:30:36 +00:00
"encoding/base64": {
"assemble32",
"assemble64",
},
"unicode/utf8": {
"FullRune",
"FullRuneInString",
"RuneLen",
"AppendRune",
"ValidRune",
},
utf16: reduce utf16.Decode allocations

This CL avoids allocating in utf16.Decode for code point sequences
with less than 64 elements. It does so by splitting the function in two,
one that can be inlined that preallocates a buffer and the other that
does the heavy-lifting.

The mid-stack inliner will allocate the buffer in the caller stack,
and in many cases this will be enough to avoid the allocation.

unicode/utf16 benchmarks:

name                         old time/op    new time/op    delta
DecodeValidASCII-12          60.1ns ± 3%    16.0ns ±20%    -73.40%  (p=0.000 n=8+10)
DecodeValidJapaneseChars-12  61.3ns ±10%    14.9ns ±39%    -75.71%  (p=0.000 n=10+10)

name                         old alloc/op   new alloc/op   delta
DecodeValidASCII-12          48.0B ± 0%     0.0B          -100.00%  (p=0.000 n=10+10)
DecodeValidJapaneseChars-12  48.0B ± 0%     0.0B          -100.00%  (p=0.000 n=10+10)

name                         old allocs/op  new allocs/op  delta
DecodeValidASCII-12          1.00 ± 0%      0.00          -100.00%  (p=0.000 n=10+10)
DecodeValidJapaneseChars-12  1.00 ± 0%      0.00          -100.00%  (p=0.000 n=10+10)

I've also benchmarked os.File.ReadDir with this change applied
to demonstrate that it does make a difference in the caller site,
in this case via syscall.UTF16ToString:

name        old time/op    new time/op    delta
ReadDir-12  592µs ± 8%     620µs ±16%       ~     (p=0.280 n=10+10)

name        old alloc/op   new alloc/op   delta
ReadDir-12  30.4kB ± 0%    22.4kB ± 0%   -26.10%  (p=0.000 n=8+10)

name        old allocs/op  new allocs/op  delta
ReadDir-12  402 ± 0%       272 ± 0%      -32.34%  (p=0.000 n=10+10)

Change-Id: I65cf5caa3fd3b3a466c0ed837a50a96e975bbe6b
Reviewed-on: https://go-review.googlesource.com/c/go/+/453415
Reviewed-by: Damien Neil <dneil@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Alex Brainman <alex.brainman@gmail.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Quim Muntal <quimmuntal@gmail.com>
2022-11-25 14:22:36 +01:00
"unicode/utf16": {
"Decode",
},
"reflect": {
reflect: make more Value methods inlineable

The following Value methods are now inlineable:

    Bool    for ~bool
    String  for ~string (but not other kinds)
    Bytes   for []byte (but not ~[]byte or ~[N]byte)
    Len     for ~[]T (but not ~[N]T, ~chan T, ~map[K]V, or ~string)
    Cap     for ~[]T (but not ~[N]T or ~chan T)

For Bytes, we only have enough inline budget to inline one type,
so we optimize for unnamed []byte, which is far more common than
named []byte or [N]byte.

For Len and Cap, we only have enough inline budget to inline one kind,
so we optimize for ~[]T, which is more common than the others.
The exception is string, but the size of a string can be obtained
through len(v.String()).

Performance:

    Bool        1.65ns ± 0%  0.51ns ± 3%  -68.81%  (p=0.008 n=5+5)
    String      1.97ns ± 1%  0.70ns ± 1%  -64.25%  (p=0.008 n=5+5)
    Bytes       8.90ns ± 2%  0.89ns ± 1%  -89.95%  (p=0.008 n=5+5)
    NamedBytes  8.89ns ± 1%  8.88ns ± 1%     ~     (p=0.548 n=5+5)
    BytesArray  10.0ns ± 2%  10.2ns ± 1%   +1.58%  (p=0.048 n=5+5)
    SliceLen    1.97ns ± 1%  0.45ns ± 1%  -77.22%  (p=0.008 n=5+5)
    MapLen      2.62ns ± 1%  3.07ns ± 1%  +17.24%  (p=0.008 n=5+5)
    StringLen   1.96ns ± 1%  1.98ns ± 2%     ~     (p=0.151 n=5+5)
    ArrayLen    1.96ns ± 1%  2.19ns ± 1%  +11.46%  (p=0.008 n=5+5)
    SliceCap    1.76ns ± 1%  0.45ns ± 2%  -74.28%  (p=0.008 n=5+5)

There's a slight slowdown (~10-20%) for obtaining the length
of a string or map, but a substantial improvement for slices.

Performance according to encoding/json:

    CodeMarshal         555µs ± 2%   562µs ± 4%      ~     (p=0.421 n=5+5)
    MarshalBytes/32     163ns ± 1%   157ns ± 1%    -3.82%  (p=0.008 n=5+5)
    MarshalBytes/256    453ns ± 1%   447ns ± 1%      ~     (p=0.056 n=5+5)
    MarshalBytes/4096   4.10µs ± 1%  4.09µs ± 0%     ~     (p=1.000 n=5+4)
    CodeUnmarshal       3.16ms ± 2%  3.02ms ± 1%   -4.18%  (p=0.008 n=5+5)
    CodeUnmarshalReuse  2.64ms ± 3%  2.51ms ± 2%   -4.81%  (p=0.016 n=5+5)
    UnmarshalString     65.4ns ± 4%  64.1ns ± 0%     ~     (p=0.190 n=5+4)
    UnmarshalFloat64    59.8ns ± 5%  58.9ns ± 2%     ~     (p=0.222 n=5+5)
    UnmarshalInt64      51.7ns ± 1%  50.0ns ± 2%   -3.26%  (p=0.008 n=5+5)
    EncodeMarshaler     23.6ns ±11%  20.8ns ± 1%  -12.10%  (p=0.016 n=5+4)

Add all inlineable methods of Value to cmd/compile/internal/test/inl_test.go.

Change-Id: Ifc192491918af6b62f7fe3a094a5a5256bfb326d
Reviewed-on: https://go-review.googlesource.com/c/go/+/400676
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Run-TryBot: Ian Lance Taylor <iant@google.com>
Auto-Submit: Ian Lance Taylor <iant@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
2022-04-16 19:01:48 -07:00
"Value.Bool",
"Value.Bytes",
"Value.CanAddr",
reflect: make more Value methods inlineable The following Value methods are now inlineable: Bool for ~bool String for ~string (but not other kinds) Bytes for []byte (but not ~[]byte or ~[N]byte) Len for ~[]T (but not ~[N]T, ~chan T, ~map[K]V, or ~string) Cap for ~[]T (but not ~[N]T or ~chan T) For Bytes, we only have enough inline budget to inline one type, so we optimize for unnamed []byte, which is far more common than named []byte or [N]byte. For Len and Cap, we only have enough inline budget to inline one kind, so we optimize for ~[]T, which is more common than the others. The exception is string, but the size of a string can be obtained through len(v.String()). Performance: Bool 1.65ns ± 0% 0.51ns ± 3% -68.81% (p=0.008 n=5+5) String 1.97ns ± 1% 0.70ns ± 1% -64.25% (p=0.008 n=5+5) Bytes 8.90ns ± 2% 0.89ns ± 1% -89.95% (p=0.008 n=5+5) NamedBytes 8.89ns ± 1% 8.88ns ± 1% ~ (p=0.548 n=5+5) BytesArray 10.0ns ± 2% 10.2ns ± 1% +1.58% (p=0.048 n=5+5) SliceLen 1.97ns ± 1% 0.45ns ± 1% -77.22% (p=0.008 n=5+5) MapLen 2.62ns ± 1% 3.07ns ± 1% +17.24% (p=0.008 n=5+5) StringLen 1.96ns ± 1% 1.98ns ± 2% ~ (p=0.151 n=5+5) ArrayLen 1.96ns ± 1% 2.19ns ± 1% +11.46% (p=0.008 n=5+5) SliceCap 1.76ns ± 1% 0.45ns ± 2% -74.28% (p=0.008 n=5+5) There's a slight slowdown (~10-20%) for obtaining the length of a string or map, but a substantial improvement for slices. 
Performance according to encoding/json: CodeMarshal 555µs ± 2% 562µs ± 4% ~ (p=0.421 n=5+5) MarshalBytes/32 163ns ± 1% 157ns ± 1% -3.82% (p=0.008 n=5+5) MarshalBytes/256 453ns ± 1% 447ns ± 1% ~ (p=0.056 n=5+5) MarshalBytes/4096 4.10µs ± 1% 4.09µs ± 0% ~ (p=1.000 n=5+4) CodeUnmarshal 3.16ms ± 2% 3.02ms ± 1% -4.18% (p=0.008 n=5+5) CodeUnmarshalReuse 2.64ms ± 3% 2.51ms ± 2% -4.81% (p=0.016 n=5+5) UnmarshalString 65.4ns ± 4% 64.1ns ± 0% ~ (p=0.190 n=5+4) UnmarshalFloat64 59.8ns ± 5% 58.9ns ± 2% ~ (p=0.222 n=5+5) UnmarshalInt64 51.7ns ± 1% 50.0ns ± 2% -3.26% (p=0.008 n=5+5) EncodeMarshaler 23.6ns ±11% 20.8ns ± 1% -12.10% (p=0.016 n=5+4) Add all inlineable methods of Value to cmd/compile/internal/test/inl_test.go. Change-Id: Ifc192491918af6b62f7fe3a094a5a5256bfb326d Reviewed-on: https://go-review.googlesource.com/c/go/+/400676 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Reviewed-by: Ian Lance Taylor <iant@google.com> Run-TryBot: Ian Lance Taylor <iant@google.com> Auto-Submit: Ian Lance Taylor <iant@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
2022-04-16 19:01:48 -07:00
"Value.CanComplex",
"Value.CanFloat",
"Value.CanInt",
"Value.CanInterface",
reflect: make more Value methods inlineable The following Value methods are now inlineable: Bool for ~bool String for ~string (but not other kinds) Bytes for []byte (but not ~[]byte or ~[N]byte) Len for ~[]T (but not ~[N]T, ~chan T, ~map[K]V, or ~string) Cap for ~[]T (but not ~[N]T or ~chan T) For Bytes, we only have enough inline budget to inline one type, so we optimize for unnamed []byte, which is far more common than named []byte or [N]byte. For Len and Cap, we only have enough inline budget to inline one kind, so we optimize for ~[]T, which is more common than the others. The exception is string, but the size of a string can be obtained through len(v.String()). Performance: Bool 1.65ns ± 0% 0.51ns ± 3% -68.81% (p=0.008 n=5+5) String 1.97ns ± 1% 0.70ns ± 1% -64.25% (p=0.008 n=5+5) Bytes 8.90ns ± 2% 0.89ns ± 1% -89.95% (p=0.008 n=5+5) NamedBytes 8.89ns ± 1% 8.88ns ± 1% ~ (p=0.548 n=5+5) BytesArray 10.0ns ± 2% 10.2ns ± 1% +1.58% (p=0.048 n=5+5) SliceLen 1.97ns ± 1% 0.45ns ± 1% -77.22% (p=0.008 n=5+5) MapLen 2.62ns ± 1% 3.07ns ± 1% +17.24% (p=0.008 n=5+5) StringLen 1.96ns ± 1% 1.98ns ± 2% ~ (p=0.151 n=5+5) ArrayLen 1.96ns ± 1% 2.19ns ± 1% +11.46% (p=0.008 n=5+5) SliceCap 1.76ns ± 1% 0.45ns ± 2% -74.28% (p=0.008 n=5+5) There's a slight slowdown (~10-20%) for obtaining the length of a string or map, but a substantial improvement for slices. 
Performance according to encoding/json: CodeMarshal 555µs ± 2% 562µs ± 4% ~ (p=0.421 n=5+5) MarshalBytes/32 163ns ± 1% 157ns ± 1% -3.82% (p=0.008 n=5+5) MarshalBytes/256 453ns ± 1% 447ns ± 1% ~ (p=0.056 n=5+5) MarshalBytes/4096 4.10µs ± 1% 4.09µs ± 0% ~ (p=1.000 n=5+4) CodeUnmarshal 3.16ms ± 2% 3.02ms ± 1% -4.18% (p=0.008 n=5+5) CodeUnmarshalReuse 2.64ms ± 3% 2.51ms ± 2% -4.81% (p=0.016 n=5+5) UnmarshalString 65.4ns ± 4% 64.1ns ± 0% ~ (p=0.190 n=5+4) UnmarshalFloat64 59.8ns ± 5% 58.9ns ± 2% ~ (p=0.222 n=5+5) UnmarshalInt64 51.7ns ± 1% 50.0ns ± 2% -3.26% (p=0.008 n=5+5) EncodeMarshaler 23.6ns ±11% 20.8ns ± 1% -12.10% (p=0.016 n=5+4) Add all inlineable methods of Value to cmd/compile/internal/test/inl_test.go. Change-Id: Ifc192491918af6b62f7fe3a094a5a5256bfb326d Reviewed-on: https://go-review.googlesource.com/c/go/+/400676 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Reviewed-by: Ian Lance Taylor <iant@google.com> Run-TryBot: Ian Lance Taylor <iant@google.com> Auto-Submit: Ian Lance Taylor <iant@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
2022-04-16 19:01:48 -07:00
"Value.CanSet",
"Value.CanUint",
"Value.Cap",
"Value.Complex",
"Value.Float",
"Value.Int",
"Value.Interface",
"Value.IsNil",
"Value.IsValid",
reflect: make more Value methods inlineable The following Value methods are now inlineable: Bool for ~bool String for ~string (but not other kinds) Bytes for []byte (but not ~[]byte or ~[N]byte) Len for ~[]T (but not ~[N]T, ~chan T, ~map[K]V, or ~string) Cap for ~[]T (but not ~[N]T or ~chan T) For Bytes, we only have enough inline budget to inline one type, so we optimize for unnamed []byte, which is far more common than named []byte or [N]byte. For Len and Cap, we only have enough inline budget to inline one kind, so we optimize for ~[]T, which is more common than the others. The exception is string, but the size of a string can be obtained through len(v.String()). Performance: Bool 1.65ns ± 0% 0.51ns ± 3% -68.81% (p=0.008 n=5+5) String 1.97ns ± 1% 0.70ns ± 1% -64.25% (p=0.008 n=5+5) Bytes 8.90ns ± 2% 0.89ns ± 1% -89.95% (p=0.008 n=5+5) NamedBytes 8.89ns ± 1% 8.88ns ± 1% ~ (p=0.548 n=5+5) BytesArray 10.0ns ± 2% 10.2ns ± 1% +1.58% (p=0.048 n=5+5) SliceLen 1.97ns ± 1% 0.45ns ± 1% -77.22% (p=0.008 n=5+5) MapLen 2.62ns ± 1% 3.07ns ± 1% +17.24% (p=0.008 n=5+5) StringLen 1.96ns ± 1% 1.98ns ± 2% ~ (p=0.151 n=5+5) ArrayLen 1.96ns ± 1% 2.19ns ± 1% +11.46% (p=0.008 n=5+5) SliceCap 1.76ns ± 1% 0.45ns ± 2% -74.28% (p=0.008 n=5+5) There's a slight slowdown (~10-20%) for obtaining the length of a string or map, but a substantial improvement for slices. 
Performance according to encoding/json: CodeMarshal 555µs ± 2% 562µs ± 4% ~ (p=0.421 n=5+5) MarshalBytes/32 163ns ± 1% 157ns ± 1% -3.82% (p=0.008 n=5+5) MarshalBytes/256 453ns ± 1% 447ns ± 1% ~ (p=0.056 n=5+5) MarshalBytes/4096 4.10µs ± 1% 4.09µs ± 0% ~ (p=1.000 n=5+4) CodeUnmarshal 3.16ms ± 2% 3.02ms ± 1% -4.18% (p=0.008 n=5+5) CodeUnmarshalReuse 2.64ms ± 3% 2.51ms ± 2% -4.81% (p=0.016 n=5+5) UnmarshalString 65.4ns ± 4% 64.1ns ± 0% ~ (p=0.190 n=5+4) UnmarshalFloat64 59.8ns ± 5% 58.9ns ± 2% ~ (p=0.222 n=5+5) UnmarshalInt64 51.7ns ± 1% 50.0ns ± 2% -3.26% (p=0.008 n=5+5) EncodeMarshaler 23.6ns ±11% 20.8ns ± 1% -12.10% (p=0.016 n=5+4) Add all inlineable methods of Value to cmd/compile/internal/test/inl_test.go. Change-Id: Ifc192491918af6b62f7fe3a094a5a5256bfb326d Reviewed-on: https://go-review.googlesource.com/c/go/+/400676 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Reviewed-by: Ian Lance Taylor <iant@google.com> Run-TryBot: Ian Lance Taylor <iant@google.com> Auto-Submit: Ian Lance Taylor <iant@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
2022-04-16 19:01:48 -07:00
"Value.Kind",
"Value.Len",
"Value.MapRange",
reflect: make more Value methods inlineable The following Value methods are now inlineable: Bool for ~bool String for ~string (but not other kinds) Bytes for []byte (but not ~[]byte or ~[N]byte) Len for ~[]T (but not ~[N]T, ~chan T, ~map[K]V, or ~string) Cap for ~[]T (but not ~[N]T or ~chan T) For Bytes, we only have enough inline budget to inline one type, so we optimize for unnamed []byte, which is far more common than named []byte or [N]byte. For Len and Cap, we only have enough inline budget to inline one kind, so we optimize for ~[]T, which is more common than the others. The exception is string, but the size of a string can be obtained through len(v.String()). Performance: Bool 1.65ns ± 0% 0.51ns ± 3% -68.81% (p=0.008 n=5+5) String 1.97ns ± 1% 0.70ns ± 1% -64.25% (p=0.008 n=5+5) Bytes 8.90ns ± 2% 0.89ns ± 1% -89.95% (p=0.008 n=5+5) NamedBytes 8.89ns ± 1% 8.88ns ± 1% ~ (p=0.548 n=5+5) BytesArray 10.0ns ± 2% 10.2ns ± 1% +1.58% (p=0.048 n=5+5) SliceLen 1.97ns ± 1% 0.45ns ± 1% -77.22% (p=0.008 n=5+5) MapLen 2.62ns ± 1% 3.07ns ± 1% +17.24% (p=0.008 n=5+5) StringLen 1.96ns ± 1% 1.98ns ± 2% ~ (p=0.151 n=5+5) ArrayLen 1.96ns ± 1% 2.19ns ± 1% +11.46% (p=0.008 n=5+5) SliceCap 1.76ns ± 1% 0.45ns ± 2% -74.28% (p=0.008 n=5+5) There's a slight slowdown (~10-20%) for obtaining the length of a string or map, but a substantial improvement for slices. 
Performance according to encoding/json: CodeMarshal 555µs ± 2% 562µs ± 4% ~ (p=0.421 n=5+5) MarshalBytes/32 163ns ± 1% 157ns ± 1% -3.82% (p=0.008 n=5+5) MarshalBytes/256 453ns ± 1% 447ns ± 1% ~ (p=0.056 n=5+5) MarshalBytes/4096 4.10µs ± 1% 4.09µs ± 0% ~ (p=1.000 n=5+4) CodeUnmarshal 3.16ms ± 2% 3.02ms ± 1% -4.18% (p=0.008 n=5+5) CodeUnmarshalReuse 2.64ms ± 3% 2.51ms ± 2% -4.81% (p=0.016 n=5+5) UnmarshalString 65.4ns ± 4% 64.1ns ± 0% ~ (p=0.190 n=5+4) UnmarshalFloat64 59.8ns ± 5% 58.9ns ± 2% ~ (p=0.222 n=5+5) UnmarshalInt64 51.7ns ± 1% 50.0ns ± 2% -3.26% (p=0.008 n=5+5) EncodeMarshaler 23.6ns ±11% 20.8ns ± 1% -12.10% (p=0.016 n=5+4) Add all inlineable methods of Value to cmd/compile/internal/test/inl_test.go. Change-Id: Ifc192491918af6b62f7fe3a094a5a5256bfb326d Reviewed-on: https://go-review.googlesource.com/c/go/+/400676 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Reviewed-by: Ian Lance Taylor <iant@google.com> Run-TryBot: Ian Lance Taylor <iant@google.com> Auto-Submit: Ian Lance Taylor <iant@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
2022-04-16 19:01:48 -07:00
"Value.OverflowComplex",
"Value.OverflowFloat",
"Value.OverflowInt",
"Value.OverflowUint",
"Value.String",
"Value.Type",
"Value.Uint",
"Value.UnsafeAddr",
"Value.pointer",
"add",
"align",
"flag.mustBe",
"flag.mustBeAssignable",
"flag.mustBeExported",
"flag.kind",
"flag.ro",
},
regexp: make (*bitState).push inlinable By refactoring job.arg from int with 0/1 as the only valid values into bool and simplifying (*bitState).push, we reduce the number of nodes below the inlining threshold. This improves backtracking regexp performance by 5-10% and go1 geomean by 1.7% Full performance data below: name old time/op new time/op delta Find-6 510ns ± 0% 480ns ± 1% -5.90% (p=0.000 n=10+10) FindString-6 504ns ± 1% 479ns ± 1% -5.10% (p=0.000 n=10+10) FindSubmatch-6 689ns ± 1% 659ns ± 1% -4.27% (p=0.000 n=9+10) FindStringSubmatch-6 659ns ± 0% 628ns ± 1% -4.69% (p=0.000 n=8+10) Literal-6 174ns ± 1% 171ns ± 1% -1.50% (p=0.000 n=10+10) NotLiteral-6 2.89µs ± 1% 2.72µs ± 0% -5.84% (p=0.000 n=10+9) MatchClass-6 4.65µs ± 1% 4.28µs ± 1% -7.96% (p=0.000 n=10+10) MatchClass_InRange-6 4.15µs ± 1% 3.80µs ± 0% -8.61% (p=0.000 n=10+8) ReplaceAll-6 2.72µs ± 1% 2.60µs ± 1% -4.68% (p=0.000 n=10+10) AnchoredLiteralShortNonMatch-6 158ns ± 1% 153ns ± 1% -3.03% (p=0.000 n=10+10) AnchoredLiteralLongNonMatch-6 176ns ± 1% 176ns ± 0% ~ (p=1.000 n=10+9) AnchoredShortMatch-6 260ns ± 0% 255ns ± 1% -1.84% (p=0.000 n=9+10) AnchoredLongMatch-6 456ns ± 0% 455ns ± 0% -0.19% (p=0.008 n=8+10) OnePassShortA-6 1.13µs ± 1% 1.12µs ± 0% -0.57% (p=0.046 n=10+8) NotOnePassShortA-6 1.14µs ± 1% 1.14µs ± 1% ~ (p=0.162 n=10+10) OnePassShortB-6 908ns ± 0% 893ns ± 0% -1.60% (p=0.000 n=8+9) NotOnePassShortB-6 857ns ± 0% 803ns ± 1% -6.34% (p=0.000 n=8+10) OnePassLongPrefix-6 190ns ± 0% 190ns ± 1% ~ (p=0.059 n=8+10) OnePassLongNotPrefix-6 722ns ± 1% 722ns ± 1% ~ (p=0.451 n=10+10) MatchParallelShared-6 810ns ± 2% 807ns ± 2% ~ (p=0.643 n=10+10) MatchParallelCopied-6 72.1ns ± 1% 69.4ns ± 1% -3.81% (p=0.000 n=10+10) QuoteMetaAll-6 213ns ± 2% 216ns ± 3% ~ (p=0.284 n=10+10) QuoteMetaNone-6 89.7ns ± 1% 89.8ns ± 1% ~ (p=0.616 n=10+10) Match/Easy0/32-6 127ns ± 1% 127ns ± 1% ~ (p=0.977 n=10+10) Match/Easy0/1K-6 566ns ± 0% 566ns ± 0% ~ (p=1.000 n=8+8) Match/Easy0/32K-6 9.30µs ± 1% 9.28µs ± 1% ~ (p=0.529 n=10+10) 
Match/Easy0/1M-6 460µs ± 1% 460µs ± 1% ~ (p=0.853 n=10+10) Match/Easy0/32M-6 15.0ms ± 0% 15.1ms ± 0% +0.77% (p=0.000 n=9+8) Match/Easy0i/32-6 2.10µs ± 1% 1.98µs ± 0% -6.02% (p=0.000 n=10+8) Match/Easy0i/1K-6 61.5µs ± 0% 57.2µs ± 0% -6.97% (p=0.000 n=10+9) Match/Easy0i/32K-6 2.75ms ± 0% 2.72ms ± 0% -1.10% (p=0.000 n=9+9) Match/Easy0i/1M-6 88.0ms ± 0% 86.9ms ± 1% -1.29% (p=0.000 n=8+10) Match/Easy0i/32M-6 2.82s ± 0% 2.77s ± 1% -1.81% (p=0.000 n=8+10) Match/Easy1/32-6 123ns ± 1% 124ns ± 1% +0.90% (p=0.001 n=10+10) Match/Easy1/1K-6 1.70µs ± 1% 1.65µs ± 0% -3.18% (p=0.000 n=9+10) Match/Easy1/32K-6 69.1µs ± 0% 68.4µs ± 1% -0.95% (p=0.000 n=8+10) Match/Easy1/1M-6 2.46ms ± 1% 2.42ms ± 1% -1.66% (p=0.000 n=10+10) Match/Easy1/32M-6 78.4ms ± 1% 77.5ms ± 0% -1.08% (p=0.000 n=10+9) Match/Medium/32-6 2.07µs ± 1% 1.91µs ± 1% -7.69% (p=0.000 n=10+10) Match/Medium/1K-6 62.8µs ± 0% 58.0µs ± 1% -7.70% (p=0.000 n=8+10) Match/Medium/32K-6 2.63ms ± 1% 2.58ms ± 1% -2.14% (p=0.000 n=10+10) Match/Medium/1M-6 84.6ms ± 0% 82.5ms ± 0% -2.37% (p=0.000 n=8+9) Match/Medium/32M-6 2.71s ± 0% 2.64s ± 0% -2.46% (p=0.000 n=10+9) Match/Hard/32-6 3.26µs ± 1% 2.98µs ± 1% -8.49% (p=0.000 n=10+10) Match/Hard/1K-6 100µs ± 0% 90µs ± 1% -9.55% (p=0.000 n=9+10) Match/Hard/32K-6 3.82ms ± 0% 3.82ms ± 1% ~ (p=0.515 n=8+10) Match/Hard/1M-6 122ms ± 1% 123ms ± 0% +0.66% (p=0.000 n=10+8) Match/Hard/32M-6 3.89s ± 1% 3.91s ± 1% ~ (p=0.105 n=10+10) Match/Hard1/32-6 18.1µs ± 1% 16.1µs ± 1% -11.31% (p=0.000 n=10+10) Match/Hard1/1K-6 565µs ± 0% 493µs ± 1% -12.65% (p=0.000 n=8+10) Match/Hard1/32K-6 18.8ms ± 0% 18.8ms ± 1% ~ (p=0.905 n=9+10) Match/Hard1/1M-6 602ms ± 1% 602ms ± 1% ~ (p=0.278 n=9+10) Match/Hard1/32M-6 19.1s ± 1% 19.2s ± 1% +0.31% (p=0.035 n=9+10) Match_onepass_regex/32-6 6.32µs ± 1% 6.34µs ± 1% ~ (p=0.060 n=10+10) Match_onepass_regex/1K-6 204µs ± 1% 204µs ± 1% ~ (p=0.842 n=9+10) Match_onepass_regex/32K-6 6.53ms ± 0% 6.55ms ± 1% +0.36% (p=0.005 n=10+10) Match_onepass_regex/1M-6 209ms ± 0% 208ms ± 1% -0.65% 
(p=0.034 n=8+10) Match_onepass_regex/32M-6 6.72s ± 0% 6.68s ± 1% -0.74% (p=0.000 n=9+10) CompileOnepass/^(?:(?:(?:.(?:$))?))...-6 7.02µs ± 1% 7.02µs ± 1% ~ (p=0.671 n=10+10) CompileOnepass/^abcd$-6 5.65µs ± 1% 5.65µs ± 1% ~ (p=0.411 n=10+9) CompileOnepass/^(?:(?:a{0,})*?)$-6 7.06µs ± 1% 7.06µs ± 1% ~ (p=0.912 n=10+10) CompileOnepass/^(?:(?:a+)*)$-6 6.40µs ± 1% 6.41µs ± 1% ~ (p=0.699 n=10+10) CompileOnepass/^(?:(?:a|(?:aa)))$-6 8.18µs ± 2% 8.16µs ± 1% ~ (p=0.529 n=10+10) CompileOnepass/^(?:[^\s\S])$-6 5.08µs ± 1% 5.17µs ± 1% +1.77% (p=0.000 n=9+10) CompileOnepass/^(?:(?:(?:a*)+))$-6 6.86µs ± 1% 6.85µs ± 0% ~ (p=0.190 n=10+9) CompileOnepass/^[a-c]+$-6 5.14µs ± 1% 5.11µs ± 0% -0.53% (p=0.041 n=10+10) CompileOnepass/^[a-c]*$-6 5.62µs ± 1% 5.63µs ± 1% ~ (p=0.382 n=10+10) CompileOnepass/^(?:a*)$-6 5.76µs ± 1% 5.73µs ± 1% -0.41% (p=0.008 n=9+10) CompileOnepass/^(?:(?:aa)|a)$-6 7.89µs ± 1% 7.84µs ± 1% -0.66% (p=0.020 n=10+10) CompileOnepass/^...$-6 5.38µs ± 1% 5.38µs ± 1% ~ (p=0.857 n=9+10) CompileOnepass/^(?:a|(?:aa))$-6 7.80µs ± 2% 7.82µs ± 1% ~ (p=0.342 n=10+10) CompileOnepass/^a((b))c$-6 7.75µs ± 1% 7.78µs ± 1% ~ (p=0.172 n=10+10) CompileOnepass/^a.[l-nA-Cg-j]?e$-6 8.39µs ± 1% 8.42µs ± 1% ~ (p=0.138 n=10+10) CompileOnepass/^a((b))$-6 6.92µs ± 1% 6.95µs ± 1% ~ (p=0.159 n=10+10) CompileOnepass/^a(?:(b)|(c))c$-6 10.0µs ± 1% 10.0µs ± 1% ~ (p=0.896 n=10+10) CompileOnepass/^a(?:b|c)$-6 5.62µs ± 1% 5.66µs ± 1% +0.71% (p=0.023 n=10+10) CompileOnepass/^a(?:b?|c)$-6 8.49µs ± 1% 8.43µs ± 1% -0.69% (p=0.010 n=10+10) CompileOnepass/^a(?:b?|c+)$-6 9.26µs ± 1% 9.28µs ± 1% ~ (p=0.448 n=10+10) CompileOnepass/^a(?:bc)+$-6 6.52µs ± 1% 6.46µs ± 2% -1.02% (p=0.003 n=10+10) CompileOnepass/^a(?:[bcd])+$-6 6.29µs ± 1% 6.32µs ± 1% ~ (p=0.256 n=10+10) CompileOnepass/^a((?:[bcd])+)$-6 7.77µs ± 1% 7.79µs ± 1% ~ (p=0.105 n=10+10) CompileOnepass/^a(:?b|c)*d$-6 14.0µs ± 1% 13.9µs ± 1% -0.69% (p=0.003 n=10+10) CompileOnepass/^.bc(d|e)*$-6 8.96µs ± 1% 9.06µs ± 1% +1.20% (p=0.000 n=10+9) 
CompileOnepass/^loooooooooooooooooo...-6 219µs ± 1% 220µs ± 1% +0.63% (p=0.006 n=9+10) [Geo mean] 31.6µs 31.1µs -1.82% name old speed new speed delta QuoteMetaAll-6 65.5MB/s ± 2% 64.8MB/s ± 3% ~ (p=0.315 n=10+10) QuoteMetaNone-6 290MB/s ± 1% 290MB/s ± 1% ~ (p=0.755 n=10+10) Match/Easy0/32-6 250MB/s ± 0% 251MB/s ± 1% ~ (p=0.277 n=8+9) Match/Easy0/1K-6 1.81GB/s ± 0% 1.81GB/s ± 0% ~ (p=0.408 n=8+10) Match/Easy0/32K-6 3.52GB/s ± 1% 3.53GB/s ± 1% ~ (p=0.529 n=10+10) Match/Easy0/1M-6 2.28GB/s ± 1% 2.28GB/s ± 1% ~ (p=0.853 n=10+10) Match/Easy0/32M-6 2.24GB/s ± 0% 2.23GB/s ± 0% -0.76% (p=0.000 n=9+8) Match/Easy0i/32-6 15.2MB/s ± 1% 16.2MB/s ± 0% +6.43% (p=0.000 n=10+9) Match/Easy0i/1K-6 16.6MB/s ± 0% 17.9MB/s ± 0% +7.48% (p=0.000 n=10+9) Match/Easy0i/32K-6 11.9MB/s ± 0% 12.0MB/s ± 0% +1.11% (p=0.000 n=9+9) Match/Easy0i/1M-6 11.9MB/s ± 0% 12.1MB/s ± 1% +1.31% (p=0.000 n=8+10) Match/Easy0i/32M-6 11.9MB/s ± 0% 12.1MB/s ± 1% +1.84% (p=0.000 n=8+10) Match/Easy1/32-6 260MB/s ± 1% 258MB/s ± 1% -0.91% (p=0.001 n=10+10) Match/Easy1/1K-6 601MB/s ± 1% 621MB/s ± 0% +3.28% (p=0.000 n=9+10) Match/Easy1/32K-6 474MB/s ± 0% 479MB/s ± 1% +0.96% (p=0.000 n=8+10) Match/Easy1/1M-6 426MB/s ± 1% 433MB/s ± 1% +1.68% (p=0.000 n=10+10) Match/Easy1/32M-6 428MB/s ± 1% 433MB/s ± 0% +1.09% (p=0.000 n=10+9) Match/Medium/32-6 15.4MB/s ± 1% 16.7MB/s ± 1% +8.23% (p=0.000 n=10+9) Match/Medium/1K-6 16.3MB/s ± 1% 17.7MB/s ± 1% +8.43% (p=0.000 n=9+10) Match/Medium/32K-6 12.5MB/s ± 1% 12.7MB/s ± 1% +2.15% (p=0.000 n=10+10) Match/Medium/1M-6 12.4MB/s ± 0% 12.7MB/s ± 0% +2.44% (p=0.000 n=8+9) Match/Medium/32M-6 12.4MB/s ± 0% 12.7MB/s ± 0% +2.52% (p=0.000 n=10+9) Match/Hard/32-6 9.82MB/s ± 1% 10.73MB/s ± 1% +9.29% (p=0.000 n=10+10) Match/Hard/1K-6 10.2MB/s ± 0% 11.3MB/s ± 1% +10.56% (p=0.000 n=9+10) Match/Hard/32K-6 8.58MB/s ± 0% 8.58MB/s ± 1% ~ (p=0.554 n=8+10) Match/Hard/1M-6 8.59MB/s ± 1% 8.53MB/s ± 0% -0.70% (p=0.000 n=10+8) Match/Hard/32M-6 8.62MB/s ± 1% 8.59MB/s ± 1% ~ (p=0.098 n=10+10) Match/Hard1/32-6 
1.77MB/s ± 1% 1.99MB/s ± 1% +12.40% (p=0.000 n=10+8) Match/Hard1/1K-6 1.81MB/s ± 1% 2.08MB/s ± 1% +14.55% (p=0.000 n=10+10) Match/Hard1/32K-6 1.74MB/s ± 0% 1.74MB/s ± 0% ~ (p=0.108 n=9+10) Match/Hard1/1M-6 1.74MB/s ± 0% 1.74MB/s ± 1% ~ (p=1.000 n=9+10) Match/Hard1/32M-6 1.75MB/s ± 0% 1.75MB/s ± 1% ~ (p=0.157 n=9+10) Match_onepass_regex/32-6 5.05MB/s ± 0% 5.05MB/s ± 1% ~ (p=0.262 n=8+10) Match_onepass_regex/1K-6 5.02MB/s ± 1% 5.02MB/s ± 1% ~ (p=0.677 n=9+10) Match_onepass_regex/32K-6 5.02MB/s ± 0% 4.99MB/s ± 0% -0.47% (p=0.000 n=10+9) Match_onepass_regex/1M-6 5.01MB/s ± 0% 5.04MB/s ± 1% +0.68% (p=0.017 n=8+10) Match_onepass_regex/32M-6 4.99MB/s ± 0% 5.03MB/s ± 1% +0.74% (p=0.000 n=10+10) [Geo mean] 29.1MB/s 29.8MB/s +2.44% go1 data for reference name old time/op new time/op delta BinaryTree17-6 4.39s ± 1% 4.37s ± 0% -0.58% (p=0.006 n=9+9) Fannkuch11-6 5.13s ± 0% 5.18s ± 0% +0.87% (p=0.000 n=8+8) FmtFprintfEmpty-6 74.2ns ± 0% 71.7ns ± 3% -3.41% (p=0.000 n=10+10) FmtFprintfString-6 120ns ± 1% 122ns ± 2% ~ (p=0.333 n=10+10) FmtFprintfInt-6 127ns ± 1% 127ns ± 1% ~ (p=0.809 n=10+10) FmtFprintfIntInt-6 186ns ± 0% 188ns ± 1% +1.02% (p=0.002 n=8+10) FmtFprintfPrefixedInt-6 223ns ± 1% 222ns ± 2% ~ (p=0.421 n=10+10) FmtFprintfFloat-6 374ns ± 0% 376ns ± 1% +0.43% (p=0.030 n=8+10) FmtManyArgs-6 795ns ± 0% 788ns ± 1% -0.79% (p=0.000 n=8+9) GobDecode-6 10.9ms ± 1% 10.9ms ± 0% ~ (p=0.079 n=10+9) GobEncode-6 8.60ms ± 1% 8.56ms ± 0% -0.52% (p=0.004 n=10+10) Gzip-6 378ms ± 1% 386ms ± 1% +2.28% (p=0.000 n=10+10) Gunzip-6 63.7ms ± 0% 62.3ms ± 0% -2.22% (p=0.000 n=9+8) HTTPClientServer-6 120µs ± 3% 114µs ± 3% -4.99% (p=0.000 n=10+10) JSONEncode-6 20.3ms ± 1% 19.9ms ± 0% -1.90% (p=0.000 n=9+10) JSONDecode-6 84.3ms ± 0% 83.7ms ± 0% -0.76% (p=0.000 n=8+8) Mandelbrot200-6 6.91ms ± 0% 6.89ms ± 0% -0.31% (p=0.000 n=9+8) GoParse-6 5.49ms ± 0% 5.47ms ± 1% ~ (p=0.101 n=8+10) RegexpMatchEasy0_32-6 130ns ± 0% 128ns ± 0% -1.54% (p=0.002 n=8+10) RegexpMatchEasy0_1K-6 322ns ± 1% 322ns ± 0% ~ (p=0.525 
n=10+9) RegexpMatchEasy1_32-6 124ns ± 0% 124ns ± 0% -0.32% (p=0.046 n=8+10) RegexpMatchEasy1_1K-6 570ns ± 0% 548ns ± 1% -3.76% (p=0.000 n=10+10) RegexpMatchMedium_32-6 196ns ± 0% 183ns ± 1% -6.61% (p=0.000 n=8+10) RegexpMatchMedium_1K-6 64.3µs ± 0% 59.0µs ± 1% -8.31% (p=0.000 n=8+10) RegexpMatchHard_32-6 3.08µs ± 0% 2.80µs ± 0% -8.96% (p=0.000 n=8+9) RegexpMatchHard_1K-6 93.0µs ± 0% 84.5µs ± 1% -9.17% (p=0.000 n=8+9) Revcomp-6 647ms ± 2% 646ms ± 1% ~ (p=0.720 n=10+9) Template-6 92.3ms ± 0% 91.7ms ± 0% -0.65% (p=0.000 n=8+8) TimeParse-6 490ns ± 0% 488ns ± 0% -0.43% (p=0.000 n=10+10) TimeFormat-6 513ns ± 0% 513ns ± 1% ~ (p=0.144 n=9+10) [Geo mean] 79.1µs 77.7µs -1.73% name old speed new speed delta GobDecode-6 70.1MB/s ± 1% 70.3MB/s ± 0% ~ (p=0.078 n=10+9) GobEncode-6 89.2MB/s ± 1% 89.7MB/s ± 0% +0.52% (p=0.004 n=10+10) Gzip-6 51.4MB/s ± 1% 50.2MB/s ± 1% -2.23% (p=0.000 n=10+10) Gunzip-6 304MB/s ± 0% 311MB/s ± 0% +2.27% (p=0.000 n=9+8) JSONEncode-6 95.8MB/s ± 1% 97.7MB/s ± 0% +1.93% (p=0.000 n=9+10) JSONDecode-6 23.0MB/s ± 0% 23.2MB/s ± 0% +0.76% (p=0.000 n=8+8) GoParse-6 10.6MB/s ± 0% 10.6MB/s ± 1% ~ (p=0.111 n=8+10) RegexpMatchEasy0_32-6 244MB/s ± 0% 249MB/s ± 0% +2.06% (p=0.000 n=9+10) RegexpMatchEasy0_1K-6 3.18GB/s ± 1% 3.17GB/s ± 0% ~ (p=0.211 n=10+9) RegexpMatchEasy1_32-6 257MB/s ± 0% 258MB/s ± 0% +0.37% (p=0.000 n=8+8) RegexpMatchEasy1_1K-6 1.80GB/s ± 0% 1.87GB/s ± 1% +3.91% (p=0.000 n=10+10) RegexpMatchMedium_32-6 5.08MB/s ± 0% 5.43MB/s ± 1% +7.03% (p=0.000 n=8+10) RegexpMatchMedium_1K-6 15.9MB/s ± 0% 17.4MB/s ± 1% +9.08% (p=0.000 n=8+10) RegexpMatchHard_32-6 10.4MB/s ± 0% 11.4MB/s ± 0% +9.82% (p=0.000 n=8+9) RegexpMatchHard_1K-6 11.0MB/s ± 0% 12.1MB/s ± 1% +10.10% (p=0.000 n=8+9) Revcomp-6 393MB/s ± 2% 394MB/s ± 1% ~ (p=0.720 n=10+9) Template-6 21.0MB/s ± 0% 21.2MB/s ± 0% +0.66% (p=0.000 n=8+8) [Geo mean] 74.2MB/s 76.2MB/s +2.70% Updates #21851 Change-Id: Ie88455db925f422a828f8528293790726a9c036b Reviewed-on: https://go-review.googlesource.com/65491 
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> Reviewed-by: Daniel Martí <mvdan@mvdan.cc> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
2017-09-22 15:15:23 -05:00
"regexp": {
"(*bitState).push",
},
"math/big": {
"bigEndianWord",
// The following functions require the math_big_pure_go build tag.
"addVW",
"subVW",
},
"math/rand": {
"(*rngSource).Int63",
"(*rngSource).Uint64",
},
"net": {
"(*UDPConn).ReadFromUDP",
},
cmd/compile: allow more inlining of functions that construct closures [This is a roll-forward of CL 479095, which was reverted due to a bad interaction between inlining and escape analysis since fixed in CL 482355.] Currently, when the inliner is determining if a function is inlineable, it descends into the bodies of closures constructed by that function. This has several unfortunate consequences: - If the closure contains a disallowed operation (e.g., a defer), then the outer function can't be inlined. It makes sense that the *closure* can't be inlined in this case, but it doesn't make sense to punish the function that constructs the closure. - The hairiness of the closure counts against the inlining budget of the outer function. Since we currently copy the closure body when inlining the outer function, this makes sense from the perspective of export data size and binary size, but ultimately doesn't make much sense from the perspective of what should be inlineable. - Since the inliner walks into every closure created by an outer function in addition to starting a walk at every closure, this adds an n^2 factor to inlinability analysis. This CL simply drops this behavior. In std, this makes 57 more functions inlinable, and disallows inlining for 10 (due to the basic instability of our bottom-up inlining approach), for an net increase of 47 inlinable functions (+0.6%). This will help significantly with the performance of the functions to be added for #56102, which have a somewhat complicated nesting of closures with a performance-critical fast path. The downside of this seems to be a potential increase in export data and text size, but the practical impact of this seems to be negligible: │ before │ after │ │ bytes │ bytes vs base │ Go/binary 15.12Mi ± 0% 15.14Mi ± 0% +0.16% (n=1) Go/text 5.220Mi ± 0% 5.237Mi ± 0% +0.32% (n=1) Compile/binary 22.92Mi ± 0% 22.94Mi ± 0% +0.07% (n=1) Compile/text 8.428Mi ± 0% 8.435Mi ± 0% +0.08% (n=1) Updates #56102. 
Change-Id: I1f4fc96c71609c8feb59fecdb92b69ba7e3b5b41 Reviewed-on: https://go-review.googlesource.com/c/go/+/482356 Reviewed-by: Cuong Manh Le <cuong.manhle.vn@gmail.com> Run-TryBot: Than McIntosh <thanm@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> TryBot-Result: Gopher Robot <gobot@golang.org>
2023-04-04 18:31:46 -04:00
"sync": {
// Both OnceFunc and its returned closure need to be inlinable so
// that the returned closure can be inlined into the caller of OnceFunc.
"OnceFunc",
"OnceFunc.func2", // The returned closure.
// TODO(austin): It would be good to check OnceValue and OnceValues,
// too, but currently they aren't reported because they have type
// parameters and aren't instantiated in sync.
},
"sync/atomic": {
// (*Bool).CompareAndSwap handled below.
"(*Bool).Load",
"(*Bool).Store",
"(*Bool).Swap",
"(*Int32).Add",
"(*Int32).CompareAndSwap",
"(*Int32).Load",
"(*Int32).Store",
"(*Int32).Swap",
"(*Int64).Add",
"(*Int64).CompareAndSwap",
"(*Int64).Load",
"(*Int64).Store",
"(*Int64).Swap",
"(*Uint32).Add",
"(*Uint32).CompareAndSwap",
"(*Uint32).Load",
"(*Uint32).Store",
"(*Uint32).Swap",
"(*Uint64).Add",
"(*Uint64).CompareAndSwap",
"(*Uint64).Load",
"(*Uint64).Store",
"(*Uint64).Swap",
"(*Uintptr).Add",
"(*Uintptr).CompareAndSwap",
"(*Uintptr).Load",
"(*Uintptr).Store",
"(*Uintptr).Swap",
"(*Pointer[go.shape.int]).CompareAndSwap",
"(*Pointer[go.shape.int]).Load",
"(*Pointer[go.shape.int]).Store",
"(*Pointer[go.shape.int]).Swap",
},
}
if runtime.GOARCH != "386" && runtime.GOARCH != "loong64" && runtime.GOARCH != "mips64" && runtime.GOARCH != "mips64le" && runtime.GOARCH != "riscv64" {
// nextFreeFast calls sys.TrailingZeros64, which on 386 is implemented in asm and is not inlinable.
// We currently don't have midstack inlining so nextFreeFast is also not inlinable on 386.
// On loong64, mips64x and riscv64, TrailingZeros64 is not intrinsified, which makes nextFreeFast
// too expensive to inline (Issue 22239).
want["runtime"] = append(want["runtime"], "nextFreeFast")
// Same behavior for heapBits.nextFast.
want["runtime"] = append(want["runtime"], "heapBits.nextFast")
}
if runtime.GOARCH != "386" {
// As explained above, TrailingZeros64 and TrailingZeros32 are not Go code on 386.
// The same applies to Bswap32.
want["runtime/internal/sys"] = append(want["runtime/internal/sys"], "TrailingZeros64")
want["runtime/internal/sys"] = append(want["runtime/internal/sys"], "TrailingZeros32")
want["runtime/internal/sys"] = append(want["runtime/internal/sys"], "Bswap32")
}
if bits.UintSize == 64 {
runtime: using wyhash for memhashFallback on 64bit platform wyhash is a general hash function that: 1. About 8-70% faster that internal maphash 2. Passed Smhasher, BigCrush and PractRand tests name old time/op new time/op delta Hash5 28.9ns ± 0% 30.0ns ± 0% +3.77% (p=0.000 n=9+10) Hash16 32.4ns ± 0% 30.2ns ± 0% -6.74% (p=0.000 n=10+8) Hash64 52.4ns ± 0% 43.4ns ± 0% -17.20% (p=0.000 n=9+10) Hash1024 415ns ± 0% 258ns ± 2% -37.89% (p=0.000 n=10+10) Hash65536 24.9µs ± 0% 14.6µs ± 0% -41.22% (p=0.000 n=9+9) HashStringSpeed 50.2ns ± 4% 47.8ns ± 4% -4.88% (p=0.000 n=10+10) HashBytesSpeed 90.1ns ± 7% 78.3ns ± 4% -13.06% (p=0.000 n=10+10) HashInt32Speed 33.3ns ± 6% 33.6ns ± 4% ~ (p=0.071 n=10+10) HashInt64Speed 32.7ns ± 3% 34.0ns ± 3% +4.05% (p=0.000 n=9+10) HashStringArraySpeed 131ns ± 2% 117ns ± 5% -10.32% (p=0.000 n=9+10) FastrandHashiter 72.2ns ± 1% 75.7ns ±10% +4.87% (p=0.019 n=8+10) name old speed new speed delta Hash5 173MB/s ± 0% 167MB/s ± 0% -3.63% (p=0.000 n=9+10) Hash16 494MB/s ± 0% 530MB/s ± 0% +7.23% (p=0.000 n=10+8) Hash64 1.22GB/s ± 0% 1.48GB/s ± 0% +20.77% (p=0.000 n=9+10) Hash1024 2.47GB/s ± 0% 3.97GB/s ± 2% +61.01% (p=0.000 n=8+10) Hash65536 2.64GB/s ± 0% 4.48GB/s ± 0% +70.13% (p=0.000 n=9+9) Change-Id: I76af4e2bc1995a18149d11983ea8a149c132865e Reviewed-on: https://go-review.googlesource.com/c/go/+/279612 Trust: Meng Zhuo <mzh@golangcn.org> Run-TryBot: Meng Zhuo <mzh@golangcn.org> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2021-04-08 16:08:55 +08:00
// mix is only defined on 64-bit architectures
want["runtime"] = append(want["runtime"], "mix")
// (*Bool).CompareAndSwap is just over budget on 32-bit systems (386, arm).
want["sync/atomic"] = append(want["sync/atomic"], "(*Bool).CompareAndSwap")
}
switch runtime.GOARCH {
case "386", "wasm", "arm":
default:
// TODO(mvdan): As explained in /test/inline_sync.go, some
// architectures don't have atomic intrinsics, so these go over
// the inlining budget. Move back to the main table once that
// problem is solved.
want["sync"] = []string{
"(*Mutex).Lock",
"(*Mutex).Unlock",
"(*RWMutex).RLock",
"(*RWMutex).RUnlock",
"(*Once).Do",
}
}
cmd/compile/internal/gc: inline autogenerated (*T).M wrappers Currently all inlining of autogenerated wrappers is disabled, because it causes build failures, when indexed export format is enabled. Turns out we can reenable it for common case of (*T).M wrappers. This fixes most performance degradation of 1.11 vs 1.10. encoding/binary: name old time/op new time/op delta ReadSlice1000Int32s-6 14.8µs ± 2% 11.5µs ± 2% -22.01% (p=0.000 n=10+10) WriteSlice1000Int32s-6 14.8µs ± 2% 11.7µs ± 2% -20.95% (p=0.000 n=10+10) bufio: name old time/op new time/op delta WriterFlush-6 32.4ns ± 1% 28.8ns ± 0% -11.17% (p=0.000 n=9+10) sort: SearchWrappers-6 231ns ± 1% 231ns ± 0% ~ (p=0.129 n=9+10) SortString1K-6 365µs ± 1% 298µs ± 1% -18.43% (p=0.000 n=9+10) SortString1K_Slice-6 274µs ± 2% 276µs ± 1% ~ (p=0.105 n=10+10) StableString1K-6 490µs ± 1% 373µs ± 1% -23.73% (p=0.000 n=10+10) SortInt1K-6 210µs ± 1% 142µs ± 1% -32.69% (p=0.000 n=10+10) StableInt1K-6 243µs ± 0% 151µs ± 1% -37.75% (p=0.000 n=10+10) StableInt1K_Slice-6 130µs ± 1% 130µs ± 0% ~ (p=0.237 n=10+8) SortInt64K-6 19.9ms ± 1% 13.5ms ± 1% -32.32% (p=0.000 n=10+10) SortInt64K_Slice-6 11.5ms ± 1% 11.5ms ± 1% ~ (p=0.912 n=10+10) StableInt64K-6 21.5ms ± 0% 13.5ms ± 1% -37.30% (p=0.000 n=9+10) Sort1e2-6 108µs ± 2% 83µs ± 3% -23.26% (p=0.000 n=10+10) Stable1e2-6 218µs ± 0% 161µs ± 1% -25.99% (p=0.000 n=8+9) Sort1e4-6 22.6ms ± 1% 16.8ms ± 0% -25.45% (p=0.000 n=10+7) Stable1e4-6 67.6ms ± 1% 49.7ms ± 0% -26.48% (p=0.000 n=10+10) Sort1e6-6 3.44s ± 0% 2.55s ± 1% -26.05% (p=0.000 n=8+9) Stable1e6-6 13.7s ± 0% 9.9s ± 1% -27.68% (p=0.000 n=8+10) Fixes #27621 Updates #25338 Change-Id: I6fe633202f63fa829a6ab849c44d7e45f8835dff Reviewed-on: https://go-review.googlesource.com/c/135697 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2018-09-17 14:08:03 -05:00
// Functions that must actually be inlined; they must have actual callers.
must := map[string]bool{
"compress/flate.byLiteral.Len": true,
"compress/flate.byLiteral.Less": true,
"compress/flate.byLiteral.Swap": true,
}
notInlinedReason := make(map[string]string)
pkgs := make([]string, 0, len(want))
for pname, fnames := range want {
pkgs = append(pkgs, pname)
for _, fname := range fnames {
fullName := pname + "." + fname
if _, ok := notInlinedReason[fullName]; ok {
t.Errorf("duplicate func: %s", fullName)
}
notInlinedReason[fullName] = "unknown reason"
}
}
args := append([]string{"build", "-gcflags=-m -m", "-tags=math_big_pure_go"}, pkgs...)
cmd := testenv.CleanCmdEnv(testenv.Command(t, testenv.GoToolPath(t), args...))
pr, pw := io.Pipe()
cmd.Stdout = pw
cmd.Stderr = pw
cmdErr := make(chan error, 1)
go func() {
cmdErr <- cmd.Run()
pw.Close()
}()
scanner := bufio.NewScanner(pr)
curPkg := ""
canInline := regexp.MustCompile(`: can inline ([^ ]*)`)
cmd/compile/internal/gc: inline autogenerated (*T).M wrappers Currently all inlining of autogenerated wrappers is disabled, because it causes build failures, when indexed export format is enabled. Turns out we can reenable it for common case of (*T).M wrappers. This fixes most performance degradation of 1.11 vs 1.10. encoding/binary: name old time/op new time/op delta ReadSlice1000Int32s-6 14.8µs ± 2% 11.5µs ± 2% -22.01% (p=0.000 n=10+10) WriteSlice1000Int32s-6 14.8µs ± 2% 11.7µs ± 2% -20.95% (p=0.000 n=10+10) bufio: name old time/op new time/op delta WriterFlush-6 32.4ns ± 1% 28.8ns ± 0% -11.17% (p=0.000 n=9+10) sort: SearchWrappers-6 231ns ± 1% 231ns ± 0% ~ (p=0.129 n=9+10) SortString1K-6 365µs ± 1% 298µs ± 1% -18.43% (p=0.000 n=9+10) SortString1K_Slice-6 274µs ± 2% 276µs ± 1% ~ (p=0.105 n=10+10) StableString1K-6 490µs ± 1% 373µs ± 1% -23.73% (p=0.000 n=10+10) SortInt1K-6 210µs ± 1% 142µs ± 1% -32.69% (p=0.000 n=10+10) StableInt1K-6 243µs ± 0% 151µs ± 1% -37.75% (p=0.000 n=10+10) StableInt1K_Slice-6 130µs ± 1% 130µs ± 0% ~ (p=0.237 n=10+8) SortInt64K-6 19.9ms ± 1% 13.5ms ± 1% -32.32% (p=0.000 n=10+10) SortInt64K_Slice-6 11.5ms ± 1% 11.5ms ± 1% ~ (p=0.912 n=10+10) StableInt64K-6 21.5ms ± 0% 13.5ms ± 1% -37.30% (p=0.000 n=9+10) Sort1e2-6 108µs ± 2% 83µs ± 3% -23.26% (p=0.000 n=10+10) Stable1e2-6 218µs ± 0% 161µs ± 1% -25.99% (p=0.000 n=8+9) Sort1e4-6 22.6ms ± 1% 16.8ms ± 0% -25.45% (p=0.000 n=10+7) Stable1e4-6 67.6ms ± 1% 49.7ms ± 0% -26.48% (p=0.000 n=10+10) Sort1e6-6 3.44s ± 0% 2.55s ± 1% -26.05% (p=0.000 n=8+9) Stable1e6-6 13.7s ± 0% 9.9s ± 1% -27.68% (p=0.000 n=8+10) Fixes #27621 Updates #25338 Change-Id: I6fe633202f63fa829a6ab849c44d7e45f8835dff Reviewed-on: https://go-review.googlesource.com/c/135697 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2018-09-17 14:08:03 -05:00
haveInlined := regexp.MustCompile(`: inlining call to ([^ ]*)`)
cannotInline := regexp.MustCompile(`: cannot inline ([^ ]*): (.*)`)
for scanner.Scan() {
line := scanner.Text()
if strings.HasPrefix(line, "# ") {
curPkg = line[2:]
continue
}
cmd/compile/internal/gc: inline autogenerated (*T).M wrappers Currently all inlining of autogenerated wrappers is disabled, because it causes build failures, when indexed export format is enabled. Turns out we can reenable it for common case of (*T).M wrappers. This fixes most performance degradation of 1.11 vs 1.10. encoding/binary: name old time/op new time/op delta ReadSlice1000Int32s-6 14.8µs ± 2% 11.5µs ± 2% -22.01% (p=0.000 n=10+10) WriteSlice1000Int32s-6 14.8µs ± 2% 11.7µs ± 2% -20.95% (p=0.000 n=10+10) bufio: name old time/op new time/op delta WriterFlush-6 32.4ns ± 1% 28.8ns ± 0% -11.17% (p=0.000 n=9+10) sort: SearchWrappers-6 231ns ± 1% 231ns ± 0% ~ (p=0.129 n=9+10) SortString1K-6 365µs ± 1% 298µs ± 1% -18.43% (p=0.000 n=9+10) SortString1K_Slice-6 274µs ± 2% 276µs ± 1% ~ (p=0.105 n=10+10) StableString1K-6 490µs ± 1% 373µs ± 1% -23.73% (p=0.000 n=10+10) SortInt1K-6 210µs ± 1% 142µs ± 1% -32.69% (p=0.000 n=10+10) StableInt1K-6 243µs ± 0% 151µs ± 1% -37.75% (p=0.000 n=10+10) StableInt1K_Slice-6 130µs ± 1% 130µs ± 0% ~ (p=0.237 n=10+8) SortInt64K-6 19.9ms ± 1% 13.5ms ± 1% -32.32% (p=0.000 n=10+10) SortInt64K_Slice-6 11.5ms ± 1% 11.5ms ± 1% ~ (p=0.912 n=10+10) StableInt64K-6 21.5ms ± 0% 13.5ms ± 1% -37.30% (p=0.000 n=9+10) Sort1e2-6 108µs ± 2% 83µs ± 3% -23.26% (p=0.000 n=10+10) Stable1e2-6 218µs ± 0% 161µs ± 1% -25.99% (p=0.000 n=8+9) Sort1e4-6 22.6ms ± 1% 16.8ms ± 0% -25.45% (p=0.000 n=10+7) Stable1e4-6 67.6ms ± 1% 49.7ms ± 0% -26.48% (p=0.000 n=10+10) Sort1e6-6 3.44s ± 0% 2.55s ± 1% -26.05% (p=0.000 n=8+9) Stable1e6-6 13.7s ± 0% 9.9s ± 1% -27.68% (p=0.000 n=8+10) Fixes #27621 Updates #25338 Change-Id: I6fe633202f63fa829a6ab849c44d7e45f8835dff Reviewed-on: https://go-review.googlesource.com/c/135697 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2018-09-17 14:08:03 -05:00
if m := haveInlined.FindStringSubmatch(line); m != nil {
fname := m[1]
delete(notInlinedReason, curPkg+"."+fname)
continue
}
cmd/compile/internal/gc: inline autogenerated (*T).M wrappers Currently all inlining of autogenerated wrappers is disabled, because it causes build failures, when indexed export format is enabled. Turns out we can reenable it for common case of (*T).M wrappers. This fixes most performance degradation of 1.11 vs 1.10. encoding/binary: name old time/op new time/op delta ReadSlice1000Int32s-6 14.8µs ± 2% 11.5µs ± 2% -22.01% (p=0.000 n=10+10) WriteSlice1000Int32s-6 14.8µs ± 2% 11.7µs ± 2% -20.95% (p=0.000 n=10+10) bufio: name old time/op new time/op delta WriterFlush-6 32.4ns ± 1% 28.8ns ± 0% -11.17% (p=0.000 n=9+10) sort: SearchWrappers-6 231ns ± 1% 231ns ± 0% ~ (p=0.129 n=9+10) SortString1K-6 365µs ± 1% 298µs ± 1% -18.43% (p=0.000 n=9+10) SortString1K_Slice-6 274µs ± 2% 276µs ± 1% ~ (p=0.105 n=10+10) StableString1K-6 490µs ± 1% 373µs ± 1% -23.73% (p=0.000 n=10+10) SortInt1K-6 210µs ± 1% 142µs ± 1% -32.69% (p=0.000 n=10+10) StableInt1K-6 243µs ± 0% 151µs ± 1% -37.75% (p=0.000 n=10+10) StableInt1K_Slice-6 130µs ± 1% 130µs ± 0% ~ (p=0.237 n=10+8) SortInt64K-6 19.9ms ± 1% 13.5ms ± 1% -32.32% (p=0.000 n=10+10) SortInt64K_Slice-6 11.5ms ± 1% 11.5ms ± 1% ~ (p=0.912 n=10+10) StableInt64K-6 21.5ms ± 0% 13.5ms ± 1% -37.30% (p=0.000 n=9+10) Sort1e2-6 108µs ± 2% 83µs ± 3% -23.26% (p=0.000 n=10+10) Stable1e2-6 218µs ± 0% 161µs ± 1% -25.99% (p=0.000 n=8+9) Sort1e4-6 22.6ms ± 1% 16.8ms ± 0% -25.45% (p=0.000 n=10+7) Stable1e4-6 67.6ms ± 1% 49.7ms ± 0% -26.48% (p=0.000 n=10+10) Sort1e6-6 3.44s ± 0% 2.55s ± 1% -26.05% (p=0.000 n=8+9) Stable1e6-6 13.7s ± 0% 9.9s ± 1% -27.68% (p=0.000 n=8+10) Fixes #27621 Updates #25338 Change-Id: I6fe633202f63fa829a6ab849c44d7e45f8835dff Reviewed-on: https://go-review.googlesource.com/c/135697 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2018-09-17 14:08:03 -05:00
if m := canInline.FindStringSubmatch(line); m != nil {
fname := m[1]
fullname := curPkg + "." + fname
// If function must be inlined somewhere, being inlinable is not enough
cmd/compile/internal/gc: inline autogenerated (*T).M wrappers Currently all inlining of autogenerated wrappers is disabled, because it causes build failures, when indexed export format is enabled. Turns out we can reenable it for common case of (*T).M wrappers. This fixes most performance degradation of 1.11 vs 1.10. encoding/binary: name old time/op new time/op delta ReadSlice1000Int32s-6 14.8µs ± 2% 11.5µs ± 2% -22.01% (p=0.000 n=10+10) WriteSlice1000Int32s-6 14.8µs ± 2% 11.7µs ± 2% -20.95% (p=0.000 n=10+10) bufio: name old time/op new time/op delta WriterFlush-6 32.4ns ± 1% 28.8ns ± 0% -11.17% (p=0.000 n=9+10) sort: SearchWrappers-6 231ns ± 1% 231ns ± 0% ~ (p=0.129 n=9+10) SortString1K-6 365µs ± 1% 298µs ± 1% -18.43% (p=0.000 n=9+10) SortString1K_Slice-6 274µs ± 2% 276µs ± 1% ~ (p=0.105 n=10+10) StableString1K-6 490µs ± 1% 373µs ± 1% -23.73% (p=0.000 n=10+10) SortInt1K-6 210µs ± 1% 142µs ± 1% -32.69% (p=0.000 n=10+10) StableInt1K-6 243µs ± 0% 151µs ± 1% -37.75% (p=0.000 n=10+10) StableInt1K_Slice-6 130µs ± 1% 130µs ± 0% ~ (p=0.237 n=10+8) SortInt64K-6 19.9ms ± 1% 13.5ms ± 1% -32.32% (p=0.000 n=10+10) SortInt64K_Slice-6 11.5ms ± 1% 11.5ms ± 1% ~ (p=0.912 n=10+10) StableInt64K-6 21.5ms ± 0% 13.5ms ± 1% -37.30% (p=0.000 n=9+10) Sort1e2-6 108µs ± 2% 83µs ± 3% -23.26% (p=0.000 n=10+10) Stable1e2-6 218µs ± 0% 161µs ± 1% -25.99% (p=0.000 n=8+9) Sort1e4-6 22.6ms ± 1% 16.8ms ± 0% -25.45% (p=0.000 n=10+7) Stable1e4-6 67.6ms ± 1% 49.7ms ± 0% -26.48% (p=0.000 n=10+10) Sort1e6-6 3.44s ± 0% 2.55s ± 1% -26.05% (p=0.000 n=8+9) Stable1e6-6 13.7s ± 0% 9.9s ± 1% -27.68% (p=0.000 n=8+10) Fixes #27621 Updates #25338 Change-Id: I6fe633202f63fa829a6ab849c44d7e45f8835dff Reviewed-on: https://go-review.googlesource.com/c/135697 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2018-09-17 14:08:03 -05:00
if _, ok := must[fullname]; !ok {
delete(notInlinedReason, fullname)
continue
}
}
if m := cannotInline.FindStringSubmatch(line); m != nil {
fname, reason := m[1], m[2]
fullName := curPkg + "." + fname
if _, ok := notInlinedReason[fullName]; ok {
// cmd/compile gave us a reason why
notInlinedReason[fullName] = reason
}
continue
}
}
if err := <-cmdErr; err != nil {
t.Fatal(err)
}
if err := scanner.Err(); err != nil {
t.Fatal(err)
}
for fullName, reason := range notInlinedReason {
t.Errorf("%s was not inlined: %s", fullName, reason)
}
}
// collectInlCands scans msgs one line at a time and returns the set of
// names that appear on "can inline" lines, i.e. lines of the form
// "<position> can inline <name> ...". Lines that do not match are ignored.
func collectInlCands(msgs string) map[string]struct{} {
	canInline := regexp.MustCompile(`^\S+\s+can\s+inline\s+(\S+)`)
	cands := map[string]struct{}{}
	// Walk msgs newline by newline; the final segment (after the last
	// newline, possibly empty) is examined as well.
	for rest, more := msgs, true; more; {
		var line string
		line, rest, more = strings.Cut(rest, "\n")
		if sub := canInline.FindStringSubmatch(line); sub != nil {
			cands[sub[1]] = struct{}{}
		}
	}
	return cands
}
// TestIssue56044 checks that every function the compiler reports as an
// inline candidate when building the runtime normally is also reported as
// an inline candidate when the runtime is built with each coverage mode.
func TestIssue56044(t *testing.T) {
	if testing.Short() {
		t.Skipf("skipping test: too long for short mode")
	}
	if !goexperiment.CoverageRedesign {
		t.Skipf("skipping new coverage tests (experiment not enabled)")
	}

	testenv.MustHaveGoBuild(t)

	// inlCands builds the runtime with "-m" for the runtime package, plus
	// any extra build flags, and returns the inline candidates found in the
	// combined compiler output.
	inlCands := func(extra ...string) map[string]struct{} {
		args := []string{"build", "-gcflags=runtime=-m"}
		args = append(args, extra...)
		args = append(args, "runtime")
		cmd := testenv.Command(t, testenv.GoToolPath(t), args...)
		b, err := cmd.CombinedOutput()
		if err != nil {
			t.Fatalf("build failed for %q (%v): %s", args, err, b)
		}
		return collectInlCands(string(b))
	}

	// The baseline build does not depend on the coverage mode, so run it
	// once rather than once per mode.
	mbase := inlCands()

	modes := []string{"-covermode=set", "-covermode=atomic"}
	for _, mode := range modes {
		// Redo the build with coverage enabled, also with "-m".
		mcov := inlCands(mode)

		// Make sure that there aren't any functions that are marked
		// as inline candidates at base but not with coverage.
		for k := range mbase {
			if _, ok := mcov[k]; !ok {
				t.Errorf("error: did not find %s in coverage -m output (%s)", k, mode)
			}
		}
	}
}