go/src/cmd/compile/internal/ssa/compile.go

516 lines
15 KiB
Go
Raw Normal View History

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ssa
import (
"cmd/internal/objabi"
"cmd/internal/src"
"fmt"
"hash/crc32"
"log"
"math/rand"
"os"
"regexp"
"runtime"
"strings"
"time"
)
// Compile is the main entry point for this package.
// Compile modifies f so that on return:
// · all Values in f map to 0 or 1 assembly instructions of the target architecture
// · the order of f.Blocks is the order to emit the Blocks
// · the order of b.Values is the order to emit the Values in each Block
// · f has a non-nil regAlloc field
func Compile(f *Func) {
// TODO: debugging - set flags to control verbosity of compiler,
// which phases to dump IR before/after, etc.
if f.Log() {
f.Logf("compiling %s\n", f.Name)
}
var rnd *rand.Rand
if checkEnabled {
rnd = rand.New(rand.NewSource(int64(crc32.ChecksumIEEE(([]byte)(f.Name)))))
}
// hook to print function & phase if panic happens
phaseName := "init"
defer func() {
if phaseName != "" {
err := recover()
stack := make([]byte, 16384)
n := runtime.Stack(stack, false)
stack = stack[:n]
f.Fatalf("panic during %s while compiling %s:\n\n%v\n\n%s\n", phaseName, f.Name, err, stack)
}
}()
// Run all the passes
if f.Log() {
printFunc(f)
}
f.HTMLWriter.WriteFunc("start", "start", f)
if BuildDump != "" && BuildDump == f.Name {
f.dumpFile("build")
}
if checkEnabled {
checkFunc(f)
}
const logMemStats = false
for _, p := range passes {
if !f.Config.optimize && !p.required || p.disabled {
continue
}
f.pass = &p
phaseName = p.name
if f.Log() {
f.Logf(" pass %s begin\n", p.name)
}
// TODO: capture logging during this pass, add it to the HTML
var mStart runtime.MemStats
if logMemStats || p.mem {
runtime.ReadMemStats(&mStart)
}
if checkEnabled && !f.scheduled {
// Test that we don't depend on the value order, by randomizing
// the order of values in each block. See issue 18169.
for _, b := range f.Blocks {
for i := 0; i < len(b.Values)-1; i++ {
j := i + rnd.Intn(len(b.Values)-i)
b.Values[i], b.Values[j] = b.Values[j], b.Values[i]
}
}
}
tStart := time.Now()
p.fn(f)
tEnd := time.Now()
// Need something less crude than "Log the whole intermediate result".
if f.Log() || f.HTMLWriter != nil {
time := tEnd.Sub(tStart).Nanoseconds()
var stats string
if logMemStats {
var mEnd runtime.MemStats
runtime.ReadMemStats(&mEnd)
nBytes := mEnd.TotalAlloc - mStart.TotalAlloc
nAllocs := mEnd.Mallocs - mStart.Mallocs
stats = fmt.Sprintf("[%d ns %d allocs %d bytes]", time, nAllocs, nBytes)
} else {
stats = fmt.Sprintf("[%d ns]", time)
}
if f.Log() {
f.Logf(" pass %s end %s\n", p.name, stats)
printFunc(f)
}
f.HTMLWriter.WriteFunc(phaseName, fmt.Sprintf("%s <span class=\"stats\">%s</span>", phaseName, stats), f)
}
if p.time || p.mem {
// Surround timing information w/ enough context to allow comparisons.
time := tEnd.Sub(tStart).Nanoseconds()
if p.time {
cmd/compile: use sparse algorithm for phis in large program This adds a sparse method for locating nearest ancestors in a dominator tree, and checks blocks with more than one predecessor for differences and inserts phi functions where there are. Uses reversed post order to cut number of passes, running it from first def to last use ("last use" for paramout and mem is end-of-program; last use for a phi input from a backedge is the source of the back edge) Includes a cutover from old algorithm to new to avoid paying large constant factor for small programs. This keeps normal builds running at about the same time, while not running over-long on large machine-generated inputs. Add "phase" flags for ssa/build -- ssa/build/stats prints number of blocks, values (before and after linking references and inserting phis, so expansion can be measured), and their product; the product governs the cutover, where a good value seems to be somewhere between 1 and 5 million. Among the files compiled by make.bash, this is the shape of the tail of the distribution for #blocks, #vars, and their product: #blocks #vars product max 6171 28180 173,898,780 99.9% 1641 6548 10,401,878 99% 463 1909 873,721 95% 152 639 95,235 90% 84 359 30,021 The old algorithm is indeed usually fastest, for 99%ile values of usually. The fix to LookupVarOutgoing ( https://go-review.googlesource.com/#/c/22790/ ) deals with some of the same problems addressed by this CL, but on at least one bug ( #15537 ) this change is still a significant help. With this CL: /tmp/gopath$ rm -rf pkg bin /tmp/gopath$ time go get -v -gcflags -memprofile=y.mprof \ github.com/gogo/protobuf/test/theproto3/combos/... ... real 4m35.200s user 13m16.644s sys 0m36.712s and pprof reports 3.4GB allocated in one of the larger profiles With tip: /tmp/gopath$ rm -rf pkg bin /tmp/gopath$ time go get -v -gcflags -memprofile=y.mprof \ github.com/gogo/protobuf/test/theproto3/combos/... ... 
real 10m36.569s user 25m52.286s sys 4m3.696s and pprof reports 8.3GB allocated in the same larger profile With this CL, most of the compilation time on the benchmarked input is spent in register/stack allocation (cumulative 53%) and in the sparse lookup algorithm itself (cumulative 20%). Fixes #15537. Change-Id: Ia0299dda6a291534d8b08e5f9883216ded677a00 Reviewed-on: https://go-review.googlesource.com/22342 Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: David Chase <drchase@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-04-21 13:24:58 -04:00
f.LogStat("TIME(ns)", time)
}
if p.mem {
var mEnd runtime.MemStats
runtime.ReadMemStats(&mEnd)
nBytes := mEnd.TotalAlloc - mStart.TotalAlloc
nAllocs := mEnd.Mallocs - mStart.Mallocs
cmd/compile: use sparse algorithm for phis in large program This adds a sparse method for locating nearest ancestors in a dominator tree, and checks blocks with more than one predecessor for differences and inserts phi functions where there are. Uses reversed post order to cut number of passes, running it from first def to last use ("last use" for paramout and mem is end-of-program; last use for a phi input from a backedge is the source of the back edge) Includes a cutover from old algorithm to new to avoid paying large constant factor for small programs. This keeps normal builds running at about the same time, while not running over-long on large machine-generated inputs. Add "phase" flags for ssa/build -- ssa/build/stats prints number of blocks, values (before and after linking references and inserting phis, so expansion can be measured), and their product; the product governs the cutover, where a good value seems to be somewhere between 1 and 5 million. Among the files compiled by make.bash, this is the shape of the tail of the distribution for #blocks, #vars, and their product: #blocks #vars product max 6171 28180 173,898,780 99.9% 1641 6548 10,401,878 99% 463 1909 873,721 95% 152 639 95,235 90% 84 359 30,021 The old algorithm is indeed usually fastest, for 99%ile values of usually. The fix to LookupVarOutgoing ( https://go-review.googlesource.com/#/c/22790/ ) deals with some of the same problems addressed by this CL, but on at least one bug ( #15537 ) this change is still a significant help. With this CL: /tmp/gopath$ rm -rf pkg bin /tmp/gopath$ time go get -v -gcflags -memprofile=y.mprof \ github.com/gogo/protobuf/test/theproto3/combos/... ... real 4m35.200s user 13m16.644s sys 0m36.712s and pprof reports 3.4GB allocated in one of the larger profiles With tip: /tmp/gopath$ rm -rf pkg bin /tmp/gopath$ time go get -v -gcflags -memprofile=y.mprof \ github.com/gogo/protobuf/test/theproto3/combos/... ... 
real 10m36.569s user 25m52.286s sys 4m3.696s and pprof reports 8.3GB allocated in the same larger profile With this CL, most of the compilation time on the benchmarked input is spent in register/stack allocation (cumulative 53%) and in the sparse lookup algorithm itself (cumulative 20%). Fixes #15537. Change-Id: Ia0299dda6a291534d8b08e5f9883216ded677a00 Reviewed-on: https://go-review.googlesource.com/22342 Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: David Chase <drchase@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-04-21 13:24:58 -04:00
f.LogStat("TIME(ns):BYTES:ALLOCS", time, nBytes, nAllocs)
}
}
if p.dump != nil && p.dump[f.Name] {
// Dump function to appropriately named file
f.dumpFile(phaseName)
}
if checkEnabled {
checkFunc(f)
}
}
// Squash error printing defer
phaseName = ""
}
// dumpFileSeq is a counter that dumpFile increments on every call and
// embeds in the name of the dump file it creates.
// TODO: should be a config field
var dumpFileSeq int
// dumpFile creates a file from the phase name and function name and
// prints f into it. The file name has the form
// <function>_<seq>__<phase>.dump, where <seq> is the package-level
// dumpFileSeq counter after incrementing it.
// Dumping is done to files to avoid buffering huge strings before
// output.
// If the file cannot be created, a warning is issued through f.Warnl and
// nothing is written.
func (f *Func) dumpFile(phaseName string) {
	dumpFileSeq++
	fname := fmt.Sprintf("%s_%02d__%s.dump", f.Name, dumpFileSeq, phaseName)
	// Blanks, slashes and colons in the function or phase name are
	// replaced by underscores, all in one pass over the name.
	fname = strings.NewReplacer(" ", "_", "/", "_", ":", "_").Replace(fname)

	fi, err := os.Create(fname)
	if err != nil {
		f.Warnl(src.NoXPos, "Unable to create after-phase dump file %s: %v", fname, err)
		return
	}
	// Deferred so the file is closed even if printing panics.
	defer fi.Close()

	p := stringFuncPrinter{w: fi}
	fprintFunc(p, f)
}
// pass describes one phase run by Compile together with the per-phase
// options that control whether it runs and what is reported about it.
type pass struct {
name string // phase name; used in log output and as the dump file suffix
fn func(*Func) // function that Compile calls to run the pass
required bool // run the pass even when f.Config.optimize is false
disabled bool // Compile skips the pass when set
time bool // report time to run pass
mem bool // report mem stats to run pass
stats int // pass reports own "stats" (e.g., branches removed)
debug int // pass performs some debugging. =1 should be in error-testing-friendly Warnl format.
test int // pass-specific ad-hoc option, perhaps useful in development
dump map[string]bool // dump if function name matches
}
// addDump records s in the pass's dump set, allocating the set on first use.
func (p *pass) addDump(s string) {
	if p.dump != nil {
		p.dump[s] = true
		return
	}
	p.dump = map[string]bool{s: true}
}
// Run consistency checker between each phase.
// Toggled by the ssa/check/on and ssa/check/off phase options.
var checkEnabled = false

// Debug output
var (
	// Set by the ssa/intrinsics/debug, ssa/intrinsics/on and
	// ssa/intrinsics/off phase options.
	IntrinsicsDebug   int
	IntrinsicsDisable bool

	// Set by the ssa/build/debug, ssa/build/test, ssa/build/stats and
	// ssa/build/dump phase options.
	BuildDebug int
	BuildTest  int
	BuildStats int
	BuildDump  string // name of function to dump after initial build of ssa
)
// PhaseOption sets the specified flag in the specified ssa phase,
// returning empty string if this was successful or a string explaining
// the error if it was not.
// A version of the phase name with "_" replaced by " " is also checked for a match.
// If the phase name begins with a '~' then the rest of the underscores-replaced-with-blanks
// version is used as a regular expression to match the phase name(s).
//
// Special cases that have turned out to be useful:
// ssa/check/on enables checking after each phase
// ssa/all/time enables time reporting for all phases
//
// See gc/lex.go for dissection of the option string.
// Example uses:
//
// GO_GCFLAGS=-d=ssa/generic_cse/time,ssa/generic_cse/stats,ssa/generic_cse/debug=3 ./make.bash
//
// BOOT_GO_GCFLAGS=-d='ssa/~^.*scc$/off' GO_GCFLAGS='-d=ssa/~^.*scc$/off' ./make.bash
//
func PhaseOption(phase, flag string, val int, valString string) string {
if phase == "help" {
lastcr := 0
cmd/compile: better formatting for ssa phases options doc Change the help doc of go tool compile -d=ssa/help from this: compile: GcFlag -d=ssa/<phase>/<flag>[=<value>|<function_name>] <phase> is one of: check, all, build, intrinsics, early_phielim, early_copyelim early_deadcode, short_circuit, decompose_user, opt, zero_arg_cse opt_deadcode, generic_cse, phiopt, nilcheckelim, prove, loopbce decompose_builtin, softfloat, late_opt, generic_deadcode, check_bce fuse, dse, writebarrier, insert_resched_checks, tighten, lower lowered_cse, elim_unread_autos, lowered_deadcode, checkLower late_phielim, late_copyelim, phi_tighten, late_deadcode, critical likelyadjust, layout, schedule, late_nilcheck, flagalloc, regalloc loop_rotate, stackframe, trim <flag> is one of on, off, debug, mem, time, test, stats, dump <value> defaults to 1 <function_name> is required for "dump", specifies name of function to dump after <phase> Except for dump, output is directed to standard out; dump appears in a file. Phase "all" supports flags "time", "mem", and "dump". Phases "intrinsics" supports flags "on", "off", and "debug". Interpretation of the "debug" value depends on the phase. Dump files are named <phase>__<function_name>_<seq>.dump. 
To this: compile: PhaseOptions usage: go tool compile -d=ssa/<phase>/<flag>[=<value>|<function_name>] where: - <phase> is one of: check, all, build, intrinsics, early_phielim, early_copyelim early_deadcode, short_circuit, decompose_user, opt, zero_arg_cse opt_deadcode, generic_cse, phiopt, nilcheckelim, prove decompose_builtin, softfloat, late_opt, generic_deadcode, check_bce branchelim, fuse, dse, writebarrier, insert_resched_checks, lower lowered_cse, elim_unread_autos, lowered_deadcode, checkLower late_phielim, late_copyelim, tighten, phi_tighten, late_deadcode critical, likelyadjust, layout, schedule, late_nilcheck, flagalloc regalloc, loop_rotate, stackframe, trim - <flag> is one of: on, off, debug, mem, time, test, stats, dump - <value> defaults to 1 - <function_name> is required for the "dump" flag, and specifies the name of function to dump after <phase> Phase "all" supports flags "time", "mem", and "dump". Phase "intrinsics" supports flags "on", "off", and "debug". If the "dump" flag is specified, the output is written on a file named <phase>__<function_name>_<seq>.dump; otherwise it is directed to stdout. Also add a few examples at the bottom. Fixes #20349 Change-Id: I334799e951e7b27855b3ace5d2d966c4d6ec4cff Reviewed-on: https://go-review.googlesource.com/110062 Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2018-04-29 14:57:30 +02:00
phasenames := " check, all, build, intrinsics"
for _, p := range passes {
pn := strings.Replace(p.name, " ", "_", -1)
if len(pn)+len(phasenames)-lastcr > 70 {
cmd/compile: better formatting for ssa phases options doc Change the help doc of go tool compile -d=ssa/help from this: compile: GcFlag -d=ssa/<phase>/<flag>[=<value>|<function_name>] <phase> is one of: check, all, build, intrinsics, early_phielim, early_copyelim early_deadcode, short_circuit, decompose_user, opt, zero_arg_cse opt_deadcode, generic_cse, phiopt, nilcheckelim, prove, loopbce decompose_builtin, softfloat, late_opt, generic_deadcode, check_bce fuse, dse, writebarrier, insert_resched_checks, tighten, lower lowered_cse, elim_unread_autos, lowered_deadcode, checkLower late_phielim, late_copyelim, phi_tighten, late_deadcode, critical likelyadjust, layout, schedule, late_nilcheck, flagalloc, regalloc loop_rotate, stackframe, trim <flag> is one of on, off, debug, mem, time, test, stats, dump <value> defaults to 1 <function_name> is required for "dump", specifies name of function to dump after <phase> Except for dump, output is directed to standard out; dump appears in a file. Phase "all" supports flags "time", "mem", and "dump". Phases "intrinsics" supports flags "on", "off", and "debug". Interpretation of the "debug" value depends on the phase. Dump files are named <phase>__<function_name>_<seq>.dump. 
To this: compile: PhaseOptions usage: go tool compile -d=ssa/<phase>/<flag>[=<value>|<function_name>] where: - <phase> is one of: check, all, build, intrinsics, early_phielim, early_copyelim early_deadcode, short_circuit, decompose_user, opt, zero_arg_cse opt_deadcode, generic_cse, phiopt, nilcheckelim, prove decompose_builtin, softfloat, late_opt, generic_deadcode, check_bce branchelim, fuse, dse, writebarrier, insert_resched_checks, lower lowered_cse, elim_unread_autos, lowered_deadcode, checkLower late_phielim, late_copyelim, tighten, phi_tighten, late_deadcode critical, likelyadjust, layout, schedule, late_nilcheck, flagalloc regalloc, loop_rotate, stackframe, trim - <flag> is one of: on, off, debug, mem, time, test, stats, dump - <value> defaults to 1 - <function_name> is required for the "dump" flag, and specifies the name of function to dump after <phase> Phase "all" supports flags "time", "mem", and "dump". Phase "intrinsics" supports flags "on", "off", and "debug". If the "dump" flag is specified, the output is written on a file named <phase>__<function_name>_<seq>.dump; otherwise it is directed to stdout. Also add a few examples at the bottom. Fixes #20349 Change-Id: I334799e951e7b27855b3ace5d2d966c4d6ec4cff Reviewed-on: https://go-review.googlesource.com/110062 Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2018-04-29 14:57:30 +02:00
phasenames += "\n "
lastcr = len(phasenames)
phasenames += pn
} else {
phasenames += ", " + pn
}
}
cmd/compile: better formatting for ssa phases options doc Change the help doc of go tool compile -d=ssa/help from this: compile: GcFlag -d=ssa/<phase>/<flag>[=<value>|<function_name>] <phase> is one of: check, all, build, intrinsics, early_phielim, early_copyelim early_deadcode, short_circuit, decompose_user, opt, zero_arg_cse opt_deadcode, generic_cse, phiopt, nilcheckelim, prove, loopbce decompose_builtin, softfloat, late_opt, generic_deadcode, check_bce fuse, dse, writebarrier, insert_resched_checks, tighten, lower lowered_cse, elim_unread_autos, lowered_deadcode, checkLower late_phielim, late_copyelim, phi_tighten, late_deadcode, critical likelyadjust, layout, schedule, late_nilcheck, flagalloc, regalloc loop_rotate, stackframe, trim <flag> is one of on, off, debug, mem, time, test, stats, dump <value> defaults to 1 <function_name> is required for "dump", specifies name of function to dump after <phase> Except for dump, output is directed to standard out; dump appears in a file. Phase "all" supports flags "time", "mem", and "dump". Phases "intrinsics" supports flags "on", "off", and "debug". Interpretation of the "debug" value depends on the phase. Dump files are named <phase>__<function_name>_<seq>.dump. 
To this: compile: PhaseOptions usage: go tool compile -d=ssa/<phase>/<flag>[=<value>|<function_name>] where: - <phase> is one of: check, all, build, intrinsics, early_phielim, early_copyelim early_deadcode, short_circuit, decompose_user, opt, zero_arg_cse opt_deadcode, generic_cse, phiopt, nilcheckelim, prove decompose_builtin, softfloat, late_opt, generic_deadcode, check_bce branchelim, fuse, dse, writebarrier, insert_resched_checks, lower lowered_cse, elim_unread_autos, lowered_deadcode, checkLower late_phielim, late_copyelim, tighten, phi_tighten, late_deadcode critical, likelyadjust, layout, schedule, late_nilcheck, flagalloc regalloc, loop_rotate, stackframe, trim - <flag> is one of: on, off, debug, mem, time, test, stats, dump - <value> defaults to 1 - <function_name> is required for the "dump" flag, and specifies the name of function to dump after <phase> Phase "all" supports flags "time", "mem", and "dump". Phase "intrinsics" supports flags "on", "off", and "debug". If the "dump" flag is specified, the output is written on a file named <phase>__<function_name>_<seq>.dump; otherwise it is directed to stdout. Also add a few examples at the bottom. Fixes #20349 Change-Id: I334799e951e7b27855b3ace5d2d966c4d6ec4cff Reviewed-on: https://go-review.googlesource.com/110062 Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2018-04-29 14:57:30 +02:00
return `PhaseOptions usage:
go tool compile -d=ssa/<phase>/<flag>[=<value>|<function_name>]
where:
- <phase> is one of:
` + phasenames + `
cmd/compile: better formatting for ssa phases options doc Change the help doc of go tool compile -d=ssa/help from this: compile: GcFlag -d=ssa/<phase>/<flag>[=<value>|<function_name>] <phase> is one of: check, all, build, intrinsics, early_phielim, early_copyelim early_deadcode, short_circuit, decompose_user, opt, zero_arg_cse opt_deadcode, generic_cse, phiopt, nilcheckelim, prove, loopbce decompose_builtin, softfloat, late_opt, generic_deadcode, check_bce fuse, dse, writebarrier, insert_resched_checks, tighten, lower lowered_cse, elim_unread_autos, lowered_deadcode, checkLower late_phielim, late_copyelim, phi_tighten, late_deadcode, critical likelyadjust, layout, schedule, late_nilcheck, flagalloc, regalloc loop_rotate, stackframe, trim <flag> is one of on, off, debug, mem, time, test, stats, dump <value> defaults to 1 <function_name> is required for "dump", specifies name of function to dump after <phase> Except for dump, output is directed to standard out; dump appears in a file. Phase "all" supports flags "time", "mem", and "dump". Phases "intrinsics" supports flags "on", "off", and "debug". Interpretation of the "debug" value depends on the phase. Dump files are named <phase>__<function_name>_<seq>.dump. 
To this: compile: PhaseOptions usage: go tool compile -d=ssa/<phase>/<flag>[=<value>|<function_name>] where: - <phase> is one of: check, all, build, intrinsics, early_phielim, early_copyelim early_deadcode, short_circuit, decompose_user, opt, zero_arg_cse opt_deadcode, generic_cse, phiopt, nilcheckelim, prove decompose_builtin, softfloat, late_opt, generic_deadcode, check_bce branchelim, fuse, dse, writebarrier, insert_resched_checks, lower lowered_cse, elim_unread_autos, lowered_deadcode, checkLower late_phielim, late_copyelim, tighten, phi_tighten, late_deadcode critical, likelyadjust, layout, schedule, late_nilcheck, flagalloc regalloc, loop_rotate, stackframe, trim - <flag> is one of: on, off, debug, mem, time, test, stats, dump - <value> defaults to 1 - <function_name> is required for the "dump" flag, and specifies the name of function to dump after <phase> Phase "all" supports flags "time", "mem", and "dump". Phase "intrinsics" supports flags "on", "off", and "debug". If the "dump" flag is specified, the output is written on a file named <phase>__<function_name>_<seq>.dump; otherwise it is directed to stdout. Also add a few examples at the bottom. Fixes #20349 Change-Id: I334799e951e7b27855b3ace5d2d966c4d6ec4cff Reviewed-on: https://go-review.googlesource.com/110062 Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2018-04-29 14:57:30 +02:00
- <flag> is one of:
on, off, debug, mem, time, test, stats, dump
- <value> defaults to 1
- <function_name> is required for the "dump" flag, and specifies the
name of function to dump after <phase>
Phase "all" supports flags "time", "mem", and "dump".
cmd/compile: better formatting for ssa phases options doc Change the help doc of go tool compile -d=ssa/help from this: compile: GcFlag -d=ssa/<phase>/<flag>[=<value>|<function_name>] <phase> is one of: check, all, build, intrinsics, early_phielim, early_copyelim early_deadcode, short_circuit, decompose_user, opt, zero_arg_cse opt_deadcode, generic_cse, phiopt, nilcheckelim, prove, loopbce decompose_builtin, softfloat, late_opt, generic_deadcode, check_bce fuse, dse, writebarrier, insert_resched_checks, tighten, lower lowered_cse, elim_unread_autos, lowered_deadcode, checkLower late_phielim, late_copyelim, phi_tighten, late_deadcode, critical likelyadjust, layout, schedule, late_nilcheck, flagalloc, regalloc loop_rotate, stackframe, trim <flag> is one of on, off, debug, mem, time, test, stats, dump <value> defaults to 1 <function_name> is required for "dump", specifies name of function to dump after <phase> Except for dump, output is directed to standard out; dump appears in a file. Phase "all" supports flags "time", "mem", and "dump". Phases "intrinsics" supports flags "on", "off", and "debug". Interpretation of the "debug" value depends on the phase. Dump files are named <phase>__<function_name>_<seq>.dump. 
To this: compile: PhaseOptions usage: go tool compile -d=ssa/<phase>/<flag>[=<value>|<function_name>] where: - <phase> is one of: check, all, build, intrinsics, early_phielim, early_copyelim early_deadcode, short_circuit, decompose_user, opt, zero_arg_cse opt_deadcode, generic_cse, phiopt, nilcheckelim, prove decompose_builtin, softfloat, late_opt, generic_deadcode, check_bce branchelim, fuse, dse, writebarrier, insert_resched_checks, lower lowered_cse, elim_unread_autos, lowered_deadcode, checkLower late_phielim, late_copyelim, tighten, phi_tighten, late_deadcode critical, likelyadjust, layout, schedule, late_nilcheck, flagalloc regalloc, loop_rotate, stackframe, trim - <flag> is one of: on, off, debug, mem, time, test, stats, dump - <value> defaults to 1 - <function_name> is required for the "dump" flag, and specifies the name of function to dump after <phase> Phase "all" supports flags "time", "mem", and "dump". Phase "intrinsics" supports flags "on", "off", and "debug". If the "dump" flag is specified, the output is written on a file named <phase>__<function_name>_<seq>.dump; otherwise it is directed to stdout. Also add a few examples at the bottom. Fixes #20349 Change-Id: I334799e951e7b27855b3ace5d2d966c4d6ec4cff Reviewed-on: https://go-review.googlesource.com/110062 Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2018-04-29 14:57:30 +02:00
Phase "intrinsics" supports flags "on", "off", and "debug".
If the "dump" flag is specified, the output is written on a file named
<phase>__<function_name>_<seq>.dump; otherwise it is directed to stdout.
Examples:
-d=ssa/check/on
enables checking after each phase
-d=ssa/all/time
enables time reporting for all phases
-d=ssa/prove/debug=2
sets debugging level to 2 in the prove pass
Multiple flags can be passed at once, by separating them with
commas. For example:
-d=ssa/check/on,ssa/all/time
`
}
if phase == "check" && flag == "on" {
checkEnabled = val != 0
return ""
}
if phase == "check" && flag == "off" {
checkEnabled = val == 0
return ""
}
alltime := false
allmem := false
alldump := false
if phase == "all" {
if flag == "time" {
alltime = val != 0
} else if flag == "mem" {
allmem = val != 0
} else if flag == "dump" {
alldump = val != 0
if alldump {
BuildDump = valString
}
} else {
return fmt.Sprintf("Did not find a flag matching %s in -d=ssa/%s debug option", flag, phase)
}
}
if phase == "intrinsics" {
switch flag {
case "on":
IntrinsicsDisable = val == 0
case "off":
IntrinsicsDisable = val != 0
case "debug":
IntrinsicsDebug = val
default:
return fmt.Sprintf("Did not find a flag matching %s in -d=ssa/%s debug option", flag, phase)
}
return ""
}
cmd/compile: use sparse algorithm for phis in large program This adds a sparse method for locating nearest ancestors in a dominator tree, and checks blocks with more than one predecessor for differences and inserts phi functions where there are. Uses reversed post order to cut number of passes, running it from first def to last use ("last use" for paramout and mem is end-of-program; last use for a phi input from a backedge is the source of the back edge) Includes a cutover from old algorithm to new to avoid paying large constant factor for small programs. This keeps normal builds running at about the same time, while not running over-long on large machine-generated inputs. Add "phase" flags for ssa/build -- ssa/build/stats prints number of blocks, values (before and after linking references and inserting phis, so expansion can be measured), and their product; the product governs the cutover, where a good value seems to be somewhere between 1 and 5 million. Among the files compiled by make.bash, this is the shape of the tail of the distribution for #blocks, #vars, and their product: #blocks #vars product max 6171 28180 173,898,780 99.9% 1641 6548 10,401,878 99% 463 1909 873,721 95% 152 639 95,235 90% 84 359 30,021 The old algorithm is indeed usually fastest, for 99%ile values of usually. The fix to LookupVarOutgoing ( https://go-review.googlesource.com/#/c/22790/ ) deals with some of the same problems addressed by this CL, but on at least one bug ( #15537 ) this change is still a significant help. With this CL: /tmp/gopath$ rm -rf pkg bin /tmp/gopath$ time go get -v -gcflags -memprofile=y.mprof \ github.com/gogo/protobuf/test/theproto3/combos/... ... real 4m35.200s user 13m16.644s sys 0m36.712s and pprof reports 3.4GB allocated in one of the larger profiles With tip: /tmp/gopath$ rm -rf pkg bin /tmp/gopath$ time go get -v -gcflags -memprofile=y.mprof \ github.com/gogo/protobuf/test/theproto3/combos/... ... 
real 10m36.569s user 25m52.286s sys 4m3.696s and pprof reports 8.3GB allocated in the same larger profile With this CL, most of the compilation time on the benchmarked input is spent in register/stack allocation (cumulative 53%) and in the sparse lookup algorithm itself (cumulative 20%). Fixes #15537. Change-Id: Ia0299dda6a291534d8b08e5f9883216ded677a00 Reviewed-on: https://go-review.googlesource.com/22342 Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: David Chase <drchase@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-04-21 13:24:58 -04:00
if phase == "build" {
switch flag {
case "debug":
BuildDebug = val
case "test":
BuildTest = val
case "stats":
BuildStats = val
case "dump":
BuildDump = valString
cmd/compile: use sparse algorithm for phis in large program This adds a sparse method for locating nearest ancestors in a dominator tree, and checks blocks with more than one predecessor for differences and inserts phi functions where there are. Uses reversed post order to cut number of passes, running it from first def to last use ("last use" for paramout and mem is end-of-program; last use for a phi input from a backedge is the source of the back edge) Includes a cutover from old algorithm to new to avoid paying large constant factor for small programs. This keeps normal builds running at about the same time, while not running over-long on large machine-generated inputs. Add "phase" flags for ssa/build -- ssa/build/stats prints number of blocks, values (before and after linking references and inserting phis, so expansion can be measured), and their product; the product governs the cutover, where a good value seems to be somewhere between 1 and 5 million. Among the files compiled by make.bash, this is the shape of the tail of the distribution for #blocks, #vars, and their product: #blocks #vars product max 6171 28180 173,898,780 99.9% 1641 6548 10,401,878 99% 463 1909 873,721 95% 152 639 95,235 90% 84 359 30,021 The old algorithm is indeed usually fastest, for 99%ile values of usually. The fix to LookupVarOutgoing ( https://go-review.googlesource.com/#/c/22790/ ) deals with some of the same problems addressed by this CL, but on at least one bug ( #15537 ) this change is still a significant help. With this CL: /tmp/gopath$ rm -rf pkg bin /tmp/gopath$ time go get -v -gcflags -memprofile=y.mprof \ github.com/gogo/protobuf/test/theproto3/combos/... ... real 4m35.200s user 13m16.644s sys 0m36.712s and pprof reports 3.4GB allocated in one of the larger profiles With tip: /tmp/gopath$ rm -rf pkg bin /tmp/gopath$ time go get -v -gcflags -memprofile=y.mprof \ github.com/gogo/protobuf/test/theproto3/combos/... ... 
real 10m36.569s user 25m52.286s sys 4m3.696s and pprof reports 8.3GB allocated in the same larger profile With this CL, most of the compilation time on the benchmarked input is spent in register/stack allocation (cumulative 53%) and in the sparse lookup algorithm itself (cumulative 20%). Fixes #15537. Change-Id: Ia0299dda6a291534d8b08e5f9883216ded677a00 Reviewed-on: https://go-review.googlesource.com/22342 Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: David Chase <drchase@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-04-21 13:24:58 -04:00
default:
return fmt.Sprintf("Did not find a flag matching %s in -d=ssa/%s debug option", flag, phase)
}
return ""
}
underphase := strings.Replace(phase, "_", " ", -1)
var re *regexp.Regexp
if phase[0] == '~' {
r, ok := regexp.Compile(underphase[1:])
if ok != nil {
return fmt.Sprintf("Error %s in regexp for phase %s, flag %s", ok.Error(), phase, flag)
}
re = r
}
matchedOne := false
for i, p := range passes {
if phase == "all" {
p.time = alltime
p.mem = allmem
if alldump {
p.addDump(valString)
}
passes[i] = p
matchedOne = true
} else if p.name == phase || p.name == underphase || re != nil && re.MatchString(p.name) {
switch flag {
case "on":
p.disabled = val == 0
case "off":
p.disabled = val != 0
case "time":
p.time = val != 0
case "mem":
p.mem = val != 0
case "debug":
p.debug = val
case "stats":
p.stats = val
case "test":
p.test = val
case "dump":
p.addDump(valString)
default:
return fmt.Sprintf("Did not find a flag matching %s in -d=ssa/%s debug option", flag, phase)
}
if p.disabled && p.required {
return fmt.Sprintf("Cannot disable required SSA phase %s using -d=ssa/%s debug option", phase, phase)
}
passes[i] = p
matchedOne = true
}
}
if matchedOne {
return ""
}
return fmt.Sprintf("Did not find a phase matching %s in -d=ssa/... debug option", phase)
}
// passes is the list of SSA passes that Compile runs, in order.
var passes = [...]pass{
// TODO: combine phielim and copyelim into a single pass?
cmd/compile: assign and preserve statement boundaries. A new pass run after ssa building (before any other optimization) identifies the "first" ssa node for each statement. Other "noise" nodes are tagged as being never appropriate for a statement boundary (e.g., VarKill, VarDef, Phi). Rewrite, deadcode, cse, and nilcheck are modified to move the statement boundaries forward whenever possible if a boundary-tagged ssa value is removed; never-boundary nodes are ignored in this search (some operations involving constants are also tagged as never-boundary and also ignored because they are likely to be moved or removed during optimization). Code generation treats all nodes except those explicitly marked as statement boundaries as "not statement" nodes, and floats statement boundaries to the beginning of each same-line run of instructions found within a basic block. Line number html conversion was modified to make statement boundary nodes a bit more obvious by prepending a "+". The code in fuse.go that glued together the value slices of two blocks produced a result that depended on the former capacities (not lengths) of the two slices. This causes differences in the 386 bootstrap, and also can sometimes put values into an order that does a worse job of preserving statement boundaries when values are removed. Portions of two delve tests that had caught problems were incorporated into ssa/debug_test.go. There are some opportunities to do better with optimized code, but the next-ing is not lying or overly jumpy. Over 4 CLs, compilebench geomean measured binary size increase of 3.5% and compile user time increase of 3.8% (this is after optimization to reuse a sparse map instead of creating multiple maps.) This CL worsens the optimized-debugging experience with Delve; we need to work with the delve team so that they can use the is_stmt marks that we're emitting now. 
The reference output changes from time to time depending on other changes in the compiler, sometimes better, sometimes worse. This CL now includes a test ensuring that 99+% of the lines in the Go command itself (a handy optimized binary) include is_stmt markers. Change-Id: I359c94e06843f1eb41f9da437bd614885aa9644a Reviewed-on: https://go-review.googlesource.com/102435 Run-TryBot: David Chase <drchase@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Austin Clements <austin@google.com>
2018-03-23 22:46:06 -04:00
{name: "number lines", fn: numberLines, required: true},
{name: "early phielim", fn: phielim},
{name: "early copyelim", fn: copyelim},
{name: "early deadcode", fn: deadcode}, // remove generated dead code to avoid doing pointless work during opt
{name: "short circuit", fn: shortcircuit},
{name: "decompose args", fn: decomposeArgs, required: true},
{name: "decompose user", fn: decomposeUser, required: true},
{name: "opt", fn: opt, required: true}, // TODO: split required rules and optimizing rules
{name: "zero arg cse", fn: zcse, required: true}, // required to merge OpSB values
{name: "opt deadcode", fn: deadcode, required: true}, // remove any blocks orphaned during opt
{name: "generic cse", fn: cse},
{name: "phiopt", fn: phiopt},
{name: "nilcheckelim", fn: nilcheckelim},
{name: "prove", fn: prove},
cmd/compile: fuse before branchelim The branchelim pass works better after fuse. Running fuse before branchelim also increases the stability of generated code amidst other compiler changes, which was the original motivation behind this change. The fuse pass is not cheap enough to run in its entirety before branchelim, but the most important half of it is. This change makes it possible to run "plain fuse" independently and does so before branchelim. During make.bash, elimIf occurrences increase from 4244 to 4288 (1%), and elimIfElse occurrences increase from 989 to 1079 (9%). Toolspeed impact is marginal; plain fuse pays for itself. name old time/op new time/op delta Template 189ms ± 2% 189ms ± 2% ~ (p=0.890 n=45+46) Unicode 93.2ms ± 5% 93.4ms ± 7% ~ (p=0.790 n=48+48) GoTypes 662ms ± 4% 660ms ± 4% ~ (p=0.186 n=48+49) Compiler 2.89s ± 4% 2.91s ± 3% +0.89% (p=0.050 n=49+44) SSA 8.23s ± 2% 8.21s ± 1% ~ (p=0.165 n=46+44) Flate 123ms ± 4% 123ms ± 3% +0.58% (p=0.031 n=47+49) GoParser 154ms ± 4% 154ms ± 4% ~ (p=0.492 n=49+48) Reflect 430ms ± 4% 429ms ± 4% ~ (p=1.000 n=48+48) Tar 171ms ± 3% 170ms ± 4% ~ (p=0.122 n=48+48) XML 232ms ± 3% 232ms ± 2% ~ (p=0.850 n=46+49) [Geo mean] 394ms 394ms +0.02% name old user-time/op new user-time/op delta Template 236ms ± 5% 236ms ± 4% ~ (p=0.934 n=50+50) Unicode 132ms ± 7% 130ms ± 9% ~ (p=0.087 n=50+50) GoTypes 861ms ± 3% 867ms ± 4% ~ (p=0.124 n=48+50) Compiler 3.93s ± 4% 3.94s ± 3% ~ (p=0.584 n=49+44) SSA 12.2s ± 2% 12.3s ± 1% ~ (p=0.610 n=46+45) Flate 149ms ± 4% 150ms ± 4% ~ (p=0.194 n=48+49) GoParser 193ms ± 5% 191ms ± 6% ~ (p=0.239 n=49+50) Reflect 553ms ± 5% 556ms ± 5% ~ (p=0.091 n=49+49) Tar 218ms ± 5% 218ms ± 5% ~ (p=0.359 n=49+50) XML 299ms ± 5% 298ms ± 4% ~ (p=0.482 n=50+49) [Geo mean] 516ms 516ms -0.01% name old alloc/op new alloc/op delta Template 36.3MB ± 0% 36.3MB ± 0% -0.02% (p=0.000 n=49+49) Unicode 29.7MB ± 0% 29.7MB ± 0% ~ (p=0.270 n=50+50) GoTypes 126MB ± 0% 126MB ± 0% -0.34% (p=0.000 n=50+49) Compiler 534MB ± 0% 531MB ± 
0% -0.50% (p=0.000 n=50+50) SSA 1.98GB ± 0% 1.98GB ± 0% -0.06% (p=0.000 n=49+49) Flate 24.6MB ± 0% 24.6MB ± 0% -0.29% (p=0.000 n=50+50) GoParser 29.5MB ± 0% 29.4MB ± 0% -0.15% (p=0.000 n=49+50) Reflect 87.3MB ± 0% 87.2MB ± 0% -0.13% (p=0.000 n=49+50) Tar 35.6MB ± 0% 35.5MB ± 0% -0.17% (p=0.000 n=50+50) XML 48.2MB ± 0% 48.0MB ± 0% -0.30% (p=0.000 n=48+50) [Geo mean] 83.1MB 82.9MB -0.20% name old allocs/op new allocs/op delta Template 352k ± 0% 352k ± 0% -0.01% (p=0.004 n=49+49) Unicode 341k ± 0% 341k ± 0% ~ (p=0.341 n=48+50) GoTypes 1.28M ± 0% 1.28M ± 0% -0.03% (p=0.000 n=50+49) Compiler 4.96M ± 0% 4.96M ± 0% -0.05% (p=0.000 n=50+49) SSA 15.5M ± 0% 15.5M ± 0% -0.01% (p=0.000 n=50+49) Flate 233k ± 0% 233k ± 0% +0.01% (p=0.032 n=49+49) GoParser 294k ± 0% 294k ± 0% ~ (p=0.052 n=46+48) Reflect 1.04M ± 0% 1.04M ± 0% ~ (p=0.171 n=50+47) Tar 343k ± 0% 343k ± 0% -0.03% (p=0.000 n=50+50) XML 429k ± 0% 429k ± 0% -0.04% (p=0.000 n=50+50) [Geo mean] 812k 812k -0.02% Object files grow slightly; branchelim often increases binary size, at least on amd64. name old object-bytes new object-bytes delta Template 509kB ± 0% 509kB ± 0% -0.01% (p=0.008 n=5+5) Unicode 224kB ± 0% 224kB ± 0% ~ (all equal) GoTypes 1.84MB ± 0% 1.84MB ± 0% +0.00% (p=0.008 n=5+5) Compiler 6.71MB ± 0% 6.71MB ± 0% +0.01% (p=0.008 n=5+5) SSA 21.2MB ± 0% 21.2MB ± 0% +0.01% (p=0.008 n=5+5) Flate 324kB ± 0% 324kB ± 0% -0.00% (p=0.008 n=5+5) GoParser 404kB ± 0% 404kB ± 0% -0.02% (p=0.008 n=5+5) Reflect 1.40MB ± 0% 1.40MB ± 0% +0.09% (p=0.008 n=5+5) Tar 452kB ± 0% 452kB ± 0% +0.06% (p=0.008 n=5+5) XML 596kB ± 0% 596kB ± 0% +0.00% (p=0.008 n=5+5) [Geo mean] 1.04MB 1.04MB +0.01% Change-Id: I535c711b85380ff657fc0f022bebd9cb14ddd07f Reviewed-on: https://go-review.googlesource.com/c/129378 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-05-27 09:03:45 -07:00
{name: "fuse plain", fn: fusePlain},
{name: "decompose builtin", fn: decomposeBuiltIn, required: true},
{name: "softfloat", fn: softfloat, required: true},
{name: "late opt", fn: opt, required: true}, // TODO: split required rules and optimizing rules
cmd/compile: add some generic composite type optimizations Propagate values through some wide Zero/Move operations. Among other things this allows us to optimize some kinds of array initialization. For example, the following code no longer requires a temporary be allocated on the stack. Instead it writes the values directly into the return value. func f(i uint32) [4]uint32 { return [4]uint32{i, i+1, i+2, i+3} } The return value is unnecessarily cleared but removing that is probably a task for dead store analysis (I think it needs to be able to match multiple Store ops to wide Zero ops). In order to reliably remove stack variables that are rendered unnecessary by these new rules I've added a new generic version of the unread autos elimination pass. These rules are triggered more than 5000 times when building and testing the standard library. Updates #15925 (fixes for arrays of up to 4 elements). Updates #24386 (fixes for up to 4 kept elements). Updates #24416. compilebench results: name old time/op new time/op delta Template 353ms ± 5% 359ms ± 3% ~ (p=0.143 n=10+10) Unicode 219ms ± 1% 217ms ± 4% ~ (p=0.740 n=7+10) GoTypes 1.26s ± 1% 1.26s ± 2% ~ (p=0.549 n=9+10) Compiler 6.00s ± 1% 6.08s ± 1% +1.42% (p=0.000 n=9+8) SSA 15.3s ± 2% 15.6s ± 1% +2.43% (p=0.000 n=10+10) Flate 237ms ± 2% 240ms ± 2% +1.31% (p=0.015 n=10+10) GoParser 285ms ± 1% 285ms ± 1% ~ (p=0.878 n=8+8) Reflect 797ms ± 3% 807ms ± 2% ~ (p=0.065 n=9+10) Tar 334ms ± 0% 335ms ± 4% ~ (p=0.460 n=8+10) XML 419ms ± 0% 423ms ± 1% +0.91% (p=0.001 n=7+9) StdCmd 46.0s ± 0% 46.4s ± 0% +0.85% (p=0.000 n=9+9) name old user-time/op new user-time/op delta Template 337ms ± 3% 346ms ± 5% ~ (p=0.053 n=9+10) Unicode 205ms ±10% 205ms ± 8% ~ (p=1.000 n=10+10) GoTypes 1.22s ± 2% 1.21s ± 3% ~ (p=0.436 n=10+10) Compiler 5.85s ± 1% 5.93s ± 0% +1.46% (p=0.000 n=10+8) SSA 14.9s ± 1% 15.3s ± 1% +2.62% (p=0.000 n=10+10) Flate 229ms ± 4% 228ms ± 6% ~ (p=0.796 n=10+10) GoParser 271ms ± 3% 275ms ± 4% ~ (p=0.165 n=10+10) Reflect 779ms ± 
5% 775ms ± 2% ~ (p=0.971 n=10+10) Tar 317ms ± 4% 319ms ± 5% ~ (p=0.853 n=10+10) XML 404ms ± 4% 409ms ± 5% ~ (p=0.436 n=10+10) name old alloc/op new alloc/op delta Template 34.9MB ± 0% 35.0MB ± 0% +0.26% (p=0.000 n=10+10) Unicode 29.3MB ± 0% 29.3MB ± 0% +0.02% (p=0.000 n=10+10) GoTypes 115MB ± 0% 115MB ± 0% +0.30% (p=0.000 n=10+10) Compiler 519MB ± 0% 521MB ± 0% +0.30% (p=0.000 n=10+10) SSA 1.55GB ± 0% 1.57GB ± 0% +1.34% (p=0.000 n=10+9) Flate 24.1MB ± 0% 24.2MB ± 0% +0.10% (p=0.000 n=10+10) GoParser 28.1MB ± 0% 28.1MB ± 0% +0.07% (p=0.000 n=10+10) Reflect 78.7MB ± 0% 78.7MB ± 0% +0.03% (p=0.000 n=8+10) Tar 34.4MB ± 0% 34.5MB ± 0% +0.12% (p=0.000 n=10+10) XML 43.2MB ± 0% 43.2MB ± 0% +0.13% (p=0.000 n=10+10) name old allocs/op new allocs/op delta Template 330k ± 0% 330k ± 0% -0.01% (p=0.017 n=10+10) Unicode 337k ± 0% 337k ± 0% +0.01% (p=0.000 n=9+10) GoTypes 1.15M ± 0% 1.15M ± 0% +0.03% (p=0.000 n=10+10) Compiler 4.77M ± 0% 4.77M ± 0% +0.03% (p=0.000 n=9+10) SSA 12.5M ± 0% 12.6M ± 0% +1.16% (p=0.000 n=10+10) Flate 221k ± 0% 221k ± 0% +0.05% (p=0.000 n=9+10) GoParser 275k ± 0% 275k ± 0% +0.01% (p=0.014 n=10+9) Reflect 944k ± 0% 944k ± 0% -0.02% (p=0.000 n=10+10) Tar 324k ± 0% 323k ± 0% -0.12% (p=0.000 n=10+10) XML 384k ± 0% 384k ± 0% -0.01% (p=0.001 n=10+10) name old object-bytes new object-bytes delta Template 476kB ± 0% 476kB ± 0% -0.04% (p=0.000 n=10+10) Unicode 218kB ± 0% 218kB ± 0% ~ (all equal) GoTypes 1.58MB ± 0% 1.58MB ± 0% -0.04% (p=0.000 n=10+10) Compiler 6.25MB ± 0% 6.24MB ± 0% -0.09% (p=0.000 n=10+10) SSA 15.9MB ± 0% 16.1MB ± 0% +1.22% (p=0.000 n=10+10) Flate 304kB ± 0% 304kB ± 0% -0.13% (p=0.000 n=10+10) GoParser 370kB ± 0% 370kB ± 0% -0.00% (p=0.000 n=10+10) Reflect 1.27MB ± 0% 1.27MB ± 0% -0.12% (p=0.000 n=10+10) Tar 421kB ± 0% 419kB ± 0% -0.64% (p=0.000 n=10+10) XML 518kB ± 0% 517kB ± 0% -0.12% (p=0.000 n=10+10) name old export-bytes new export-bytes delta Template 16.7kB ± 0% 16.7kB ± 0% ~ (all equal) Unicode 6.52kB ± 0% 6.52kB ± 0% ~ (all equal) 
GoTypes 29.2kB ± 0% 29.2kB ± 0% ~ (all equal) Compiler 88.0kB ± 0% 88.0kB ± 0% ~ (all equal) SSA 109kB ± 0% 109kB ± 0% ~ (all equal) Flate 4.49kB ± 0% 4.49kB ± 0% ~ (all equal) GoParser 8.10kB ± 0% 8.10kB ± 0% ~ (all equal) Reflect 7.71kB ± 0% 7.71kB ± 0% ~ (all equal) Tar 9.15kB ± 0% 9.15kB ± 0% ~ (all equal) XML 12.3kB ± 0% 12.3kB ± 0% ~ (all equal) name old text-bytes new text-bytes delta HelloSize 676kB ± 0% 672kB ± 0% -0.59% (p=0.000 n=10+10) CmdGoSize 7.26MB ± 0% 7.24MB ± 0% -0.18% (p=0.000 n=10+10) name old data-bytes new data-bytes delta HelloSize 10.2kB ± 0% 10.2kB ± 0% ~ (all equal) CmdGoSize 248kB ± 0% 248kB ± 0% ~ (all equal) name old bss-bytes new bss-bytes delta HelloSize 125kB ± 0% 125kB ± 0% ~ (all equal) CmdGoSize 145kB ± 0% 145kB ± 0% ~ (all equal) name old exe-bytes new exe-bytes delta HelloSize 1.46MB ± 0% 1.45MB ± 0% -0.31% (p=0.000 n=10+10) CmdGoSize 14.7MB ± 0% 14.7MB ± 0% -0.17% (p=0.000 n=10+10) Change-Id: Ic72b0c189dd542f391e1c9ab88a76e9148dc4285 Reviewed-on: https://go-review.googlesource.com/106495 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-04-11 22:47:24 +01:00
{name: "dead auto elim", fn: elimDeadAutosGeneric},
{name: "generic deadcode", fn: deadcode, required: true}, // remove dead stores, which otherwise mess up store chain
{name: "check bce", fn: checkbce},
cmd/compile/internal/ssa: emit csel on arm64 Introduce a new SSA pass to generate CondSelect instructions, and add CondSelect lowering rules for arm64. In order to make the CSEL instruction easier to optimize, and to simplify the introduction of CSNEG, CSINC, and CSINV in the future, modify the CSEL instruction to accept a condition code in the aux field. Notably, this change makes the go1 Gzip benchmark more than 10% faster. Benchmarks on a Cavium ThunderX: name old time/op new time/op delta BinaryTree17-96 15.9s ± 6% 16.0s ± 4% ~ (p=0.968 n=10+9) Fannkuch11-96 7.17s ± 0% 7.00s ± 0% -2.43% (p=0.000 n=8+9) FmtFprintfEmpty-96 208ns ± 1% 207ns ± 0% ~ (p=0.152 n=10+8) FmtFprintfString-96 379ns ± 0% 375ns ± 0% -0.95% (p=0.000 n=10+9) FmtFprintfInt-96 385ns ± 0% 383ns ± 0% -0.52% (p=0.000 n=9+10) FmtFprintfIntInt-96 591ns ± 0% 586ns ± 0% -0.85% (p=0.006 n=7+9) FmtFprintfPrefixedInt-96 656ns ± 0% 667ns ± 0% +1.71% (p=0.000 n=10+10) FmtFprintfFloat-96 967ns ± 0% 984ns ± 0% +1.78% (p=0.000 n=10+10) FmtManyArgs-96 2.35µs ± 0% 2.25µs ± 0% -4.63% (p=0.000 n=9+8) GobDecode-96 31.0ms ± 0% 30.8ms ± 0% -0.36% (p=0.006 n=9+9) GobEncode-96 24.4ms ± 0% 24.5ms ± 0% +0.30% (p=0.000 n=9+9) Gzip-96 1.60s ± 0% 1.43s ± 0% -10.58% (p=0.000 n=9+10) Gunzip-96 167ms ± 0% 169ms ± 0% +0.83% (p=0.000 n=8+9) HTTPClientServer-96 311µs ± 1% 308µs ± 0% -0.75% (p=0.000 n=10+10) JSONEncode-96 65.0ms ± 0% 64.8ms ± 0% -0.25% (p=0.000 n=9+8) JSONDecode-96 262ms ± 1% 261ms ± 1% ~ (p=0.579 n=10+10) Mandelbrot200-96 18.0ms ± 0% 18.1ms ± 0% +0.17% (p=0.000 n=8+10) GoParse-96 14.0ms ± 0% 14.1ms ± 1% +0.42% (p=0.003 n=9+10) RegexpMatchEasy0_32-96 644ns ± 2% 645ns ± 2% ~ (p=0.836 n=10+10) RegexpMatchEasy0_1K-96 3.70µs ± 0% 3.49µs ± 0% -5.58% (p=0.000 n=10+10) RegexpMatchEasy1_32-96 662ns ± 2% 657ns ± 2% ~ (p=0.137 n=10+10) RegexpMatchEasy1_1K-96 4.47µs ± 0% 4.31µs ± 0% -3.48% (p=0.000 n=10+10) RegexpMatchMedium_32-96 844ns ± 2% 849ns ± 1% ~ (p=0.208 n=10+10) RegexpMatchMedium_1K-96 179µs ± 0% 182µs ± 0% +1.20%
(p=0.000 n=10+10) RegexpMatchHard_32-96 10.0µs ± 0% 10.1µs ± 0% +0.48% (p=0.000 n=10+9) RegexpMatchHard_1K-96 297µs ± 0% 297µs ± 0% -0.14% (p=0.000 n=10+10) Revcomp-96 3.08s ± 0% 3.13s ± 0% +1.56% (p=0.000 n=9+9) Template-96 276ms ± 2% 275ms ± 1% ~ (p=0.393 n=10+10) TimeParse-96 1.37µs ± 0% 1.36µs ± 0% -0.53% (p=0.000 n=10+7) TimeFormat-96 1.40µs ± 0% 1.42µs ± 0% +0.97% (p=0.000 n=10+10) [Geo mean] 264µs 262µs -0.77% Change-Id: Ie54eee4b3092af53e6da3baa6d1755098f57f3a2 Reviewed-on: https://go-review.googlesource.com/55670 Run-TryBot: Philip Hofer <phofer@umich.edu> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com> Reviewed-by: Keith Randall <khr@golang.org>
2017-08-13 22:36:47 +00:00
{name: "branchelim", fn: branchelim},
cmd/compile: fuse before branchelim The branchelim pass works better after fuse. Running fuse before branchelim also increases the stability of generated code amidst other compiler changes, which was the original motivation behind this change. The fuse pass is not cheap enough to run in its entirety before branchelim, but the most important half of it is. This change makes it possible to run "plain fuse" independently and does so before branchelim. During make.bash, elimIf occurrences increase from 4244 to 4288 (1%), and elimIfElse occurrences increase from 989 to 1079 (9%). Toolspeed impact is marginal; plain fuse pays for itself. name old time/op new time/op delta Template 189ms ± 2% 189ms ± 2% ~ (p=0.890 n=45+46) Unicode 93.2ms ± 5% 93.4ms ± 7% ~ (p=0.790 n=48+48) GoTypes 662ms ± 4% 660ms ± 4% ~ (p=0.186 n=48+49) Compiler 2.89s ± 4% 2.91s ± 3% +0.89% (p=0.050 n=49+44) SSA 8.23s ± 2% 8.21s ± 1% ~ (p=0.165 n=46+44) Flate 123ms ± 4% 123ms ± 3% +0.58% (p=0.031 n=47+49) GoParser 154ms ± 4% 154ms ± 4% ~ (p=0.492 n=49+48) Reflect 430ms ± 4% 429ms ± 4% ~ (p=1.000 n=48+48) Tar 171ms ± 3% 170ms ± 4% ~ (p=0.122 n=48+48) XML 232ms ± 3% 232ms ± 2% ~ (p=0.850 n=46+49) [Geo mean] 394ms 394ms +0.02% name old user-time/op new user-time/op delta Template 236ms ± 5% 236ms ± 4% ~ (p=0.934 n=50+50) Unicode 132ms ± 7% 130ms ± 9% ~ (p=0.087 n=50+50) GoTypes 861ms ± 3% 867ms ± 4% ~ (p=0.124 n=48+50) Compiler 3.93s ± 4% 3.94s ± 3% ~ (p=0.584 n=49+44) SSA 12.2s ± 2% 12.3s ± 1% ~ (p=0.610 n=46+45) Flate 149ms ± 4% 150ms ± 4% ~ (p=0.194 n=48+49) GoParser 193ms ± 5% 191ms ± 6% ~ (p=0.239 n=49+50) Reflect 553ms ± 5% 556ms ± 5% ~ (p=0.091 n=49+49) Tar 218ms ± 5% 218ms ± 5% ~ (p=0.359 n=49+50) XML 299ms ± 5% 298ms ± 4% ~ (p=0.482 n=50+49) [Geo mean] 516ms 516ms -0.01% name old alloc/op new alloc/op delta Template 36.3MB ± 0% 36.3MB ± 0% -0.02% (p=0.000 n=49+49) Unicode 29.7MB ± 0% 29.7MB ± 0% ~ (p=0.270 n=50+50) GoTypes 126MB ± 0% 126MB ± 0% -0.34% (p=0.000 n=50+49) Compiler 534MB ± 0% 531MB ± 
0% -0.50% (p=0.000 n=50+50) SSA 1.98GB ± 0% 1.98GB ± 0% -0.06% (p=0.000 n=49+49) Flate 24.6MB ± 0% 24.6MB ± 0% -0.29% (p=0.000 n=50+50) GoParser 29.5MB ± 0% 29.4MB ± 0% -0.15% (p=0.000 n=49+50) Reflect 87.3MB ± 0% 87.2MB ± 0% -0.13% (p=0.000 n=49+50) Tar 35.6MB ± 0% 35.5MB ± 0% -0.17% (p=0.000 n=50+50) XML 48.2MB ± 0% 48.0MB ± 0% -0.30% (p=0.000 n=48+50) [Geo mean] 83.1MB 82.9MB -0.20% name old allocs/op new allocs/op delta Template 352k ± 0% 352k ± 0% -0.01% (p=0.004 n=49+49) Unicode 341k ± 0% 341k ± 0% ~ (p=0.341 n=48+50) GoTypes 1.28M ± 0% 1.28M ± 0% -0.03% (p=0.000 n=50+49) Compiler 4.96M ± 0% 4.96M ± 0% -0.05% (p=0.000 n=50+49) SSA 15.5M ± 0% 15.5M ± 0% -0.01% (p=0.000 n=50+49) Flate 233k ± 0% 233k ± 0% +0.01% (p=0.032 n=49+49) GoParser 294k ± 0% 294k ± 0% ~ (p=0.052 n=46+48) Reflect 1.04M ± 0% 1.04M ± 0% ~ (p=0.171 n=50+47) Tar 343k ± 0% 343k ± 0% -0.03% (p=0.000 n=50+50) XML 429k ± 0% 429k ± 0% -0.04% (p=0.000 n=50+50) [Geo mean] 812k 812k -0.02% Object files grow slightly; branchelim often increases binary size, at least on amd64. name old object-bytes new object-bytes delta Template 509kB ± 0% 509kB ± 0% -0.01% (p=0.008 n=5+5) Unicode 224kB ± 0% 224kB ± 0% ~ (all equal) GoTypes 1.84MB ± 0% 1.84MB ± 0% +0.00% (p=0.008 n=5+5) Compiler 6.71MB ± 0% 6.71MB ± 0% +0.01% (p=0.008 n=5+5) SSA 21.2MB ± 0% 21.2MB ± 0% +0.01% (p=0.008 n=5+5) Flate 324kB ± 0% 324kB ± 0% -0.00% (p=0.008 n=5+5) GoParser 404kB ± 0% 404kB ± 0% -0.02% (p=0.008 n=5+5) Reflect 1.40MB ± 0% 1.40MB ± 0% +0.09% (p=0.008 n=5+5) Tar 452kB ± 0% 452kB ± 0% +0.06% (p=0.008 n=5+5) XML 596kB ± 0% 596kB ± 0% +0.00% (p=0.008 n=5+5) [Geo mean] 1.04MB 1.04MB +0.01% Change-Id: I535c711b85380ff657fc0f022bebd9cb14ddd07f Reviewed-on: https://go-review.googlesource.com/c/129378 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-05-27 09:03:45 -07:00
{name: "fuse", fn: fuseAll},
{name: "dse", fn: dse},
{name: "writebarrier", fn: writebarrier, required: true}, // expand write barrier ops
{name: "insert resched checks", fn: insertLoopReschedChecks,
disabled: objabi.Preemptibleloops_enabled == 0}, // insert resched checks in loops.
{name: "lower", fn: lower, required: true},
{name: "lowered cse", fn: cse},
{name: "elim unread autos", fn: elimUnreadAutos},
{name: "lowered deadcode", fn: deadcode, required: true},
{name: "checkLower", fn: checkLower, required: true},
{name: "late phielim", fn: phielim},
{name: "late copyelim", fn: copyelim},
cmd/compile: tighten after lowering Moving tighten after lowering benefits from the removal of values by lowering and lowered CSE. It lets us make better decisions about which values are rematerializable and which generate flags. Empirically, it lowers stack usage (by avoiding spills) and generates slightly smaller and faster binaries. Fixes #19853 Fixes #21041 name old time/op new time/op delta Template 195ms ± 4% 193ms ± 4% -1.33% (p=0.000 n=92+97) Unicode 94.1ms ± 9% 92.5ms ± 8% -1.66% (p=0.002 n=97+95) GoTypes 572ms ± 5% 566ms ± 7% -0.92% (p=0.001 n=95+98) Compiler 2.56s ± 4% 2.52s ± 3% -1.41% (p=0.000 n=94+97) SSA 6.52s ± 2% 6.47s ± 3% -0.82% (p=0.000 n=96+94) Flate 117ms ± 5% 116ms ± 7% -0.72% (p=0.018 n=97+97) GoParser 148ms ± 6% 146ms ± 4% -0.97% (p=0.002 n=98+95) Reflect 370ms ± 7% 363ms ± 6% -1.79% (p=0.000 n=99+98) Tar 175ms ± 6% 173ms ± 6% -1.11% (p=0.001 n=94+95) XML 204ms ± 6% 201ms ± 5% -1.49% (p=0.000 n=97+96) [Geo mean] 363ms 359ms -1.22% name old user-time/op new user-time/op delta Template 251ms ± 5% 245ms ± 5% -2.40% (p=0.000 n=97+93) Unicode 131ms ±10% 128ms ± 9% -1.93% (p=0.001 n=100+99) GoTypes 760ms ± 4% 752ms ± 4% -0.96% (p=0.000 n=97+95) Compiler 3.51s ± 3% 3.48s ± 2% -1.04% (p=0.000 n=96+95) SSA 9.57s ± 4% 9.52s ± 2% -0.50% (p=0.004 n=97+96) Flate 149ms ± 6% 147ms ± 6% -1.46% (p=0.000 n=98+96) GoParser 184ms ± 5% 181ms ± 7% -1.84% (p=0.000 n=98+97) Reflect 469ms ± 6% 461ms ± 6% -1.69% (p=0.000 n=100+98) Tar 219ms ± 8% 217ms ± 7% -0.90% (p=0.035 n=96+96) XML 255ms ± 5% 251ms ± 6% -1.48% (p=0.000 n=98+98) [Geo mean] 476ms 469ms -1.42% name old alloc/op new alloc/op delta Template 37.8MB ± 0% 37.8MB ± 0% -0.17% (p=0.000 n=100+100) Unicode 28.8MB ± 0% 28.8MB ± 0% -0.02% (p=0.000 n=100+95) GoTypes 112MB ± 0% 112MB ± 0% -0.20% (p=0.000 n=100+97) Compiler 466MB ± 0% 464MB ± 0% -0.27% (p=0.000 n=100+100) SSA 1.49GB ± 0% 1.49GB ± 0% -0.08% (p=0.000 n=100+99) Flate 24.4MB ± 0% 24.3MB ± 0% -0.25% (p=0.000 n=98+99) GoParser 30.7MB ± 0% 30.6MB ± 0% 
-0.26% (p=0.000 n=99+100) Reflect 76.4MB ± 0% 76.4MB ± 0% ~ (p=0.253 n=100+100) Tar 38.9MB ± 0% 38.8MB ± 0% -0.20% (p=0.000 n=100+97) XML 41.5MB ± 0% 41.4MB ± 0% -0.19% (p=0.000 n=100+98) [Geo mean] 77.5MB 77.4MB -0.16% name old allocs/op new allocs/op delta Template 381k ± 0% 381k ± 0% -0.15% (p=0.000 n=100+100) Unicode 342k ± 0% 342k ± 0% -0.01% (p=0.000 n=100+98) GoTypes 1.19M ± 0% 1.18M ± 0% -0.24% (p=0.000 n=100+100) Compiler 4.52M ± 0% 4.50M ± 0% -0.29% (p=0.000 n=100+100) SSA 12.3M ± 0% 12.3M ± 0% -0.11% (p=0.000 n=100+100) Flate 234k ± 0% 234k ± 0% -0.26% (p=0.000 n=99+96) GoParser 318k ± 0% 317k ± 0% -0.21% (p=0.000 n=99+100) Reflect 974k ± 0% 974k ± 0% -0.03% (p=0.000 n=100+100) Tar 392k ± 0% 391k ± 0% -0.17% (p=0.000 n=100+99) XML 404k ± 0% 403k ± 0% -0.24% (p=0.000 n=99+99) [Geo mean] 794k 792k -0.17% name old object-bytes new object-bytes delta Template 393kB ± 0% 392kB ± 0% -0.19% (p=0.008 n=5+5) Unicode 207kB ± 0% 207kB ± 0% ~ (all equal) GoTypes 1.23MB ± 0% 1.22MB ± 0% -0.11% (p=0.008 n=5+5) Compiler 4.34MB ± 0% 4.33MB ± 0% -0.15% (p=0.008 n=5+5) SSA 9.85MB ± 0% 9.85MB ± 0% -0.07% (p=0.008 n=5+5) Flate 235kB ± 0% 234kB ± 0% -0.59% (p=0.008 n=5+5) GoParser 297kB ± 0% 296kB ± 0% -0.22% (p=0.008 n=5+5) Reflect 1.03MB ± 0% 1.03MB ± 0% -0.00% (p=0.008 n=5+5) Tar 332kB ± 0% 331kB ± 0% -0.15% (p=0.008 n=5+5) XML 413kB ± 0% 412kB ± 0% -0.19% (p=0.008 n=5+5) [Geo mean] 728kB 727kB -0.17% Change-Id: I9b5cdb668ed102a001897a05e833105acba220a2 Reviewed-on: https://go-review.googlesource.com/95995 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-02-20 17:16:19 -08:00
{name: "tighten", fn: tighten}, // move values closer to their uses
cmd/compile: move phi args which are constants closer to the phi entry: x = MOVQconst [7] ... b1: goto b2 b2: v = Phi(x, y, z) Transform that program to: entry: ... b1: x = MOVQconst [7] goto b2 b2: v = Phi(x, y, z) This CL moves constant-generating instructions used by a phi to the appropriate immediate predecessor of the phi's block. We used to put all constants in the entry block. Unfortunately, in large functions we have lots of constants at the start of the function, all of which are used by lots of phis throughout the function. This leads to the constants being live through most of the function (especially if there is an outer loop). That's an O(n^2) problem. Note that most of the non-phi uses of constants have already been folded into instructions (ADDQconst, MOVQstoreconst, etc.). This CL may be generally useful for other instances of compiler slowness, I'll have to check. It may cause some programs to run slower, but probably not by much, as rematerializeable values like these constants are allocated late (not at their originally scheduled location) anyway. This CL is definitely a minimal change that can be considered for 1.7. We probably want to do a better job in the tighten pass generally, not just for phi args. Leaving that for 1.8. Update #16407 Change-Id: If112a8883b4ef172b2f37dea13e44bda9346c342 Reviewed-on: https://go-review.googlesource.com/25046 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
2016-07-18 23:06:04 -07:00
{name: "phi tighten", fn: phiTighten},
{name: "late deadcode", fn: deadcode},
[dev.ssa] cmd/compile: adjust branch likeliness for calls/loops Static branch predictions (which guide block ordering) are adjusted based on: loop/not-loop (favor looping) abnormal-exit/not (avoid panic) call/not-call (avoid call) ret/default (treat returns as rare) This appears to make no difference in performance of real code, meaning the compiler itself. The earlier version of this has been stripped down to help make the cost of this only-aesthetic-on-Intel phase be as cheap as possible (we probably want information about inner loops for improving register allocation, but because register allocation follows close behind this pass, conceivably the information could be reused -- so we might do this anyway just to normalize output). For a ./make.bash that takes 200 user seconds, about .75 second is reported in likelyadjust (summing nanoseconds reported with -d=ssa/likelyadjust/time ). Upstream predictions are respected. Includes test, limited to build on amd64 only. Did several iterations on the debugging output to allow some rough checks on behavior. Debug=1 logging notes agree/disagree with earlier passes, allowing analysis like the following: Run on make.bash: GO_GCFLAGS=-d=ssa/likelyadjust/debug \ ./make.bash >& lkly5.log grep 'ranch prediction' lkly5.log | wc -l 78242 // 78k predictions grep 'ranch predi' lkly5.log | egrep -v 'agrees with' | wc -l 29633 // 29k NEW predictions grep 'disagrees' lkly5.log | wc -l 444 // contradicted 444 times grep '< exit' lkly5.log | wc -l 10212 // 10k exit predictions grep '< exit' lkly5.log | egrep 'disagrees' | wc -l 5 // 5 contradicted by previous prediction grep '< exit' lkly5.log | egrep -v 'agrees' | wc -l 702 // 702-5 redundant with previous prediction grep '< call' lkly5.log | egrep -v 'agrees' | wc -l 16699 // 16k new call predictions grep 'stay in loop' lkly5.log | egrep -v 'agrees' | wc -l 3951 // 4k new "remain in loop" predictions Fixes #11451. 
Change-Id: Iafb0504f7030d304ef4b6dc1aba9a5789151a593 Reviewed-on: https://go-review.googlesource.com/19995 Run-TryBot: David Chase <drchase@google.com> Reviewed-by: Keith Randall <khr@golang.org>
2016-02-27 11:54:52 -05:00
{name: "critical", fn: critical, required: true}, // remove critical edges
{name: "likelyadjust", fn: likelyadjust},
{name: "layout", fn: layout, required: true}, // schedule blocks
{name: "schedule", fn: schedule, required: true}, // schedule values
{name: "late nilcheck", fn: nilcheckelim2},
{name: "flagalloc", fn: flagalloc, required: true}, // allocate flags register
{name: "regalloc", fn: regalloc, required: true}, // allocate int & float registers + stack slots
cmd/compile: rotate loops so conditional branch is at the end Old loops look like this: loop: CMPQ ... JGE exit ... JMP loop exit: New loops look like this: JMP entry loop: ... entry: CMPQ ... JLT loop This removes one instruction (the unconditional jump) from the inner loop. Kinda surprisingly, it matters. This is a bit different than the peeling that the old obj library did in that we don't duplicate the loop exit test. We just jump to the test. I'm not sure if it is better or worse to do that (peeling gets rid of the JMP but means more code duplication), but this CL is certainly a much simpler compiler change, so I'll try this way first. The obj library used to do peeling before CL https://go-review.googlesource.com/c/36205 turned it off. Fixes #15837 (remove obj instruction reordering) The reordering is already removed, this CL implements the only part of that reordering that we'd like to keep. Fixes #14758 (append loop) name old time/op new time/op delta Foo-12 817ns ± 4% 538ns ± 0% -34.08% (p=0.000 n=10+9) Bar-12 850ns ±11% 570ns ±13% -32.88% (p=0.000 n=10+10) Update #19595 (BLAS slowdown) name old time/op new time/op delta DgemvMedMedNoTransIncN-12 13.2µs ± 9% 10.2µs ± 1% -22.26% (p=0.000 n=9+9) Fixes #19633 (append loop) name old time/op new time/op delta Foo-12 810ns ± 1% 540ns ± 0% -33.30% (p=0.000 n=8+9) Update #18977 (Fannkuch11 regression) name old time/op new time/op delta Fannkuch11-8 2.80s ± 0% 3.01s ± 0% +7.47% (p=0.000 n=9+10) This one makes no sense. There's strictly 1 less instruction in the inner loop (17 instead of 18). They are exactly the same instructions except for the JMP that has been elided. go1 benchmarks generally don't look very impressive. But the gains for the specific issues above make this CL still probably worth it. 
name old time/op new time/op delta BinaryTree17-8 2.32s ± 0% 2.34s ± 0% +1.14% (p=0.000 n=9+7) Fannkuch11-8 2.80s ± 0% 3.01s ± 0% +7.47% (p=0.000 n=9+10) FmtFprintfEmpty-8 44.1ns ± 1% 46.1ns ± 1% +4.53% (p=0.000 n=10+10) FmtFprintfString-8 67.8ns ± 0% 74.4ns ± 1% +9.80% (p=0.000 n=10+9) FmtFprintfInt-8 74.9ns ± 0% 78.4ns ± 0% +4.67% (p=0.000 n=8+10) FmtFprintfIntInt-8 117ns ± 1% 123ns ± 1% +4.69% (p=0.000 n=9+10) FmtFprintfPrefixedInt-8 160ns ± 1% 146ns ± 0% -8.22% (p=0.000 n=8+10) FmtFprintfFloat-8 214ns ± 0% 206ns ± 0% -3.91% (p=0.000 n=8+8) FmtManyArgs-8 468ns ± 0% 497ns ± 1% +6.09% (p=0.000 n=8+10) GobDecode-8 6.16ms ± 0% 6.21ms ± 1% +0.76% (p=0.000 n=9+10) GobEncode-8 4.90ms ± 0% 4.92ms ± 1% +0.37% (p=0.028 n=9+10) Gzip-8 209ms ± 0% 212ms ± 0% +1.33% (p=0.000 n=10+10) Gunzip-8 36.6ms ± 0% 38.0ms ± 1% +4.03% (p=0.000 n=9+9) HTTPClientServer-8 84.2µs ± 0% 86.0µs ± 1% +2.14% (p=0.000 n=9+9) JSONEncode-8 13.6ms ± 3% 13.8ms ± 1% +1.55% (p=0.003 n=9+10) JSONDecode-8 53.2ms ± 5% 52.9ms ± 0% ~ (p=0.280 n=10+10) Mandelbrot200-8 3.78ms ± 0% 3.78ms ± 1% ~ (p=0.661 n=10+9) GoParse-8 2.89ms ± 0% 2.94ms ± 2% +1.50% (p=0.000 n=10+10) RegexpMatchEasy0_32-8 68.5ns ± 2% 68.9ns ± 1% ~ (p=0.136 n=10+10) RegexpMatchEasy0_1K-8 220ns ± 1% 225ns ± 1% +2.41% (p=0.000 n=10+10) RegexpMatchEasy1_32-8 64.7ns ± 0% 64.5ns ± 0% -0.28% (p=0.042 n=10+10) RegexpMatchEasy1_1K-8 348ns ± 1% 355ns ± 0% +1.90% (p=0.000 n=10+10) RegexpMatchMedium_32-8 102ns ± 1% 105ns ± 1% +2.95% (p=0.000 n=10+10) RegexpMatchMedium_1K-8 33.1µs ± 3% 32.5µs ± 0% -1.75% (p=0.000 n=10+10) RegexpMatchHard_32-8 1.71µs ± 1% 1.70µs ± 1% -0.84% (p=0.002 n=10+9) RegexpMatchHard_1K-8 51.1µs ± 0% 50.8µs ± 1% -0.48% (p=0.004 n=10+10) Revcomp-8 411ms ± 1% 402ms ± 0% -2.22% (p=0.000 n=10+9) Template-8 61.8ms ± 1% 59.7ms ± 0% -3.44% (p=0.000 n=9+9) TimeParse-8 306ns ± 0% 318ns ± 0% +3.83% (p=0.000 n=10+10) TimeFormat-8 320ns ± 0% 318ns ± 1% -0.53% (p=0.012 n=7+10) Change-Id: Ifaf29abbe5874e437048e411ba8f7cfbc9e1c94b Reviewed-on: 
https://go-review.googlesource.com/38431 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com>
2017-03-21 14:51:38 -07:00
{name: "loop rotate", fn: loopRotate},
{name: "stackframe", fn: stackframe, required: true},
{name: "trim", fn: trim}, // remove empty blocks
}
// Double-check phase ordering constraints.
// This code is intended to document the ordering requirements
// between different phases. It does not override the passes
// list above.

// constraint records a single ordering requirement between two passes,
// each identified by its name in the passes list.
type constraint struct {
	a, b string // a must come before b
}

// passOrder lists every ordering requirement that the passes list must
// satisfy. Each entry is checked at package initialization; the comment
// above an entry gives the reason for the requirement.
var passOrder = [...]constraint{
	// "insert resched checks" uses mem, better to clean out stores first.
	{"dse", "insert resched checks"},
	// insert resched checks adds new blocks containing generic instructions
	{"insert resched checks", "lower"},
	{"insert resched checks", "tighten"},
	// prove relies on common-subexpression elimination for maximum benefits.
	{"generic cse", "prove"},
	// deadcode after prove to eliminate all new dead blocks.
	{"prove", "generic deadcode"},
	// common-subexpression before dead-store elim, so that we recognize
	// when two address expressions are the same.
	{"generic cse", "dse"},
	// cse substantially improves nilcheckelim efficacy
	{"generic cse", "nilcheckelim"},
	// allow deadcode to clean up after nilcheckelim
	{"nilcheckelim", "generic deadcode"},
	// nilcheckelim generates sequences of plain basic blocks
	{"nilcheckelim", "fuse"},
	// nilcheckelim relies on opt to rewrite user nil checks
	{"opt", "nilcheckelim"},
	// tighten will be most effective when as many values have been removed as possible
	{"generic deadcode", "tighten"},
	{"generic cse", "tighten"},
	// checkbce needs the values removed
	{"generic deadcode", "check bce"},
	// don't run optimization pass until we've decomposed builtin objects
	{"decompose builtin", "late opt"},
	// decompose builtin is the last pass that may introduce new float ops, so run softfloat after it
	{"decompose builtin", "softfloat"},
	// don't layout blocks until critical edges have been removed
	{"critical", "layout"},
	// regalloc requires the removal of all critical edges
	{"critical", "regalloc"},
	// regalloc requires all the values in a block to be scheduled
	{"schedule", "regalloc"},
	// checkLower must run after lowering & subsequent dead code elim
	{"lower", "checkLower"},
	{"lowered deadcode", "checkLower"},
	// late nilcheck needs instructions to be scheduled.
	{"schedule", "late nilcheck"},
	// flagalloc needs instructions to be scheduled.
	{"schedule", "flagalloc"},
	// regalloc needs flags to be allocated first.
	{"flagalloc", "regalloc"},
	// loopRotate will confuse regalloc.
	{"regalloc", "loop rotate"},
	// stackframe needs to know about spilled registers.
	{"regalloc", "stackframe"},
	// trim needs regalloc to be done first.
	{"regalloc", "trim"},
}
// init checks that the passes list honors every ordering requirement
// recorded in passOrder. It panics through log.Panicf when a pass named
// by a constraint is absent from passes, or when the two passes of a
// constraint are not in the required order.
func init() {
	for _, want := range passOrder {
		first, second := -1, -1
		// Walk the whole list without stopping early, so that a name
		// occurring more than once resolves to its last position.
		for idx := range passes {
			name := passes[idx].name
			if name == want.a {
				first = idx
			}
			if name == want.b {
				second = idx
			}
		}
		switch {
		case first < 0:
			log.Panicf("pass %s not found", want.a)
		case second < 0:
			log.Panicf("pass %s not found", want.b)
		case first >= second:
			log.Panicf("passes %s and %s out of order", want.a, want.b)
		}
	}
}