go/src/cmd/compile/internal/ssa/compile.go

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package ssa

import (
	"fmt"
	"log"
	"runtime"
	"strings"
	"time"
)

// Compile is the main entry point for this package.
// Compile modifies f so that on return:
// · all Values in f map to 0 or 1 assembly instructions of the target architecture
// · the order of f.Blocks is the order to emit the Blocks
// · the order of b.Values is the order to emit the Values in each Block
// · f has a non-nil regAlloc field
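//
// A typical caller builds a *Func, runs Compile, then emits machine code
// from the resulting order. A minimal sketch, assuming a hypothetical
// buildFunc helper that constructs the SSA form of one function:
//
//	f := buildFunc(fn) // hypothetical: produce a *Func
//	Compile(f)         // f is now lowered, scheduled, and register-allocated
//	// emit instructions by walking f.Blocks, and each b.Values, in order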
func Compile(f *Func) {
	// TODO: debugging - set flags to control verbosity of compiler,
	// which phases to dump IR before/after, etc.
	if f.Log() {
		f.Logf("compiling %s\n", f.Name)
	}

	// hook to print function & phase if panic happens
	phaseName := "init"
	defer func() {
		if phaseName != "" {
			err := recover()
			stack := make([]byte, 16384)
			n := runtime.Stack(stack, false)
			stack = stack[:n]
			f.Fatalf("panic during %s while compiling %s:\n\n%v\n\n%s\n", phaseName, f.Name, err, stack)
		}
	}()

	// Run all the passes
	printFunc(f)
	f.Config.HTML.WriteFunc("start", f)
	if checkEnabled {
		checkFunc(f)
	}
	const logMemStats = false
	for _, p := range passes {
		if !f.Config.optimize && !p.required {
			continue
		}
		f.pass = &p
		phaseName = p.name
		if f.Log() {
			f.Logf(" pass %s begin\n", p.name)
		}
		// TODO: capture logging during this pass, add it to the HTML
		var mStart runtime.MemStats
		if logMemStats || p.mem {
			runtime.ReadMemStats(&mStart)
		}

		tStart := time.Now()
		p.fn(f)
		tEnd := time.Now()

		// Need something less crude than "Log the whole intermediate result".
		if f.Log() || f.Config.HTML != nil {
			time := tEnd.Sub(tStart).Nanoseconds()
			var stats string
			if logMemStats {
				var mEnd runtime.MemStats
				runtime.ReadMemStats(&mEnd)
				nBytes := mEnd.TotalAlloc - mStart.TotalAlloc
				nAllocs := mEnd.Mallocs - mStart.Mallocs
				stats = fmt.Sprintf("[%d ns %d allocs %d bytes]", time, nAllocs, nBytes)
			} else {
				stats = fmt.Sprintf("[%d ns]", time)
			}

			f.Logf(" pass %s end %s\n", p.name, stats)
			printFunc(f)
			f.Config.HTML.WriteFunc(fmt.Sprintf("after %s <span class=\"stats\">%s</span>", phaseName, stats), f)
		}
		if p.time || p.mem {
			// Surround timing information w/ enough context to allow comparisons.
			time := tEnd.Sub(tStart).Nanoseconds()
			if p.time {
				f.logStat("TIME(ns)", time)
			}
			if p.mem {
				var mEnd runtime.MemStats
				runtime.ReadMemStats(&mEnd)
				nBytes := mEnd.TotalAlloc - mStart.TotalAlloc
				nAllocs := mEnd.Mallocs - mStart.Mallocs
				f.logStat("TIME(ns):BYTES:ALLOCS", time, nBytes, nAllocs)
			}
		}
		if checkEnabled {
			checkFunc(f)
		}
	}

	// Squash error printing defer
	phaseName = ""
}

type pass struct {
	name     string
	fn       func(*Func)
	required bool
	disabled bool
	time     bool // report time to run pass
	mem      bool // report mem stats to run pass
	stats    int  // pass reports own "stats" (e.g., branches removed)
	debug    int  // pass performs some debugging. =1 should be in error-testing-friendly Warnl format.
	test     int  // pass-specific ad-hoc option, perhaps useful in development
}

// Run consistency checker between each phase
var checkEnabled = true
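// checkEnabled can be flipped at run time through PhaseOption (below) with
// phase "check" and flag "on" or "off", e.g. via a -d=ssa/check/on option.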

// PhaseOption sets the specified flag in the specified ssa phase,
// returning the empty string if this was successful or a string
// explaining the error if it was not. A version of the phase name with
// "_" replaced by " " is also checked for a match.
// See gc/lex.go for dissection of the option string. Example use:
// GO_GCFLAGS=-d=ssa/generic_cse/time,ssa/generic_cse/stats,ssa/generic_cse/debug=3 ./make.bash ...
//
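// For example, a debugging harness could set the same option
// programmatically (a sketch; "generic_cse" is matched to the
// "generic cse" pass via the underscore rule above):
//
//	if msg := PhaseOption("generic_cse", "time", 1); msg != "" {
//		log.Fatal(msg) // unknown phase or flag
//	}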
func PhaseOption(phase, flag string, val int) string {
	if phase == "check" && flag == "on" {
		checkEnabled = val != 0
		return ""
	}
	if phase == "check" && flag == "off" {
		checkEnabled = val == 0
		return ""
	}

	underphase := strings.Replace(phase, "_", " ", -1)
	for i, p := range passes {
		if p.name == phase || p.name == underphase {
			switch flag {
			case "on":
				p.disabled = val == 0
			case "off":
				p.disabled = val != 0
			case "time":
				p.time = val != 0
			case "mem":
				p.mem = val != 0
			case "debug":
				p.debug = val
			case "stats":
				p.stats = val
			case "test":
				p.test = val
			default:
				return fmt.Sprintf("Did not find a flag matching %s in -d=ssa/%s debug option", flag, phase)
			}
			if p.disabled && p.required {
				return fmt.Sprintf("Cannot disable required SSA phase %s using -d=ssa/%s debug option", phase, phase)
			}
			passes[i] = p
			return ""
		}
	}
	return fmt.Sprintf("Did not find a phase matching %s in -d=ssa/... debug option", phase)
}

// list of passes for the compiler
var passes = [...]pass{
	// TODO: combine phielim and copyelim into a single pass?
	{name: "early phielim", fn: phielim},
	{name: "early copyelim", fn: copyelim},
	{name: "early deadcode", fn: deadcode}, // remove generated dead code to avoid doing pointless work during opt
	{name: "short circuit", fn: shortcircuit},
	{name: "decompose user", fn: decomposeUser, required: true},
	{name: "decompose builtin", fn: decomposeBuiltIn, required: true},
	{name: "opt", fn: opt, required: true},           // TODO: split required rules and optimizing rules
	{name: "zero arg cse", fn: zcse, required: true}, // required to merge OpSB values
	{name: "opt deadcode", fn: deadcode},             // remove any blocks orphaned during opt
	{name: "generic cse", fn: cse},
	{name: "phiopt", fn: phiopt},
	{name: "nilcheckelim", fn: nilcheckelim},
	{name: "prove", fn: prove},
	{name: "generic deadcode", fn: deadcode},
	{name: "fuse", fn: fuse},
	{name: "dse", fn: dse},
	{name: "tighten", fn: tighten}, // move values closer to their uses
	{name: "lower", fn: lower, required: true},
	{name: "lowered cse", fn: cse},
	{name: "lowered deadcode", fn: deadcode, required: true},
	{name: "checkLower", fn: checkLower, required: true},
	{name: "late phielim", fn: phielim},
	{name: "late copyelim", fn: copyelim},
	{name: "late deadcode", fn: deadcode},
{name: "critical", fn: critical, required: true}, // remove critical edges
{name: "likelyadjust", fn: likelyadjust},
{name: "layout", fn: layout, required: true}, // schedule blocks
{name: "schedule", fn: schedule, required: true}, // schedule values
{name: "flagalloc", fn: flagalloc, required: true}, // allocate flags register
{name: "regalloc", fn: regalloc, required: true}, // allocate int & float registers + stack slots
{name: "trim", fn: trim}, // remove empty blocks
}

// Double-check phase ordering constraints.
// This code is intended to document the ordering requirements
// between different phases. It does not override the passes
// list above.
type constraint struct {
	a, b string // a must come before b
}

var passOrder = [...]constraint{
	// prove relies on common-subexpression elimination for maximum benefit.
	{"generic cse", "prove"},
	// deadcode after prove to eliminate all new dead blocks.
	{"prove", "generic deadcode"},
	// common-subexpression before dead-store elim, so that we recognize
	// when two address expressions are the same.
	{"generic cse", "dse"},
	// cse substantially improves nilcheckelim efficacy
	{"generic cse", "nilcheckelim"},
	// allow deadcode to clean up after nilcheckelim
	{"nilcheckelim", "generic deadcode"},
	// nilcheckelim generates sequences of plain basic blocks
	{"nilcheckelim", "fuse"},
	// nilcheckelim relies on opt to rewrite user nil checks
	{"opt", "nilcheckelim"},
	// tighten should happen before lowering to avoid splitting naturally paired instructions such as CMP/SET
	{"tighten", "lower"},
	// tighten will be most effective when as many values have been removed as possible
	{"generic deadcode", "tighten"},
	{"generic cse", "tighten"},
	// don't run optimization pass until we've decomposed builtin objects
	{"decompose builtin", "opt"},
	// don't layout blocks until critical edges have been removed
	{"critical", "layout"},
	// regalloc requires the removal of all critical edges
	{"critical", "regalloc"},
	// regalloc requires all the values in a block to be scheduled
	{"schedule", "regalloc"},
	// checkLower must run after lowering & subsequent dead code elim
	{"lower", "checkLower"},
	{"lowered deadcode", "checkLower"},
	// flagalloc needs instructions to be scheduled.
	{"schedule", "flagalloc"},
	// regalloc needs flags to be allocated first.
	{"flagalloc", "regalloc"},
	// trim needs regalloc to be done first.
	{"regalloc", "trim"},
}
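
// To add a new ordering requirement, append a constraint above; the init
// check below panics at process start if the passes list violates it. A
// purely hypothetical example (not a real requirement of these passes):
//
//	{"fuse", "tighten"}, // would force fuse to run before tighten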

func init() {
	for _, c := range passOrder {
		a, b := c.a, c.b
		i := -1
		j := -1
		for k, p := range passes {
			if p.name == a {
				i = k
			}
			if p.name == b {
				j = k
			}
		}
		if i < 0 {
			log.Panicf("pass %s not found", a)
		}
		if j < 0 {
			log.Panicf("pass %s not found", b)
		}
		if i >= j {
			log.Panicf("passes %s and %s out of order", a, b)
		}
	}
}