go/src/cmd/compile/internal/ssa/cse.go

329 lines
8.1 KiB
Go
Raw Normal View History

// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ssa
import (
"fmt"
"sort"
)
const (
cmpDepth = 4
)
// cse does common-subexpression elimination on the Function.
// Values are just relinked, nothing is deleted. A subsequent deadcode
// pass is required to actually remove duplicate expressions.
func cse(f *Func) {
// Two values are equivalent if they satisfy the following definition:
// equivalent(v, w):
// v.op == w.op
// v.type == w.type
// v.aux == w.aux
// v.auxint == w.auxint
// len(v.args) == len(w.args)
[dev.ssa] cmd/compile: don't combine phi vars from different blocks in CSE Here is a concrete case in which this goes wrong. func f_ssa() int { var n int Next: for j := 0; j < 3; j++ { for i := 0; i < 10; i++ { if i == 6 { continue Next } n = i } n += j + j + j + j + j + j + j + j + j + j // j * 10 } return n } What follows is the function printout before and after CSE. Note blocks b8 and b10 in the before case. b8 is the inner loop's condition: i < 10. b10 is the inner loop's increment: i++. v82 is i. On entry to b8, it is either 0 (v19) the first time, or the result of incrementing v82, by way of v29. The CSE pass considered v82 and v49 to be common subexpressions, and eliminated v82 in favor of v49. In the after case, v82 is now dead and will shortly be eliminated. As a result, v29 is also dead, and we have lost the increment. The loop runs forever. BEFORE CSE f_ssa <nil> b1: v1 = Arg <mem> v2 = SP <uint64> v4 = Addr <*int> {~r0} v2 v13 = Zero <mem> [8] v4 v1 v14 = Const <int> v15 = Const <int> v17 = Const <int> [3] v19 = Const <int> v21 = Const <int> [10] v24 = Const <int> [6] v28 = Const <int> [1] v43 = Const <int> [1] Plain -> b3 b2: <- b7 Exit v47 b3: <- b1 Plain -> b4 b4: <- b3 b6 v49 = Phi <int> v15 v44 v68 = Phi <int> v14 v67 v81 = Phi <mem> v13 v81 v18 = Less <bool> v49 v17 If v18 -> b5 b7 b5: <- b4 Plain -> b8 b6: <- b12 b11 v67 = Phi <int> v66 v41 v44 = Add <int> v49 v43 Plain -> b4 b7: <- b4 v47 = Store <mem> v4 v68 v81 Plain -> b2 b8: <- b5 b10 v66 = Phi <int> v68 v82 v82 = Phi <int> v19 v29 v22 = Less <bool> v82 v21 If v22 -> b9 b11 b9: <- b8 v25 = Eq <bool> v82 v24 If v25 -> b12 b13 b10: <- b13 v29 = Add <int> v82 v28 Plain -> b8 b11: <- b8 v32 = Add <int> v49 v49 v33 = Add <int> v32 v49 v34 = Add <int> v33 v49 v35 = Add <int> v34 v49 v36 = Add <int> v35 v49 v37 = Add <int> v36 v49 v38 = Add <int> v37 v49 v39 = Add <int> v38 v49 v40 = Add <int> v39 v49 v41 = Add <int> v66 v40 Plain -> b6 b12: <- b9 Plain -> b6 b13: <- b9 Plain -> b10 AFTER CSE f_ssa <nil> b1: v1 = Arg <mem> v2 = SP <uint64> v4 = Addr <*int> {~r0} v2 v13 = Zero <mem> [8] v4 v1 v14 = Const <int> v15 = Const <int> v17 = Const <int> [3] v19 = Const <int> v21 = Const <int> [10] v24 = Const <int> [6] v28 = Const <int> [1] v43 = Const <int> [1] Plain -> b3 b2: <- b7 Exit v47 b3: <- b1 Plain -> b4 b4: <- b3 b6 v49 = Phi <int> v19 v44 v68 = Phi <int> v19 v67 v81 = Phi <mem> v13 v81 v18 = Less <bool> v49 v17 If v18 -> b5 b7 b5: <- b4 Plain -> b8 b6: <- b12 b11 v67 = Phi <int> v66 v41 v44 = Add <int> v49 v43 Plain -> b4 b7: <- b4 v47 = Store <mem> v4 v68 v81 Plain -> b2 b8: <- b5 b10 v66 = Phi <int> v68 v49 v82 = Phi <int> v19 v29 v22 = Less <bool> v49 v21 If v22 -> b9 b11 b9: <- b8 v25 = Eq <bool> v49 v24 If v25 -> b12 b13 b10: <- b13 v29 = Add <int> v49 v43 Plain -> b8 b11: <- b8 v32 = Add <int> v49 v49 v33 = Add <int> v32 v49 v34 = Add <int> v33 v49 v35 = Add <int> v34 v49 v36 = Add <int> v35 v49 v37 = Add <int> v36 v49 v38 = Add <int> v37 v49 v39 = Add <int> v38 v49 v40 = Add <int> v39 v49 v41 = Add <int> v66 v40 Plain -> b6 b12: <- b9 Plain -> b6 b13: <- b9 Plain -> b10 Change-Id: I16fc4ec527ec63f24f7d0d79d1a4a59bf37269de Reviewed-on: https://go-review.googlesource.com/12444 Reviewed-by: Keith Randall <khr@golang.org>
2015-07-20 18:50:17 -07:00
// v.block == w.block if v.op == OpPhi
// equivalent(v.args[i], w.args[i]) for i in 0..len(v.args)-1
// The algorithm searches for a partition of f's values into
// equivalence classes using the above definition.
// It starts with a coarse partition and iteratively refines it
// until it reaches a fixed point.
// Make initial coarse partitions by using a subset of the conditions above.
a := make([]*Value, 0, f.NumValues())
auxIDs := auxmap{}
for _, b := range f.Blocks {
for _, v := range b.Values {
if auxIDs[v.Aux] == 0 {
auxIDs[v.Aux] = int32(len(auxIDs)) + 1
}
if v.Type.IsMemory() {
continue // memory values can never cse
[dev.ssa] cmd/compile: don't combine phi vars from different blocks in CSE Here is a concrete case in which this goes wrong. func f_ssa() int { var n int Next: for j := 0; j < 3; j++ { for i := 0; i < 10; i++ { if i == 6 { continue Next } n = i } n += j + j + j + j + j + j + j + j + j + j // j * 10 } return n } What follows is the function printout before and after CSE. Note blocks b8 and b10 in the before case. b8 is the inner loop's condition: i < 10. b10 is the inner loop's increment: i++. v82 is i. On entry to b8, it is either 0 (v19) the first time, or the result of incrementing v82, by way of v29. The CSE pass considered v82 and v49 to be common subexpressions, and eliminated v82 in favor of v49. In the after case, v82 is now dead and will shortly be eliminated. As a result, v29 is also dead, and we have lost the increment. The loop runs forever. BEFORE CSE f_ssa <nil> b1: v1 = Arg <mem> v2 = SP <uint64> v4 = Addr <*int> {~r0} v2 v13 = Zero <mem> [8] v4 v1 v14 = Const <int> v15 = Const <int> v17 = Const <int> [3] v19 = Const <int> v21 = Const <int> [10] v24 = Const <int> [6] v28 = Const <int> [1] v43 = Const <int> [1] Plain -> b3 b2: <- b7 Exit v47 b3: <- b1 Plain -> b4 b4: <- b3 b6 v49 = Phi <int> v15 v44 v68 = Phi <int> v14 v67 v81 = Phi <mem> v13 v81 v18 = Less <bool> v49 v17 If v18 -> b5 b7 b5: <- b4 Plain -> b8 b6: <- b12 b11 v67 = Phi <int> v66 v41 v44 = Add <int> v49 v43 Plain -> b4 b7: <- b4 v47 = Store <mem> v4 v68 v81 Plain -> b2 b8: <- b5 b10 v66 = Phi <int> v68 v82 v82 = Phi <int> v19 v29 v22 = Less <bool> v82 v21 If v22 -> b9 b11 b9: <- b8 v25 = Eq <bool> v82 v24 If v25 -> b12 b13 b10: <- b13 v29 = Add <int> v82 v28 Plain -> b8 b11: <- b8 v32 = Add <int> v49 v49 v33 = Add <int> v32 v49 v34 = Add <int> v33 v49 v35 = Add <int> v34 v49 v36 = Add <int> v35 v49 v37 = Add <int> v36 v49 v38 = Add <int> v37 v49 v39 = Add <int> v38 v49 v40 = Add <int> v39 v49 v41 = Add <int> v66 v40 Plain -> b6 b12: <- b9 Plain -> b6 b13: <- b9 Plain -> b10 AFTER CSE f_ssa <nil> b1: v1 = Arg <mem> v2 = SP <uint64> v4 = Addr <*int> {~r0} v2 v13 = Zero <mem> [8] v4 v1 v14 = Const <int> v15 = Const <int> v17 = Const <int> [3] v19 = Const <int> v21 = Const <int> [10] v24 = Const <int> [6] v28 = Const <int> [1] v43 = Const <int> [1] Plain -> b3 b2: <- b7 Exit v47 b3: <- b1 Plain -> b4 b4: <- b3 b6 v49 = Phi <int> v19 v44 v68 = Phi <int> v19 v67 v81 = Phi <mem> v13 v81 v18 = Less <bool> v49 v17 If v18 -> b5 b7 b5: <- b4 Plain -> b8 b6: <- b12 b11 v67 = Phi <int> v66 v41 v44 = Add <int> v49 v43 Plain -> b4 b7: <- b4 v47 = Store <mem> v4 v68 v81 Plain -> b2 b8: <- b5 b10 v66 = Phi <int> v68 v49 v82 = Phi <int> v19 v29 v22 = Less <bool> v49 v21 If v22 -> b9 b11 b9: <- b8 v25 = Eq <bool> v49 v24 If v25 -> b12 b13 b10: <- b13 v29 = Add <int> v49 v43 Plain -> b8 b11: <- b8 v32 = Add <int> v49 v49 v33 = Add <int> v32 v49 v34 = Add <int> v33 v49 v35 = Add <int> v34 v49 v36 = Add <int> v35 v49 v37 = Add <int> v36 v49 v38 = Add <int> v37 v49 v39 = Add <int> v38 v49 v40 = Add <int> v39 v49 v41 = Add <int> v66 v40 Plain -> b6 b12: <- b9 Plain -> b6 b13: <- b9 Plain -> b10 Change-Id: I16fc4ec527ec63f24f7d0d79d1a4a59bf37269de Reviewed-on: https://go-review.googlesource.com/12444 Reviewed-by: Keith Randall <khr@golang.org>
2015-07-20 18:50:17 -07:00
}
if opcodeTable[v.Op].commutative && len(v.Args) == 2 && v.Args[1].ID < v.Args[0].ID {
// Order the arguments of binary commutative operations.
v.Args[0], v.Args[1] = v.Args[1], v.Args[0]
}
a = append(a, v)
}
}
partition := partitionValues(a, auxIDs)
// map from value id back to eqclass id
valueEqClass := make([]ID, f.NumValues())
for _, b := range f.Blocks {
for _, v := range b.Values {
// Use negative equivalence class #s for unique values.
valueEqClass[v.ID] = -v.ID
}
}
for i, e := range partition {
if f.pass.debug > 1 && len(e) > 500 {
fmt.Printf("CSE.large partition (%d): ", len(e))
for j := 0; j < 3; j++ {
fmt.Printf("%s ", e[j].LongString())
}
fmt.Println()
}
for _, v := range e {
valueEqClass[v.ID] = ID(i)
}
if f.pass.debug > 2 && len(e) > 1 {
fmt.Printf("CSE.partition #%d:", i)
for _, v := range e {
fmt.Printf(" %s", v.String())
}
fmt.Printf("\n")
}
}
// Find an equivalence class where some members of the class have
// non-equivalent arguments. Split the equivalence class appropriately.
// Repeat until we can't find any more splits.
for {
changed := false
// partition can grow in the loop. By not using a range loop here,
// we process new additions as they arrive, avoiding O(n^2) behavior.
for i := 0; i < len(partition); i++ {
e := partition[i]
v := e[0]
// all values in this equiv class that are not equivalent to v get moved
// into another equiv class.
// To avoid allocating while building that equivalence class,
// move the values equivalent to v to the beginning of e
// and other values to the end of e.
allvals := e
eqloop:
for j := 1; j < len(e); {
w := e[j]
equivalent := true
for i := 0; i < len(v.Args); i++ {
if valueEqClass[v.Args[i].ID] != valueEqClass[w.Args[i].ID] {
equivalent = false
break
}
}
if !equivalent || !v.Type.Equal(w.Type) {
// w is not equivalent to v.
// move it to the end and shrink e.
e[j], e[len(e)-1] = e[len(e)-1], e[j]
e = e[:len(e)-1]
valueEqClass[w.ID] = ID(len(partition))
changed = true
continue eqloop
}
// v and w are equivalent. Keep w in e.
j++
}
partition[i] = e
if len(e) < len(allvals) {
partition = append(partition, allvals[len(e):])
}
}
if !changed {
break
}
}
// Dominator tree (f.sdom) is computed by the generic domtree pass.
// Compute substitutions we would like to do. We substitute v for w
// if v and w are in the same equivalence class and v dominates w.
rewrite := make([]*Value, f.NumValues())
for _, e := range partition {
sort.Sort(sortbyentry{e, f.sdom})
for i := 0; i < len(e)-1; i++ {
// e is sorted by entry value so maximal dominant element should be
// found first in the slice
v := e[i]
if v == nil {
continue
}
e[i] = nil
// Replace all elements of e which v dominates
for j := i + 1; j < len(e); j++ {
w := e[j]
if w == nil {
continue
}
if f.sdom.isAncestorEq(v.Block, w.Block) {
rewrite[w.ID] = v
e[j] = nil
} else {
// since the blocks are assorted in ascending order by entry number
// once we know that we don't dominate a block we can't dominate any
// 'later' block
break
}
}
}
}
rewrites := int64(0)
// Apply substitutions
for _, b := range f.Blocks {
for _, v := range b.Values {
for i, w := range v.Args {
if x := rewrite[w.ID]; x != nil {
v.SetArg(i, x)
rewrites++
}
}
}
if v := b.Control; v != nil {
if x := rewrite[v.ID]; x != nil {
if v.Op == OpNilCheck {
// nilcheck pass will remove the nil checks and log
// them appropriately, so don't mess with them here.
continue
}
b.SetControl(x)
}
}
}
if f.pass.stats > 0 {
f.logStat("CSE REWRITES", rewrites)
}
}
// An eqclass approximates an equivalence class. During the
// algorithm it may represent the union of several of the
// final equivalence classes.
type eqclass []*Value
// partitionValues partitions the values into equivalence classes
// based on having all the following features match:
// - opcode
// - type
// - auxint
// - aux
// - nargs
// - block # if a phi op
// - first two arg's opcodes and auxint
// - NOT first two arg's aux; that can break CSE.
// partitionValues returns a list of equivalence classes, each
// being a sorted by ID list of *Values. The eqclass slices are
// backed by the same storage as the input slice.
// Equivalence classes of size 1 are ignored.
func partitionValues(a []*Value, auxIDs auxmap) []eqclass {
sort.Sort(sortvalues{a, auxIDs})
var partition []eqclass
for len(a) > 0 {
v := a[0]
j := 1
for ; j < len(a); j++ {
w := a[j]
if cmpVal(v, w, auxIDs, cmpDepth) != CMPeq {
break
}
}
if j > 1 {
partition = append(partition, a[:j])
}
a = a[j:]
}
return partition
}
func lt2Cmp(isLt bool) Cmp {
if isLt {
return CMPlt
}
return CMPgt
}
type auxmap map[interface{}]int32
func cmpVal(v, w *Value, auxIDs auxmap, depth int) Cmp {
// Try to order these comparison by cost (cheaper first)
if v.Op != w.Op {
return lt2Cmp(v.Op < w.Op)
}
if v.AuxInt != w.AuxInt {
return lt2Cmp(v.AuxInt < w.AuxInt)
}
if len(v.Args) != len(w.Args) {
return lt2Cmp(len(v.Args) < len(w.Args))
}
if v.Op == OpPhi && v.Block != w.Block {
return lt2Cmp(v.Block.ID < w.Block.ID)
}
cmd/compile: teach CSE that new objects are bespoke runtime.newobject never returns the same thing twice, so the resulting value will never be a common subexpression. This helps when compiling large static data structures that include pointers, such as maps and slices. No clear performance impact on other code. (See below.) For the code in issue #15112: Before: real 1m14.238s user 1m18.985s sys 0m0.787s After: real 0m47.172s user 0m52.248s sys 0m0.767s For the code in issue #15235, size 10k: Before: real 0m44.916s user 0m46.577s sys 0m0.304s After: real 0m7.703s user 0m9.041s sys 0m0.316s Still more work to be done, particularly for #15112. Updates #15112 Updates #15235 name old time/op new time/op delta Template 330ms ±11% 333ms ±13% ~ (p=0.749 n=20+19) Unicode 148ms ± 6% 152ms ± 8% ~ (p=0.072 n=18+20) GoTypes 1.01s ± 7% 1.01s ± 3% ~ (p=0.583 n=20+20) Compiler 5.04s ± 2% 5.06s ± 2% ~ (p=0.314 n=20+20) name old user-ns/op new user-ns/op delta Template 444user-ms ±11% 445user-ms ±10% ~ (p=0.738 n=20+20) Unicode 215user-ms ± 5% 218user-ms ± 5% ~ (p=0.239 n=18+18) GoTypes 1.45user-s ± 3% 1.45user-s ± 4% ~ (p=0.620 n=20+20) Compiler 7.23user-s ± 2% 7.22user-s ± 2% ~ (p=0.901 n=20+19) name old alloc/op new alloc/op delta Template 55.0MB ± 0% 55.0MB ± 0% ~ (p=0.547 n=20+20) Unicode 37.6MB ± 0% 37.6MB ± 0% ~ (p=0.301 n=20+20) GoTypes 177MB ± 0% 177MB ± 0% ~ (p=0.065 n=20+19) Compiler 798MB ± 0% 797MB ± 0% -0.05% (p=0.000 n=19+20) name old allocs/op new allocs/op delta Template 492k ± 0% 493k ± 0% +0.03% (p=0.030 n=20+20) Unicode 377k ± 0% 377k ± 0% ~ (p=0.423 n=20+19) GoTypes 1.40M ± 0% 1.40M ± 0% ~ (p=0.102 n=20+20) Compiler 5.53M ± 0% 5.53M ± 0% ~ (p=0.094 n=17+18) name old text-bytes new text-bytes delta HelloSize 561k ± 0% 561k ± 0% ~ (all samples are equal) CmdGoSize 6.13M ± 0% 6.13M ± 0% ~ (all samples are equal) name old data-bytes new data-bytes delta HelloSize 128k ± 0% 128k ± 0% ~ (all samples are equal) CmdGoSize 306k ± 0% 306k ± 0% ~ (all samples are equal) name old exe-bytes new exe-bytes delta HelloSize 905k ± 0% 905k ± 0% ~ (all samples are equal) CmdGoSize 9.64M ± 0% 9.64M ± 0% ~ (all samples are equal) Change-Id: Id774e2901d7701a3ec7a1c1d1cf1d9327a4107fc Reviewed-on: https://go-review.googlesource.com/21937 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Todd Neal <todd@tneal.org>
2016-04-12 17:12:26 -07:00
switch v.Op {
case OpStaticCall, OpAMD64CALLstatic, OpARMCALLstatic:
sym := v.Aux.(GCSym)
if sym.IsRuntimeCall("newobject") {
return lt2Cmp(v.ID < w.ID)
}
}
if tc := v.Type.Compare(w.Type); tc != CMPeq {
return tc
}
if v.Aux != w.Aux {
if v.Aux == nil {
return CMPlt
}
if w.Aux == nil {
return CMPgt
}
return lt2Cmp(auxIDs[v.Aux] < auxIDs[w.Aux])
}
if depth > 0 {
for i := range v.Args {
if v.Args[i] == w.Args[i] {
// skip comparing equal args
continue
}
if ac := cmpVal(v.Args[i], w.Args[i], auxIDs, depth-1); ac != CMPeq {
return ac
}
}
}
return CMPeq
}
// Sort values to make the initial partition.
type sortvalues struct {
a []*Value // array of values
auxIDs auxmap // aux -> aux ID map
}
func (sv sortvalues) Len() int { return len(sv.a) }
func (sv sortvalues) Swap(i, j int) { sv.a[i], sv.a[j] = sv.a[j], sv.a[i] }
func (sv sortvalues) Less(i, j int) bool {
v := sv.a[i]
w := sv.a[j]
if cmp := cmpVal(v, w, sv.auxIDs, cmpDepth); cmp != CMPeq {
return cmp == CMPlt
}
// Sort by value ID last to keep the sort result deterministic.
return v.ID < w.ID
}
type sortbyentry struct {
a []*Value // array of values
sdom sparseTree
}
func (sv sortbyentry) Len() int { return len(sv.a) }
func (sv sortbyentry) Swap(i, j int) { sv.a[i], sv.a[j] = sv.a[j], sv.a[i] }
func (sv sortbyentry) Less(i, j int) bool {
v := sv.a[i]
w := sv.a[j]
return sv.sdom.maxdomorder(v.Block) < sv.sdom.maxdomorder(w.Block)
}