cmd/compile: optimize liveness in stackalloc

The stackalloc code needs to run a liveness pass to build the
interference graph between stack slots. Because the set of values that
we need liveness for is so sparse, we can optimize the analysis by using
a path exploration algorithm rather than an iterative dataflow one.

In local testing, this cuts 74.05 ms of CPU time off a build of cmd/compile.

Change-Id: I765ace87d5e8aae177e65eb63da482e3d698bea7
Reviewed-on: https://go-review.googlesource.com/c/go/+/718540
Reviewed-by: Keith Randall <khr@golang.org>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
Daniel Morsing, 2025-11-05 18:33:44 +00:00; committed by Gopher Robot
parent 956909ff84
commit 4e761b9a18


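As background to the change below: the path exploration idea is, for each slot-needing value, to walk the CFG backwards from the value's use blocks toward its definition, marking the value live-out of every predecessor visited. The work is proportional to the region where each value is live, rather than to iterating all blocks to a fixed point. Here is a minimal standalone sketch of that idea; block and liveOut are hypothetical stand-ins for illustration, not the compiler's actual types.

package main

import "fmt"

// A minimal CFG model: block IDs and predecessor edges.
type block struct {
	id    int
	preds []*block
}

// liveOut computes, for a single value, the set of blocks out of which
// the value is live: walk backwards from each use block, marking the
// value live-out of every predecessor, stopping at the defining block
// and at blocks already visited.
func liveOut(def *block, uses []*block) map[int]bool {
	liveout := make(map[int]bool)
	seen := make(map[int]bool)
	queue := append([]*block(nil), uses...)
	for len(queue) > 0 {
		work := queue[len(queue)-1]
		queue = queue[:len(queue)-1]
		if seen[work.id] || work == def {
			continue
		}
		seen[work.id] = true
		for _, p := range work.preds {
			liveout[p.id] = true
			queue = append(queue, p)
		}
	}
	return liveout
}

func main() {
	// Diamond CFG: b0 -> {b1, b2} -> b3.
	b0 := &block{id: 0}
	b1 := &block{id: 1, preds: []*block{b0}}
	b2 := &block{id: 2, preds: []*block{b0}}
	b3 := &block{id: 3, preds: []*block{b1, b2}}
	// A value defined in b0 and used in b3 is live-out of b0, b1, and b2;
	// only blocks on paths between def and use are ever visited.
	fmt.Println(liveOut(b0, []*block{b3})) // map[0:true 1:true 2:true]
}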
@@ -56,10 +56,35 @@ func putStackAllocState(s *stackAllocState) {
 }
 
 type stackValState struct {
 	typ      *types.Type
 	spill    *Value
 	needSlot bool
 	isArg    bool
+	defBlock  ID
+	useBlocks []stackUseBlock
+}
+
+// addUseBlock adds a block to the set of blocks that use this value.
+// Note that we only loosely enforce the set property by checking the last
+// block that was appended to the list, so duplicates may occur.
+// Because we add values block by block (barring phi nodes), the number of
+// duplicates is small and we deduplicate as part of the liveness algorithm
+// later anyway.
+func (sv *stackValState) addUseBlock(b *Block, liveout bool) {
+	entry := stackUseBlock{
+		b:       b,
+		liveout: liveout,
+	}
+	if sv.useBlocks == nil || sv.useBlocks[len(sv.useBlocks)-1] != entry {
+		sv.useBlocks = append(sv.useBlocks, entry)
+	}
+}
+
+type stackUseBlock struct {
+	b       *Block
+	liveout bool
 }
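To make the "loose set" property described in the addUseBlock comment concrete, here is a small self-contained model (stand-in Block and value-state types, hypothetical example values): adjacent duplicate appends are suppressed, while non-adjacent duplicates survive and are deduplicated later by the seen set in the liveness walk.

package main

import "fmt"

// Simplified stand-ins for the SSA types, for illustration only.
type Block struct{ ID int }

type stackUseBlock struct {
	b       *Block
	liveout bool
}

type stackValState struct{ useBlocks []stackUseBlock }

// addUseBlock mirrors the loose set semantics: only the most recently
// appended entry is checked, so duplicates can occur when uses of a
// value alternate between blocks.
func (sv *stackValState) addUseBlock(b *Block, liveout bool) {
	entry := stackUseBlock{b, liveout}
	if sv.useBlocks == nil || sv.useBlocks[len(sv.useBlocks)-1] != entry {
		sv.useBlocks = append(sv.useBlocks, entry)
	}
}

func main() {
	b1, b2 := &Block{1}, &Block{2}
	var sv stackValState
	sv.addUseBlock(b1, false) // appended
	sv.addUseBlock(b1, false) // adjacent duplicate: skipped
	sv.addUseBlock(b2, false) // appended
	sv.addUseBlock(b1, false) // non-adjacent duplicate: appended again
	for _, u := range sv.useBlocks {
		fmt.Println(u.b.ID, u.liveout)
	}
	// Prints: 1 false, 2 false, 1 false
}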
// stackalloc allocates storage in the stack frame for // stackalloc allocates storage in the stack frame for
@@ -99,6 +124,7 @@ func (s *stackAllocState) init(f *Func, spillLive [][]ID) {
 		s.values[v.ID].typ = v.Type
 		s.values[v.ID].needSlot = !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && f.getHome(v.ID) == nil && !v.rematerializeable() && !v.OnWasmStack
 		s.values[v.ID].isArg = hasAnyArgOp(v)
+		s.values[v.ID].defBlock = b.ID
 		if f.pass.debug > stackDebug && s.values[v.ID].needSlot {
 			fmt.Printf("%s needs a stack slot\n", v)
 		}
@@ -291,80 +317,89 @@ func (s *stackAllocState) stackalloc() {
 
 // computeLive computes a map from block ID to a list of
 // stack-slot-needing value IDs live at the end of that block.
-// TODO: this could be quadratic if lots of variables are live across lots of
-// basic blocks. Figure out a way to make this function (or, more precisely, the user
-// of this function) require only linear size & time.
 func (s *stackAllocState) computeLive(spillLive [][]ID) {
-	s.live = make([][]ID, s.f.NumBlocks())
-	var phis []*Value
-	live := s.f.newSparseSet(s.f.NumValues())
-	defer s.f.retSparseSet(live)
-	t := s.f.newSparseSet(s.f.NumValues())
-	defer s.f.retSparseSet(t)
-
-	// Instead of iterating over f.Blocks, iterate over their postordering.
-	// Liveness information flows backward, so starting at the end
-	// increases the probability that we will stabilize quickly.
-	po := s.f.postorder()
-	for {
-		changed := false
-		for _, b := range po {
-			// Start with known live values at the end of the block
-			live.clear()
-			live.addAll(s.live[b.ID])
-
-			// Propagate backwards to the start of the block
-			phis = phis[:0]
-			for i := len(b.Values) - 1; i >= 0; i-- {
-				v := b.Values[i]
-				live.remove(v.ID)
-				if v.Op == OpPhi {
-					// Save phi for later.
-					// Note: its args might need a stack slot even though
-					// the phi itself doesn't. So don't use needSlot.
-					if !v.Type.IsMemory() && !v.Type.IsVoid() {
-						phis = append(phis, v)
-					}
-					continue
-				}
-				for _, a := range v.Args {
-					if s.values[a.ID].needSlot {
-						live.add(a.ID)
-					}
-				}
-			}
-
-			// for each predecessor of b, expand its list of live-at-end values
-			// invariant: s contains the values live at the start of b (excluding phi inputs)
-			for i, e := range b.Preds {
-				p := e.b
-				t.clear()
-				t.addAll(s.live[p.ID])
-				t.addAll(live.contents())
-				t.addAll(spillLive[p.ID])
-				for _, v := range phis {
-					a := v.Args[i]
-					if s.values[a.ID].needSlot {
-						t.add(a.ID)
-					}
-					if spill := s.values[a.ID].spill; spill != nil {
-						//TODO: remove? Subsumed by SpillUse?
-						t.add(spill.ID)
-					}
-				}
-				if t.size() == len(s.live[p.ID]) {
-					continue
-				}
-				// grow p's live set
-				s.live[p.ID] = append(s.live[p.ID][:0], t.contents()...)
-				changed = true
-			}
-		}
-		if !changed {
-			break
-		}
-	}
+	// Because values using stack slots are few and far between
+	// (compared to the set of all values), we use a path exploration
+	// algorithm to calculate liveness here.
+	f := s.f
+	for _, b := range f.Blocks {
+		for _, spillvid := range spillLive[b.ID] {
+			val := &s.values[spillvid]
+			val.addUseBlock(b, true)
+		}
+		for _, v := range b.Values {
+			for i, a := range v.Args {
+				val := &s.values[a.ID]
+				useBlock := b
+				forceLiveout := false
+				if v.Op == OpPhi {
+					// A phi's argument is used at the end of the
+					// corresponding predecessor, so it is live-out there.
+					useBlock = b.Preds[i].b
+					forceLiveout = true
+					if spill := val.spill; spill != nil {
+						//TODO: remove? Subsumed by SpillUse?
+						s.values[spill.ID].addUseBlock(useBlock, true)
+					}
+				}
+				if !val.needSlot {
+					continue
+				}
+				val.addUseBlock(useBlock, forceLiveout)
+			}
+		}
+	}
+
+	s.live = make([][]ID, f.NumBlocks())
+	push := func(bid, vid ID) {
+		l := s.live[bid]
+		if l == nil || l[len(l)-1] != vid {
+			l = append(l, vid)
+			s.live[bid] = l
+		}
+	}
+	// TODO: If we can help along the interference graph by calculating live-in sets,
+	// we can do so trivially by turning this sparse set into an array of arrays
+	// and checking the top for the current value instead of inclusion in the sparse set.
+	seen := f.newSparseSet(f.NumBlocks())
+	defer f.retSparseSet(seen)
+	// Instead of pruning out duplicate blocks when we build the useBlocks slices
+	// or when we add them to the queue, rely on the seen set to stop considering
+	// them. This is slightly faster than building the work queues as sets.
+	//
+	// However, this means that the queue can grow larger than the number of blocks,
+	// usually in very short functions. Returning a slice with values appended beyond the
+	// original allocation can corrupt the allocator state, so cap the queue and return
+	// the originally allocated slice regardless.
+	allocedBqueue := f.Cache.allocBlockSlice(f.NumBlocks())
+	defer f.Cache.freeBlockSlice(allocedBqueue)
+	bqueue := allocedBqueue[:0:f.NumBlocks()]
+	for vid, v := range s.values {
+		if !v.needSlot {
+			continue
+		}
+		// Walk the CFG backwards from each use block, marking the value
+		// live-out of every predecessor until we reach its defining block
+		// or a block we have already visited.
+		seen.clear()
+		bqueue = bqueue[:0]
+		for _, b := range v.useBlocks {
+			if b.liveout {
+				push(b.b.ID, ID(vid))
+			}
+			bqueue = append(bqueue, b.b)
+		}
+		for len(bqueue) > 0 {
+			work := bqueue[len(bqueue)-1]
+			bqueue = bqueue[:len(bqueue)-1]
+			if seen.contains(work.ID) || work.ID == v.defBlock {
+				continue
+			}
+			seen.add(work.ID)
+			for _, e := range work.Preds {
+				push(e.b.ID, ID(vid))
+				bqueue = append(bqueue, e.b)
+			}
+		}
+	}
 	if s.f.pass.debug > stackDebug {
 		for _, b := range s.f.Blocks {
 			fmt.Printf("stacklive %s %v\n", b, s.live[b.ID])
 		}
 	}
 }
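The queue-capping comment in the new code relies on Go's full slice expressions: slicing with an explicit capacity means any append past that capacity moves to a fresh backing array, leaving the original cached allocation untouched. A tiny standalone demonstration of that behavior (illustrative values, not the compiler's allocator):

package main

import "fmt"

func main() {
	// A full slice expression s[:0:cap] limits capacity, so appending
	// past it forces a new backing array instead of writing beyond the
	// original allocation.
	backing := make([]int, 4)
	q := backing[:0:4]
	for i := 0; i < 6; i++ {
		q = append(q, i)
	}
	// The original allocation only ever saw the first 4 appends; the
	// rest went to a fresh array, so backing can be returned to a
	// cache safely.
	fmt.Println(backing) // [0 1 2 3]
	fmt.Println(q)       // [0 1 2 3 4 5]
}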