cmd/compile: faster liveness analysis in regalloc

Instead of iterating until dataflow stabilization, fill in values known to be live at the beginning of loop headers. This reduces the number of passes over the CFG from the depth of the loopnest to just 2. In a test instrumented version of this change, run against cmd/compile/internal/ssa, it brought the time spent in liveness analysis down to 150.52ms from 225.49ms on my machine. Change-Id: Ic72762eedfd1f10b1ba74c430ed62ab4ebd3ec5c Reviewed-on: https://go-review.googlesource.com/c/go/+/695255 Reviewed-by: Keith Randall <khr@google.com> Auto-Submit: Keith Randall <khr@golang.org> Reviewed-by: Michael Pratt <mpratt@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Keith Randall <khr@golang.org>
2025-12-08 06:10:04 +00:00 · 2025-08-09 20:30:30 +01:00 · 2025-08-09 20:30:30 +01:00 · 16705b962e
commit 16705b962e
parent a5fe6791d7
1 changed files with 386 additions and 162 deletions
--- a/src/cmd/compile/internal/ssa/regalloc.go
+++ b/src/cmd/compile/internal/ssa/regalloc.go
@ -119,10 +119,12 @@ import (
 	"cmd/compile/internal/types"
 	"cmd/internal/src"
 	"cmd/internal/sys"
+	"cmp"
 	"fmt"
 	"internal/buildcfg"
 	"math"
 	"math/bits"
+	"slices"
 	"unsafe"
 )

@ -1013,10 +1015,12 @@ func (s *regAllocState) regalloc(f *Func) {
 		// Initialize regValLiveSet and uses fields for this block.
 		// Walk backwards through the block doing liveness analysis.
 		regValLiveSet.clear()
+		if s.live != nil {
 			for _, e := range s.live[b.ID] {
 				s.addUse(e.ID, int32(len(b.Values))+e.dist, e.pos) // pseudo-uses from beyond end of block
 				regValLiveSet.add(e.ID)
 			}
+		}
 		for _, v := range b.ControlValues() {
 			if s.values[v.ID].needReg {
 				s.addUse(v.ID, int32(len(b.Values)), b.Pos) // pseudo-use by control values
@ -1335,7 +1339,9 @@ func (s *regAllocState) regalloc(f *Func) {
 		}

 		// Load static desired register info at the end of the block.
+		if s.desired != nil {
 			desired.copy(&s.desired[b.ID])
+		}

 		// Check actual assigned registers at the start of the next block(s).
 		// Dynamically assigned registers will trump the static
@ -1377,7 +1383,7 @@ func (s *regAllocState) regalloc(f *Func) {
 			}
 		}
 		// Walk values backwards computing desired register info.
-		// See computeLive for more comments.
+		// See computeDesired for more comments.
 		for i := len(oldSched) - 1; i >= 0; i-- {
 			v := oldSched[i]
 			prefs := desired.remove(v.ID)
@ -2044,9 +2050,11 @@ func (s *regAllocState) regalloc(f *Func) {

 		if checkEnabled {
 			regValLiveSet.clear()
+			if s.live != nil {
 				for _, x := range s.live[b.ID] {
 					regValLiveSet.add(x.ID)
 				}
+			}
 			for r := register(0); r < s.numRegs; r++ {
 				v := s.regs[r].v
 				if v == nil {
@ -2062,6 +2070,7 @@ func (s *regAllocState) regalloc(f *Func) {
 		// isn't in a register, generate a use for the spill location.
 		// We need to remember this information so that
 		// the liveness analysis in stackalloc is correct.
+		if s.live != nil {
 			for _, e := range s.live[b.ID] {
 				vi := &s.values[e.ID]
 				if vi.regs != 0 {
@ -2094,6 +2103,7 @@ func (s *regAllocState) regalloc(f *Func) {
 				u.next = s.freeUseRecords
 				s.freeUseRecords = u
 			}
+		}

 		// allocReg may have dropped registers from startRegsMask that
 		// aren't actually needed in startRegs. Synchronize back to
@ -2192,8 +2202,8 @@ func (s *regAllocState) placeSpills() {
 		best := v.Block
 		bestArg := v
 		var bestDepth int16
-		if l := s.loopnest.b2l[best.ID]; l != nil {
-			bestDepth = l.depth
+		if s.loopnest != nil && s.loopnest.b2l[best.ID] != nil {
+			bestDepth = s.loopnest.b2l[best.ID].depth
 		}
 		b := best
 		const maxSpillSearch = 100
@ -2215,8 +2225,8 @@ func (s *regAllocState) placeSpills() {
 			}

 			var depth int16
-			if l := s.loopnest.b2l[b.ID]; l != nil {
-				depth = l.depth
+			if s.loopnest != nil && s.loopnest.b2l[b.ID] != nil {
+				depth = s.loopnest.b2l[b.ID].depth
 			}
 			if depth > bestDepth {
 				// Don't push the spill into a deeper loop.
@ -2796,16 +2806,18 @@ type liveInfo struct {
 // computeLive computes a map from block ID to a list of value IDs live at the end
 // of that block. Together with the value ID is a count of how many instructions
 // to the next use of that value. The resulting map is stored in s.live.
-// computeLive also computes the desired register information at the end of each block.
-// This desired register information is stored in s.desired.
-// TODO: this could be quadratic if lots of variables are live across lots of
-// basic blocks. Figure out a way to make this function (or, more precisely, the user
-// of this function) require only linear size & time.
 func (s *regAllocState) computeLive() {
 	f := s.f
+	// single block functions do not have variables that are live across
+	// branches
+	if len(f.Blocks) == 1 {
+		return
+	}
+	po := f.postorder()
 	s.live = make([][]liveInfo, f.NumBlocks())
 	s.desired = make([]desiredState, f.NumBlocks())
-	var phis []*Value
+	s.loopnest = f.loopnest()
+
 	rematIDs := make([]ID, 0, 64)

 	live := f.newSparseMapPos(f.NumValues())
@ -2813,31 +2825,63 @@ func (s *regAllocState) computeLive() {
 	t := f.newSparseMapPos(f.NumValues())
 	defer f.retSparseMapPos(t)

-	// Keep track of which value we want in each register.
-	var desired desiredState
-
-	// Instead of iterating over f.Blocks, iterate over their postordering.
-	// Liveness information flows backward, so starting at the end
-	// increases the probability that we will stabilize quickly.
-	// TODO: Do a better job yet. Here's one possibility:
-	// Calculate the dominator tree and locate all strongly connected components.
-	// If a value is live in one block of an SCC, it is live in all.
-	// Walk the dominator tree from end to beginning, just once, treating SCC
-	// components as single blocks, duplicated calculated liveness information
-	// out to all of them.
-	po := f.postorder()
-	s.loopnest = f.loopnest()
 	s.loopnest.computeUnavoidableCalls()
+
+	// Liveness analysis.
+	// This is an adapted version of the algorithm described in chapter 2.4.2
+	// of Fabrice Rastello's On Sparse Intermediate Representations.
+	//   https://web.archive.org/web/20240417212122if_/https://inria.hal.science/hal-00761555/file/habilitation.pdf#section.50
+	//
+	// For our implementation, we fall back to a traditional iterative algorithm when we encounter
+	// Irreducible CFGs. They are very uncommon in Go code because they need to be constructed with
+	// gotos and our current loopnest definition does not compute all the information that
+	// we'd need to compute the loop ancestors for that step of the algorithm.
+	//
+	// Additionally, instead of only considering non-loop successors in the initial DFS phase,
+	// we compute the liveout as the union of all successors. This larger liveout set is a subset
+	// of the final liveout for the block and adding this information in the DFS phase means that
+	// we get slightly more accurate distance information.
+	var loopLiveIn map[*loop][]liveInfo
+	var numCalls []int32
+	if len(s.loopnest.loops) > 0 && !s.loopnest.hasIrreducible {
+		loopLiveIn = make(map[*loop][]liveInfo)
+		numCalls = f.Cache.allocInt32Slice(f.NumBlocks())
+		defer f.Cache.freeInt32Slice(numCalls)
+	}
+
 	for {
 		changed := false

 		for _, b := range po {
 			// Start with known live values at the end of the block.
-			// Add len(b.Values) to adjust from end-of-block distance
-			// to beginning-of-block distance.
 			live.clear()
 			for _, e := range s.live[b.ID] {
-				live.set(e.ID, e.dist+int32(len(b.Values)), e.pos)
+				live.set(e.ID, e.dist, e.pos)
+			}
+			update := false
+			// arguments to phi nodes are live at this blocks out
+			for _, e := range b.Succs {
+				succ := e.b
+				delta := branchDistance(b, succ)
+				for _, v := range succ.Values {
+					if v.Op != OpPhi {
+						break
+					}
+					arg := v.Args[e.i]
+					if s.values[arg.ID].needReg && (!live.contains(arg.ID) || delta < live.get(arg.ID)) {
+						live.set(arg.ID, delta, v.Pos)
+						update = true
+					}
+				}
+			}
+			if update {
+				s.live[b.ID] = updateLive(live, s.live[b.ID])
+			}
+			// Add len(b.Values) to adjust from end-of-block distance
+			// to beginning-of-block distance.
+			c := live.contents()
+			for i := range c {
+				c[i].val += int32(len(b.Values))
 			}

 			// Mark control values as live
@ -2847,18 +2891,16 @@ func (s *regAllocState) computeLive() {
 				}
 			}

-			// Propagate backwards to the start of the block
-			// Assumes Values have been scheduled.
-			phis = phis[:0]
 			for i := len(b.Values) - 1; i >= 0; i-- {
 				v := b.Values[i]
 				live.remove(v.ID)
 				if v.Op == OpPhi {
-					// save phi ops for later
-					phis = append(phis, v)
 					continue
 				}
 				if opcodeTable[v.Op].call {
+					if numCalls != nil {
+						numCalls[b.ID]++
+					}
 					rematIDs = rematIDs[:0]
 					c := live.contents()
 					for i := range c {
@ -2881,7 +2923,207 @@ func (s *regAllocState) computeLive() {
 					}
 				}
 			}
-			// Propagate desired registers backwards.
+			// This is a loop header, save our live-in so that
+			// we can use it to fill in the loop bodies later
+			if loopLiveIn != nil {
+				loop := s.loopnest.b2l[b.ID]
+				if loop != nil && loop.header.ID == b.ID {
+					loopLiveIn[loop] = updateLive(live, nil)
+				}
+			}
+			// For each predecessor of b, expand its list of live-at-end values.
+			// invariant: live contains the values live at the start of b
+			for _, e := range b.Preds {
+				p := e.b
+				delta := branchDistance(p, b)
+
+				// Start t off with the previously known live values at the end of p.
+				t.clear()
+				for _, e := range s.live[p.ID] {
+					t.set(e.ID, e.dist, e.pos)
+				}
+				update := false
+
+				// Add new live values from scanning this block.
+				for _, e := range live.contents() {
+					d := e.val + delta
+					if !t.contains(e.key) || d < t.get(e.key) {
+						update = true
+						t.set(e.key, d, e.pos)
+					}
+				}
+
+				if !update {
+					continue
+				}
+				s.live[p.ID] = updateLive(t, s.live[p.ID])
+				changed = true
+			}
+		}
+
+		// Doing a traditional iterative algorithm and have run
+		// out of changes
+		if !changed {
+			break
+		}
+
+		// Doing a pre-pass and will fill in the liveness information
+		// later
+		if loopLiveIn != nil {
+			break
+		}
+		// For loopless code, we have full liveness info after a single
+		// iteration
+		if len(s.loopnest.loops) == 0 {
+			break
+		}
+	}
+	if f.pass.debug > regDebug {
+		s.debugPrintLive("after dfs walk", f, s.live, s.desired)
+	}
+
+	// irreducible CFGs and functions without loops are already
+	// done, compute their desired registers and return
+	if loopLiveIn == nil {
+		s.computeDesired()
+		return
+	}
+
+	// Walk the loopnest from outer to inner, adding
+	// all live-in values from their parent. Instead of
+	// a recursive algorithm, iterate in depth order.
+	// TODO(dmo): can we permute the loopnest? can we avoid this copy?
+	loops := slices.Clone(s.loopnest.loops)
+	slices.SortFunc(loops, func(a, b *loop) int {
+		return cmp.Compare(a.depth, b.depth)
+	})
+
+	loopset := f.newSparseMapPos(f.NumValues())
+	defer f.retSparseMapPos(loopset)
+	for _, loop := range loops {
+		if loop.outer == nil {
+			continue
+		}
+		livein := loopLiveIn[loop]
+		loopset.clear()
+		for _, l := range livein {
+			loopset.set(l.ID, l.dist, l.pos)
+		}
+		update := false
+		for _, l := range loopLiveIn[loop.outer] {
+			if !loopset.contains(l.ID) {
+				loopset.set(l.ID, l.dist, l.pos)
+				update = true
+			}
+		}
+		if update {
+			loopLiveIn[loop] = updateLive(loopset, livein)
+		}
+	}
+	// unknownDistance is a sentinel value for when we know a variable
+	// is live at any given block, but we do not yet know how far until it's next
+	// use. The distance will be computed later.
+	const unknownDistance = -1
+
+	// add live-in values of the loop headers to their children.
+	// This includes the loop headers themselves, since they can have values
+	// that die in the middle of the block and aren't live-out
+	for _, b := range po {
+		loop := s.loopnest.b2l[b.ID]
+		if loop == nil {
+			continue
+		}
+		headerLive := loopLiveIn[loop]
+		loopset.clear()
+		for _, l := range s.live[b.ID] {
+			loopset.set(l.ID, l.dist, l.pos)
+		}
+		update := false
+		for _, l := range headerLive {
+			if !loopset.contains(l.ID) {
+				loopset.set(l.ID, unknownDistance, src.NoXPos)
+				update = true
+			}
+		}
+		if update {
+			s.live[b.ID] = updateLive(loopset, s.live[b.ID])
+		}
+	}
+	if f.pass.debug > regDebug {
+		s.debugPrintLive("after live loop prop", f, s.live, s.desired)
+	}
+	// Filling in liveness from loops leaves some blocks with no distance information
+	// Run over them and fill in the information from their successors.
+	// To stabilize faster, we quit when no block has missing values and we only
+	// look at blocks that still have missing values in subsequent iterations
+	unfinishedBlocks := f.Cache.allocBlockSlice(len(po))
+	defer f.Cache.freeBlockSlice(unfinishedBlocks)
+	copy(unfinishedBlocks, po)
+
+	for len(unfinishedBlocks) > 0 {
+		n := 0
+		for _, b := range unfinishedBlocks {
+			live.clear()
+			unfinishedValues := 0
+			for _, l := range s.live[b.ID] {
+				if l.dist == unknownDistance {
+					unfinishedValues++
+				}
+				live.set(l.ID, l.dist, l.pos)
+			}
+			update := false
+			for _, e := range b.Succs {
+				succ := e.b
+				for _, l := range s.live[succ.ID] {
+					if !live.contains(l.ID) || l.dist == unknownDistance {
+						continue
+					}
+					dist := int32(len(succ.Values)) + l.dist + branchDistance(b, succ)
+					dist += numCalls[succ.ID] * unlikelyDistance
+					val := live.get(l.ID)
+					switch {
+					case val == unknownDistance:
+						unfinishedValues--
+						fallthrough
+					case dist < val:
+						update = true
+						live.set(l.ID, dist, l.pos)
+					}
+				}
+			}
+			if update {
+				s.live[b.ID] = updateLive(live, s.live[b.ID])
+			}
+			if unfinishedValues > 0 {
+				unfinishedBlocks[n] = b
+				n++
+			}
+		}
+		unfinishedBlocks = unfinishedBlocks[:n]
+	}
+
+	s.computeDesired()
+
+	if f.pass.debug > regDebug {
+		s.debugPrintLive("final", f, s.live, s.desired)
+	}
+}
+
+// computeDesired computes the desired register information at the end of each block.
+// It is essentially a liveness analysis on machine registers instead of SSA values
+// The desired register information is stored in s.desired.
+func (s *regAllocState) computeDesired() {
+
+	// TODO: Can we speed this up using the liveness information we have already
+	// from computeLive?
+	// TODO: Since we don't propagate information through phi nodes, can we do
+	// this as a single dominator tree walk instead of the iterative solution?
+	var desired desiredState
+	f := s.f
+	po := f.postorder()
+	for {
+		changed := false
+		for _, b := range po {
 			desired.copy(&s.desired[b.ID])
 			for i := len(b.Values) - 1; i >= 0; i-- {
 				v := b.Values[i]
@ -2916,82 +3158,63 @@ func (s *regAllocState) computeLive() {
 					desired.addList(v.Args[0].ID, prefs)
 				}
 			}
-
-			// For each predecessor of b, expand its list of live-at-end values.
-			// invariant: live contains the values live at the start of b (excluding phi inputs)
-			for i, e := range b.Preds {
+			for _, e := range b.Preds {
 				p := e.b
-				// Compute additional distance for the edge.
-				// Note: delta must be at least 1 to distinguish the control
-				// value use from the first user in a successor block.
-				delta := int32(normalDistance)
-				if len(p.Succs) == 2 {
-					if p.Succs[0].b == b && p.Likely == BranchLikely ||
-						p.Succs[1].b == b && p.Likely == BranchUnlikely {
-						delta = likelyDistance
-					}
-					if p.Succs[0].b == b && p.Likely == BranchUnlikely ||
-						p.Succs[1].b == b && p.Likely == BranchLikely {
-						delta = unlikelyDistance
+				changed = s.desired[p.ID].merge(&desired) || changed
 			}
 		}
-
-				// Update any desired registers at the end of p.
-				s.desired[p.ID].merge(&desired)
-
-				// Start t off with the previously known live values at the end of p.
-				t.clear()
-				for _, e := range s.live[p.ID] {
-					t.set(e.ID, e.dist, e.pos)
-				}
-				update := false
-
-				// Add new live values from scanning this block.
-				for _, e := range live.contents() {
-					d := e.val + delta
-					if !t.contains(e.key) || d < t.get(e.key) {
-						update = true
-						t.set(e.key, d, e.pos)
-					}
-				}
-				// Also add the correct arg from the saved phi values.
-				// All phis are at distance delta (we consider them
-				// simultaneously happening at the start of the block).
-				for _, v := range phis {
-					id := v.Args[i].ID
-					if s.values[id].needReg && (!t.contains(id) || delta < t.get(id)) {
-						update = true
-						t.set(id, delta, v.Pos)
-					}
-				}
-
-				if !update {
-					continue
-				}
-				// The live set has changed, update it.
-				l := s.live[p.ID][:0]
-				if cap(l) < t.size() {
-					l = make([]liveInfo, 0, t.size())
-				}
-				for _, e := range t.contents() {
-					l = append(l, liveInfo{e.key, e.val, e.pos})
-				}
-				s.live[p.ID] = l
-				changed = true
-			}
-		}
-
-		if !changed {
+		if !changed || (!s.loopnest.hasIrreducible && len(s.loopnest.loops) == 0) {
 			break
 		}
 	}
-	if f.pass.debug > regDebug {
-		fmt.Println("live values at end of each block")
+}
+
+// updateLive updates a given liveInfo slice with the contents of t
+func updateLive(t *sparseMapPos, live []liveInfo) []liveInfo {
+	live = live[:0]
+	if cap(live) < t.size() {
+		live = make([]liveInfo, 0, t.size())
+	}
+	for _, e := range t.contents() {
+		live = append(live, liveInfo{e.key, e.val, e.pos})
+	}
+	return live
+}
+
+// branchDistance calculates the distance between a block and a
+// successor in pseudo-instructions. This is used to indicate
+// likeliness
+func branchDistance(b *Block, s *Block) int32 {
+	if len(b.Succs) == 2 {
+		if b.Succs[0].b == s && b.Likely == BranchLikely ||
+			b.Succs[1].b == s && b.Likely == BranchUnlikely {
+			return likelyDistance
+		}
+		if b.Succs[0].b == s && b.Likely == BranchUnlikely ||
+			b.Succs[1].b == s && b.Likely == BranchLikely {
+			return unlikelyDistance
+		}
+	}
+	// Note: the branch distance must be at least 1 to distinguish the control
+	// value use from the first user in a successor block.
+	return normalDistance
+}
+
+func (s *regAllocState) debugPrintLive(stage string, f *Func, live [][]liveInfo, desired []desiredState) {
+	fmt.Printf("%s: live values at end of each block: %s\n", stage, f.Name)
 	for _, b := range f.Blocks {
+		s.debugPrintLiveBlock(b, live[b.ID], &desired[b.ID])
+	}
+}
+
+func (s *regAllocState) debugPrintLiveBlock(b *Block, live []liveInfo, desired *desiredState) {
 	fmt.Printf("  %s:", b)
-			for _, x := range s.live[b.ID] {
+	slices.SortFunc(live, func(a, b liveInfo) int {
+		return cmp.Compare(a.ID, b.ID)
+	})
+	for _, x := range live {
 		fmt.Printf(" v%d(%d)", x.ID, x.dist)
-				for _, e := range s.desired[b.ID].entries {
+		for _, e := range desired.entries {
 			if e.ID != x.ID {
 				continue
 			}
@ -3010,13 +3233,11 @@ func (s *regAllocState) computeLive() {
 			fmt.Printf("]")
 		}
 	}
-			if avoid := s.desired[b.ID].avoid; avoid != 0 {
+	if avoid := desired.avoid; avoid != 0 {
 		fmt.Printf(" avoid=%v", s.RegMaskString(avoid))
 	}
 	fmt.Println()
 }
-	}
-}

 // A desiredState represents desired register assignments.
 type desiredState struct {
@ -3131,14 +3352,17 @@ func (d *desiredState) remove(vid ID) [4]register {
 	return [4]register{noRegister, noRegister, noRegister, noRegister}
 }

-// merge merges another desired state x into d.
-func (d *desiredState) merge(x *desiredState) {
+// merge merges another desired state x into d. Returns whether the set has
+// changed
+func (d *desiredState) merge(x *desiredState) bool {
+	oldAvoid := d.avoid
 	d.avoid |= x.avoid
 	// There should only be a few desired registers, so
 	// linear insert is ok.
 	for _, e := range x.entries {
 		d.addList(e.ID, e.regs)
 	}
+	return oldAvoid != d.avoid
 }

 // computeUnavoidableCalls computes the containsUnavoidableCall fields in the loop nest.