cmd/compile: use prove pass to detect Ctz of non-zero values

On amd64, Ctz must include special handling of zeros.
But the prove pass has enough information to detect whether the input
is non-zero, allowing a more efficient lowering.

Introduce new CtzNonZero ops to capture and use this information.

Benchmark code:

func BenchmarkVisitBits(b *testing.B) {
	b.Run("8", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			x := uint8(0xff)
			for x != 0 {
				sink = bits.TrailingZeros8(x)
				x &= x - 1
			}
		}
	})

	// and similarly so for 16, 32, 64
}

name            old time/op  new time/op  delta
VisitBits/8-8   7.27ns ± 4%  5.58ns ± 4%  -23.35%  (p=0.000 n=28+26)
VisitBits/16-8  14.7ns ± 7%  10.5ns ± 4%  -28.43%  (p=0.000 n=30+28)
VisitBits/32-8  27.6ns ± 8%  19.3ns ± 3%  -30.14%  (p=0.000 n=30+26)
VisitBits/64-8  44.0ns ±11%  38.0ns ± 5%  -13.48%  (p=0.000 n=30+30)

Fixes #25077

Change-Id: Ie6e5bd86baf39ee8a4ca7cadcf56d934e047f957
Reviewed-on: https://go-review.googlesource.com/109358
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2025-12-08 06:10:04 +00:00 · 2018-04-25 11:52:06 -07:00 · 2018-04-25 11:52:06 -07:00 · d9a50a6531
commit d9a50a6531
parent adbb6ec903
19 changed files with 347 additions and 32 deletions
--- a/src/cmd/compile/internal/ssa/prove.go
+++ b/src/cmd/compile/internal/ssa/prove.go
@@ -365,7 +365,7 @@ var opMax = map[Op]int64{
 	OpAdd32: math.MaxInt32, OpSub32: math.MaxInt32,
 }

-// isNonNegative returns true if v is known to be non-negative.
+// isNonNegative reports whether v is known to be non-negative.
 func (ft *factsTable) isNonNegative(v *Value) bool {
 	if isNonNegative(v) {
 		return true
@@ -734,34 +734,48 @@ func addRestrictions(parent *Block, ft *factsTable, t domain, v, w *Value, r rel
 	}
 }

+var ctzNonZeroOp = map[Op]Op{OpCtz8: OpCtz8NonZero, OpCtz16: OpCtz16NonZero, OpCtz32: OpCtz32NonZero, OpCtz64: OpCtz64NonZero} // maps each CtzNN op to its CtzNNNonZero counterpart; simplifyBlock uses it to rewrite v.Op once the argument's limits rule out zero
+
 // simplifyBlock simplifies some constant values in b and evaluates
 // branches to non-uniquely dominated successors of b.
 func simplifyBlock(sdom SparseTree, ft *factsTable, b *Block) {
-	// Replace OpSlicemask operations in b with constants where possible.
 	for _, v := range b.Values {
-		if v.Op != OpSlicemask {
-			continue
-		}
-		x, delta := isConstDelta(v.Args[0])
-		if x == nil {
-			continue
-		}
-		// slicemask(x + y)
-		// if x is larger than -y (y is negative), then slicemask is -1.
-		lim, ok := ft.limits[x.ID]
-		if !ok {
-			continue
-		}
-		if lim.umin > uint64(-delta) {
-			if v.Args[0].Op == OpAdd64 {
-				v.reset(OpConst64)
-			} else {
-				v.reset(OpConst32)
+		switch v.Op {
+		case OpSlicemask:
+			// Replace OpSlicemask operations in b with constants where possible.
+			x, delta := isConstDelta(v.Args[0])
+			if x == nil {
+				continue
 			}
-			if b.Func.pass.debug > 0 {
-				b.Func.Warnl(v.Pos, "Proved slicemask not needed")
+			// slicemask(x + y)
+			// if x is larger than -y (y is negative), then slicemask is -1.
+			lim, ok := ft.limits[x.ID]
+			if !ok {
+				continue
+			}
+			if lim.umin > uint64(-delta) {
+				if v.Args[0].Op == OpAdd64 {
+					v.reset(OpConst64)
+				} else {
+					v.reset(OpConst32)
+				}
+				if b.Func.pass.debug > 0 {
+					b.Func.Warnl(v.Pos, "Proved slicemask not needed")
+				}
+				v.AuxInt = -1
+			}
+		case OpCtz8, OpCtz16, OpCtz32, OpCtz64:
+			// On some architectures, notably amd64, we can generate much better
+			// code for CtzNN if we know that the argument is non-zero.
+			// Capture that information here for use in arch-specific optimizations.
+			x := v.Args[0]
+			lim, ok := ft.limits[x.ID]
+			if !ok {
+				continue
+			}
+			if lim.umin > 0 || lim.min > 0 || lim.max < 0 {
+				v.Op = ctzNonZeroOp[v.Op]
 			}
-			v.AuxInt = -1
 		}
 	}

@@ -818,7 +832,7 @@ func removeBranch(b *Block, branch branch) {
 	}
 }

-// isNonNegative returns true is v is known to be greater or equal to zero.
+// isNonNegative reports whether v is known to be greater than or equal to zero.
 func isNonNegative(v *Value) bool {
 	switch v.Op {
 	case OpConst64: