cmd/compile: use prove pass to detect Ctz of non-zero values

On amd64, Ctz must include special handling of zeros.
But the prove pass has enough information to detect whether the input
is non-zero, allowing a more efficient lowering.

Introduce new CtzNonZero ops to capture and use this information.

Benchmark code:

func BenchmarkVisitBits(b *testing.B) {
	b.Run("8", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			x := uint8(0xff)
			for x != 0 {
				sink = bits.TrailingZeros8(x)
				x &= x - 1
			}
		}
	})

    // and similarly so for 16, 32, 64
}

name            old time/op  new time/op  delta
VisitBits/8-8   7.27ns ± 4%  5.58ns ± 4%  -23.35%  (p=0.000 n=28+26)
VisitBits/16-8  14.7ns ± 7%  10.5ns ± 4%  -28.43%  (p=0.000 n=30+28)
VisitBits/32-8  27.6ns ± 8%  19.3ns ± 3%  -30.14%  (p=0.000 n=30+26)
VisitBits/64-8  44.0ns ±11%  38.0ns ± 5%  -13.48%  (p=0.000 n=30+30)

Fixes #25077

Change-Id: Ie6e5bd86baf39ee8a4ca7cadcf56d934e047f957
Reviewed-on: https://go-review.googlesource.com/109358
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
Josh Bleecher Snyder 2018-04-25 11:52:06 -07:00
parent adbb6ec903
commit d9a50a6531
19 changed files with 347 additions and 32 deletions

View file

@ -365,7 +365,7 @@ var opMax = map[Op]int64{
OpAdd32: math.MaxInt32, OpSub32: math.MaxInt32,
}
// isNonNegative returns true if v is known to be non-negative.
// isNonNegative reports whether v is known to be non-negative.
func (ft *factsTable) isNonNegative(v *Value) bool {
if isNonNegative(v) {
return true
@ -734,34 +734,48 @@ func addRestrictions(parent *Block, ft *factsTable, t domain, v, w *Value, r rel
}
}
var ctzNonZeroOp = map[Op]Op{OpCtz8: OpCtz8NonZero, OpCtz16: OpCtz16NonZero, OpCtz32: OpCtz32NonZero, OpCtz64: OpCtz64NonZero}
// simplifyBlock simplifies some constant values in b and evaluates
// branches to non-uniquely dominated successors of b.
func simplifyBlock(sdom SparseTree, ft *factsTable, b *Block) {
// Replace OpSlicemask operations in b with constants where possible.
for _, v := range b.Values {
if v.Op != OpSlicemask {
continue
}
x, delta := isConstDelta(v.Args[0])
if x == nil {
continue
}
// slicemask(x + y)
// if x is larger than -y (y is negative), then slicemask is -1.
lim, ok := ft.limits[x.ID]
if !ok {
continue
}
if lim.umin > uint64(-delta) {
if v.Args[0].Op == OpAdd64 {
v.reset(OpConst64)
} else {
v.reset(OpConst32)
switch v.Op {
case OpSlicemask:
// Replace OpSlicemask operations in b with constants where possible.
x, delta := isConstDelta(v.Args[0])
if x == nil {
continue
}
if b.Func.pass.debug > 0 {
b.Func.Warnl(v.Pos, "Proved slicemask not needed")
// slicemask(x + y)
// if x is larger than -y (y is negative), then slicemask is -1.
lim, ok := ft.limits[x.ID]
if !ok {
continue
}
if lim.umin > uint64(-delta) {
if v.Args[0].Op == OpAdd64 {
v.reset(OpConst64)
} else {
v.reset(OpConst32)
}
if b.Func.pass.debug > 0 {
b.Func.Warnl(v.Pos, "Proved slicemask not needed")
}
v.AuxInt = -1
}
case OpCtz8, OpCtz16, OpCtz32, OpCtz64:
// On some architectures, notably amd64, we can generate much better
// code for CtzNN if we know that the argument is non-zero.
// Capture that information here for use in arch-specific optimizations.
x := v.Args[0]
lim, ok := ft.limits[x.ID]
if !ok {
continue
}
if lim.umin > 0 || lim.min > 0 || lim.max < 0 {
v.Op = ctzNonZeroOp[v.Op]
}
v.AuxInt = -1
}
}
@ -818,7 +832,7 @@ func removeBranch(b *Block, branch branch) {
}
}
// isNonNegative returns true is v is known to be greater or equal to zero.
// isNonNegative reports whether v is known to be greater or equal to zero.
func isNonNegative(v *Value) bool {
switch v.Op {
case OpConst64: