mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
cmd/compile: optimize Ctz64 on 386
Compared with the version generated by dec64.rules based on Ctz32,
the number of assembly instructions is reduced by half.
SwissMap uses TrailingZeros64 to find the first match in its control
group and may benefit from this CL on 386 architectures.
goos: linux
goarch: 386
cpu: 13th Gen Intel(R) Core(TM) i7-13700H
│ old.txt │ new.txt │
│ sec/op │ sec/op vs base │
TrailingZeros64-20 0.8828n ± 1% 0.6299n ± 1% -28.65% (p=0.000 n=20)
Change-Id: Iba08a3f4e13efd3349715dfb7fcd5fd470286cd3
Reviewed-on: https://go-review.googlesource.com/c/go/+/624376
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Keith Randall <khr@golang.org>
This commit is contained in:
parent
bea9b91f0f
commit
0140aae6d0
7 changed files with 87 additions and 4 deletions
|
|
@ -63,6 +63,7 @@
|
||||||
(Ctz16NonZero ...) => (BSFL ...)
|
(Ctz16NonZero ...) => (BSFL ...)
|
||||||
(Ctz32 ...) => (LoweredCtz32 ...)
|
(Ctz32 ...) => (LoweredCtz32 ...)
|
||||||
(Ctz32NonZero ...) => (BSFL ...)
|
(Ctz32NonZero ...) => (BSFL ...)
|
||||||
|
(Ctz64On32 ...) => (LoweredCtz64 ...)
|
||||||
|
|
||||||
// Lowering extension
|
// Lowering extension
|
||||||
(SignExt8to16 ...) => (MOVBLSX ...)
|
(SignExt8to16 ...) => (MOVBLSX ...)
|
||||||
|
|
|
||||||
|
|
@ -300,9 +300,10 @@ func init() {
|
||||||
|
|
||||||
{name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, // ^arg0
|
{name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, // ^arg0
|
||||||
|
|
||||||
{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
|
{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
|
||||||
{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
|
{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
|
||||||
{name: "LoweredCtz32", argLength: 1, reg: gp11, clobberFlags: true}, // arg0 # of low-order zeroes
|
{name: "LoweredCtz32", argLength: 1, reg: gp11, clobberFlags: true}, // arg0 # of low-order zeroes
|
||||||
|
{name: "LoweredCtz64", argLength: 2, reg: gp21, resultNotInArgs: true, clobberFlags: true}, // arg1<<32+arg0 # of low-order zeroes
|
||||||
|
|
||||||
{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
|
{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
|
||||||
{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
|
{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // arg0 # of high-order zeroes ; undef if zero
|
||||||
|
|
|
||||||
|
|
@ -229,6 +229,7 @@ var genericOps = []opData{
|
||||||
{name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16)
|
{name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16)
|
||||||
{name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
|
{name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
|
||||||
{name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64)
|
{name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64)
|
||||||
|
{name: "Ctz64On32", argLength: 2}, // Count trailing (low order) zeroes (returns 0-64) in arg[1]<<32+arg[0]
|
||||||
{name: "Ctz8NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-7
|
{name: "Ctz8NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-7
|
||||||
{name: "Ctz16NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-15
|
{name: "Ctz16NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-15
|
||||||
{name: "Ctz32NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-31
|
{name: "Ctz32NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-31
|
||||||
|
|
|
||||||
|
|
@ -469,6 +469,7 @@ const (
|
||||||
Op386BSFL
|
Op386BSFL
|
||||||
Op386BSFW
|
Op386BSFW
|
||||||
Op386LoweredCtz32
|
Op386LoweredCtz32
|
||||||
|
Op386LoweredCtz64
|
||||||
Op386BSRL
|
Op386BSRL
|
||||||
Op386BSRW
|
Op386BSRW
|
||||||
Op386BSWAPL
|
Op386BSWAPL
|
||||||
|
|
@ -3093,6 +3094,7 @@ const (
|
||||||
OpCtz16
|
OpCtz16
|
||||||
OpCtz32
|
OpCtz32
|
||||||
OpCtz64
|
OpCtz64
|
||||||
|
OpCtz64On32
|
||||||
OpCtz8NonZero
|
OpCtz8NonZero
|
||||||
OpCtz16NonZero
|
OpCtz16NonZero
|
||||||
OpCtz32NonZero
|
OpCtz32NonZero
|
||||||
|
|
@ -5195,6 +5197,21 @@ var opcodeTable = [...]opInfo{
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "LoweredCtz64",
|
||||||
|
argLen: 2,
|
||||||
|
resultNotInArgs: true,
|
||||||
|
clobberFlags: true,
|
||||||
|
reg: regInfo{
|
||||||
|
inputs: []inputInfo{
|
||||||
|
{0, 239}, // AX CX DX BX BP SI DI
|
||||||
|
{1, 239}, // AX CX DX BX BP SI DI
|
||||||
|
},
|
||||||
|
outputs: []outputInfo{
|
||||||
|
{0, 239}, // AX CX DX BX BP SI DI
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "BSRL",
|
name: "BSRL",
|
||||||
argLen: 1,
|
argLen: 1,
|
||||||
|
|
@ -40458,6 +40475,11 @@ var opcodeTable = [...]opInfo{
|
||||||
argLen: 1,
|
argLen: 1,
|
||||||
generic: true,
|
generic: true,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "Ctz64On32",
|
||||||
|
argLen: 2,
|
||||||
|
generic: true,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "Ctz8NonZero",
|
name: "Ctz8NonZero",
|
||||||
argLen: 1,
|
argLen: 1,
|
||||||
|
|
|
||||||
|
|
@ -323,6 +323,9 @@ func rewriteValue386(v *Value) bool {
|
||||||
case OpCtz32NonZero:
|
case OpCtz32NonZero:
|
||||||
v.Op = Op386BSFL
|
v.Op = Op386BSFL
|
||||||
return true
|
return true
|
||||||
|
case OpCtz64On32:
|
||||||
|
v.Op = Op386LoweredCtz64
|
||||||
|
return true
|
||||||
case OpCtz8:
|
case OpCtz8:
|
||||||
return rewriteValue386_OpCtz8(v)
|
return rewriteValue386_OpCtz8(v)
|
||||||
case OpCtz8NonZero:
|
case OpCtz8NonZero:
|
||||||
|
|
|
||||||
|
|
@ -747,7 +747,14 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
|
||||||
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||||
return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
|
return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
|
||||||
},
|
},
|
||||||
sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
|
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
|
||||||
|
addF("math/bits", "TrailingZeros64",
|
||||||
|
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||||
|
lo := s.newValue1(ssa.OpInt64Lo, types.Types[types.TUINT32], args[0])
|
||||||
|
hi := s.newValue1(ssa.OpInt64Hi, types.Types[types.TUINT32], args[0])
|
||||||
|
return s.newValue2(ssa.OpCtz64On32, types.Types[types.TINT], lo, hi)
|
||||||
|
},
|
||||||
|
sys.I386)
|
||||||
addF("math/bits", "TrailingZeros32",
|
addF("math/bits", "TrailingZeros32",
|
||||||
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||||
return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
|
return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
|
||||||
|
|
|
||||||
|
|
@ -850,6 +850,54 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
||||||
p2.To.Type = obj.TYPE_REG
|
p2.To.Type = obj.TYPE_REG
|
||||||
p2.To.Reg = v.Reg()
|
p2.To.Reg = v.Reg()
|
||||||
|
|
||||||
|
// NOP (so the JNZ has somewhere to land)
|
||||||
|
nop := s.Prog(obj.ANOP)
|
||||||
|
p1.To.SetTarget(nop)
|
||||||
|
case ssa.Op386LoweredCtz64:
|
||||||
|
if v.Args[0].Reg() == v.Reg() {
|
||||||
|
v.Fatalf("input[0] and output in the same register %s", v.LongString())
|
||||||
|
}
|
||||||
|
if v.Args[1].Reg() == v.Reg() {
|
||||||
|
v.Fatalf("input[1] and output in the same register %s", v.LongString())
|
||||||
|
}
|
||||||
|
|
||||||
|
// BSFL arg0, out
|
||||||
|
p := s.Prog(x86.ABSFL)
|
||||||
|
p.From.Type = obj.TYPE_REG
|
||||||
|
p.From.Reg = v.Args[0].Reg()
|
||||||
|
p.To.Type = obj.TYPE_REG
|
||||||
|
p.To.Reg = v.Reg()
|
||||||
|
|
||||||
|
// JNZ 5(PC)
|
||||||
|
p1 := s.Prog(x86.AJNE)
|
||||||
|
p1.To.Type = obj.TYPE_BRANCH
|
||||||
|
|
||||||
|
// BSFL arg1, out
|
||||||
|
p2 := s.Prog(x86.ABSFL)
|
||||||
|
p2.From.Type = obj.TYPE_REG
|
||||||
|
p2.From.Reg = v.Args[1].Reg()
|
||||||
|
p2.To.Type = obj.TYPE_REG
|
||||||
|
p2.To.Reg = v.Reg()
|
||||||
|
|
||||||
|
// JNZ 2(PC)
|
||||||
|
p3 := s.Prog(x86.AJNE)
|
||||||
|
p3.To.Type = obj.TYPE_BRANCH
|
||||||
|
|
||||||
|
// MOVL $32, out
|
||||||
|
p4 := s.Prog(x86.AMOVL)
|
||||||
|
p4.From.Type = obj.TYPE_CONST
|
||||||
|
p4.From.Offset = 32
|
||||||
|
p4.To.Type = obj.TYPE_REG
|
||||||
|
p4.To.Reg = v.Reg()
|
||||||
|
|
||||||
|
// ADDL $32, out
|
||||||
|
p5 := s.Prog(x86.AADDL)
|
||||||
|
p5.From.Type = obj.TYPE_CONST
|
||||||
|
p5.From.Offset = 32
|
||||||
|
p5.To.Type = obj.TYPE_REG
|
||||||
|
p5.To.Reg = v.Reg()
|
||||||
|
p3.To.SetTarget(p5)
|
||||||
|
|
||||||
// NOP (so the JNZ has somewhere to land)
|
// NOP (so the JNZ has somewhere to land)
|
||||||
nop := s.Prog(obj.ANOP)
|
nop := s.Prog(obj.ANOP)
|
||||||
p1.To.SetTarget(nop)
|
p1.To.SetTarget(nop)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue