cmd/compile: add opt branchelim to rewrite some CondSelect into math

This allows something like:
  if y { x++ }

To be compiled to:
  MOVBLZX BX, CX
  ADDQ CX, AX

Instead of:
  LEAQ    1(AX), CX
  MOVBLZX BL, DX
  TESTQ   DX, DX
  CMOVQNE CX, AX

While ./make.bash uniqued per LOC, there are 100 additions and 75 subtractions.

See benchmark here: https://go.dev/play/p/DJf5COjwhd_s

Either it's a performance no-op or it is faster:

  goos: linux
  goarch: amd64
  cpu: AMD Ryzen 5 3600 6-Core Processor
                                          │ /tmp/old.logs │            /tmp/new.logs             │
                                          │    sec/op     │    sec/op     vs base                │
  CmovInlineConditionAddLatency-12           0.5443n ± 5%   0.5339n ± 3%   -1.90% (p=0.004 n=10)
  CmovInlineConditionAddThroughputBy6-12      1.492n ± 1%    1.494n ± 1%        ~ (p=0.955 n=10)
  CmovInlineConditionSubLatency-12           0.5419n ± 3%   0.5282n ± 3%   -2.52% (p=0.019 n=10)
  CmovInlineConditionSubThroughputBy6-12      1.587n ± 1%    1.584n ± 2%        ~ (p=0.492 n=10)
  CmovOutlineConditionAddLatency-12          0.5223n ± 1%   0.2639n ± 4%  -49.47% (p=0.000 n=10)
  CmovOutlineConditionAddThroughputBy6-12     1.159n ± 1%    1.097n ± 2%   -5.35% (p=0.000 n=10)
  CmovOutlineConditionSubLatency-12          0.5271n ± 3%   0.2654n ± 2%  -49.66% (p=0.000 n=10)
  CmovOutlineConditionSubThroughputBy6-12     1.053n ± 1%    1.050n ± 1%        ~ (p=1.000 n=10)
  geomean

There are other benefits not tested by this benchmark:
- the math form is usually a couple bytes shorter (ICACHE)
- the math form is usually 0~2 uops shorter (UCACHE)
- the math form has usually less register pressure*
- the math form can sometimes be optimized further

*regalloc rarely finds how it can use fewer registers

As far as pass ordering goes there are many possible options,
I've decided to reorder branchelim before late opt since:
- unlike running exclusively the CondSelect rules after branchelim,
  some extra optimizations might trigger on the adds or subs.
- I don't want to maintain a second generic.rules file of only the stuff
  that can trigger after branchelim.
- rerunning all of opt a third time increases compilation time for little gains.

By elimination moving branchelim seems fine.

Change-Id: I869adf57e4d109948ee157cfc47144445146bafd
Reviewed-on: https://go-review.googlesource.com/c/go/+/685676
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
This commit is contained in:
Jorropo 2025-07-03 01:35:51 +02:00
parent f32cf8e4b0
commit fcd28070fe
4 changed files with 292 additions and 4 deletions

View file

@ -56,6 +56,8 @@ func rewriteValuegeneric(v *Value) bool {
return rewriteValuegeneric_OpCom64(v)
case OpCom8:
return rewriteValuegeneric_OpCom8(v)
case OpCondSelect:
return rewriteValuegeneric_OpCondSelect(v)
case OpConstInterface:
return rewriteValuegeneric_OpConstInterface(v)
case OpConstSlice:
@ -5694,6 +5696,254 @@ func rewriteValuegeneric_OpCom8(v *Value) bool {
}
return false
}
func rewriteValuegeneric_OpCondSelect(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
config := b.Func.Config
// match: (CondSelect x _ (ConstBool [true ]))
// result: x
for {
x := v_0
if v_2.Op != OpConstBool || auxIntToBool(v_2.AuxInt) != true {
break
}
v.copyOf(x)
return true
}
// match: (CondSelect _ y (ConstBool [false]))
// result: y
for {
y := v_1
if v_2.Op != OpConstBool || auxIntToBool(v_2.AuxInt) != false {
break
}
v.copyOf(y)
return true
}
// match: (CondSelect (Add8 <t> x (Const8 [1])) x bool)
// cond: config.arch != "arm64"
// result: (Add8 x (CvtBoolToUint8 <t> bool))
for {
if v_0.Op != OpAdd8 {
break
}
t := v_0.Type
_ = v_0.Args[1]
v_0_0 := v_0.Args[0]
v_0_1 := v_0.Args[1]
for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 {
x := v_0_0
if v_0_1.Op != OpConst8 || auxIntToInt8(v_0_1.AuxInt) != 1 || x != v_1 {
continue
}
bool := v_2
if !(config.arch != "arm64") {
continue
}
v.reset(OpAdd8)
v0 := b.NewValue0(v.Pos, OpCvtBoolToUint8, t)
v0.AddArg(bool)
v.AddArg2(x, v0)
return true
}
break
}
// match: (CondSelect (Add64 <t> x (Const64 [1])) x bool)
// cond: config.arch != "arm64"
// result: (Add64 x (ZeroExt8to64 <t> (CvtBoolToUint8 <types.Types[types.TUINT8]> bool)))
for {
if v_0.Op != OpAdd64 {
break
}
t := v_0.Type
_ = v_0.Args[1]
v_0_0 := v_0.Args[0]
v_0_1 := v_0.Args[1]
for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 {
x := v_0_0
if v_0_1.Op != OpConst64 || auxIntToInt64(v_0_1.AuxInt) != 1 || x != v_1 {
continue
}
bool := v_2
if !(config.arch != "arm64") {
continue
}
v.reset(OpAdd64)
v0 := b.NewValue0(v.Pos, OpZeroExt8to64, t)
v1 := b.NewValue0(v.Pos, OpCvtBoolToUint8, types.Types[types.TUINT8])
v1.AddArg(bool)
v0.AddArg(v1)
v.AddArg2(x, v0)
return true
}
break
}
// match: (CondSelect (Add32 <t> x (Const32 [1])) x bool)
// cond: config.arch != "arm64"
// result: (Add32 x (ZeroExt8to32 <t> (CvtBoolToUint8 <types.Types[types.TUINT8]> bool)))
for {
if v_0.Op != OpAdd32 {
break
}
t := v_0.Type
_ = v_0.Args[1]
v_0_0 := v_0.Args[0]
v_0_1 := v_0.Args[1]
for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 {
x := v_0_0
if v_0_1.Op != OpConst32 || auxIntToInt32(v_0_1.AuxInt) != 1 || x != v_1 {
continue
}
bool := v_2
if !(config.arch != "arm64") {
continue
}
v.reset(OpAdd32)
v0 := b.NewValue0(v.Pos, OpZeroExt8to32, t)
v1 := b.NewValue0(v.Pos, OpCvtBoolToUint8, types.Types[types.TUINT8])
v1.AddArg(bool)
v0.AddArg(v1)
v.AddArg2(x, v0)
return true
}
break
}
// match: (CondSelect (Add16 <t> x (Const16 [1])) x bool)
// cond: config.arch != "arm64"
// result: (Add16 x (ZeroExt8to16 <t> (CvtBoolToUint8 <types.Types[types.TUINT8]> bool)))
for {
if v_0.Op != OpAdd16 {
break
}
t := v_0.Type
_ = v_0.Args[1]
v_0_0 := v_0.Args[0]
v_0_1 := v_0.Args[1]
for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 {
x := v_0_0
if v_0_1.Op != OpConst16 || auxIntToInt16(v_0_1.AuxInt) != 1 || x != v_1 {
continue
}
bool := v_2
if !(config.arch != "arm64") {
continue
}
v.reset(OpAdd16)
v0 := b.NewValue0(v.Pos, OpZeroExt8to16, t)
v1 := b.NewValue0(v.Pos, OpCvtBoolToUint8, types.Types[types.TUINT8])
v1.AddArg(bool)
v0.AddArg(v1)
v.AddArg2(x, v0)
return true
}
break
}
// match: (CondSelect (Add8 <t> x (Const8 [-1])) x bool)
// result: (Sub8 x (CvtBoolToUint8 <t> bool))
for {
if v_0.Op != OpAdd8 {
break
}
t := v_0.Type
_ = v_0.Args[1]
v_0_0 := v_0.Args[0]
v_0_1 := v_0.Args[1]
for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 {
x := v_0_0
if v_0_1.Op != OpConst8 || auxIntToInt8(v_0_1.AuxInt) != -1 || x != v_1 {
continue
}
bool := v_2
v.reset(OpSub8)
v0 := b.NewValue0(v.Pos, OpCvtBoolToUint8, t)
v0.AddArg(bool)
v.AddArg2(x, v0)
return true
}
break
}
// match: (CondSelect (Add64 <t> x (Const64 [-1])) x bool)
// result: (Sub64 x (ZeroExt8to64 <t> (CvtBoolToUint8 <types.Types[types.TUINT8]> bool)))
for {
if v_0.Op != OpAdd64 {
break
}
t := v_0.Type
_ = v_0.Args[1]
v_0_0 := v_0.Args[0]
v_0_1 := v_0.Args[1]
for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 {
x := v_0_0
if v_0_1.Op != OpConst64 || auxIntToInt64(v_0_1.AuxInt) != -1 || x != v_1 {
continue
}
bool := v_2
v.reset(OpSub64)
v0 := b.NewValue0(v.Pos, OpZeroExt8to64, t)
v1 := b.NewValue0(v.Pos, OpCvtBoolToUint8, types.Types[types.TUINT8])
v1.AddArg(bool)
v0.AddArg(v1)
v.AddArg2(x, v0)
return true
}
break
}
// match: (CondSelect (Add32 <t> x (Const32 [-1])) x bool)
// result: (Sub32 x (ZeroExt8to32 <t> (CvtBoolToUint8 <types.Types[types.TUINT8]> bool)))
for {
if v_0.Op != OpAdd32 {
break
}
t := v_0.Type
_ = v_0.Args[1]
v_0_0 := v_0.Args[0]
v_0_1 := v_0.Args[1]
for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 {
x := v_0_0
if v_0_1.Op != OpConst32 || auxIntToInt32(v_0_1.AuxInt) != -1 || x != v_1 {
continue
}
bool := v_2
v.reset(OpSub32)
v0 := b.NewValue0(v.Pos, OpZeroExt8to32, t)
v1 := b.NewValue0(v.Pos, OpCvtBoolToUint8, types.Types[types.TUINT8])
v1.AddArg(bool)
v0.AddArg(v1)
v.AddArg2(x, v0)
return true
}
break
}
// match: (CondSelect (Add16 <t> x (Const16 [-1])) x bool)
// result: (Sub16 x (ZeroExt8to16 <t> (CvtBoolToUint8 <types.Types[types.TUINT8]> bool)))
for {
if v_0.Op != OpAdd16 {
break
}
t := v_0.Type
_ = v_0.Args[1]
v_0_0 := v_0.Args[0]
v_0_1 := v_0.Args[1]
for _i0 := 0; _i0 <= 1; _i0, v_0_0, v_0_1 = _i0+1, v_0_1, v_0_0 {
x := v_0_0
if v_0_1.Op != OpConst16 || auxIntToInt16(v_0_1.AuxInt) != -1 || x != v_1 {
continue
}
bool := v_2
v.reset(OpSub16)
v0 := b.NewValue0(v.Pos, OpZeroExt8to16, t)
v1 := b.NewValue0(v.Pos, OpCvtBoolToUint8, types.Types[types.TUINT8])
v1.AddArg(bool)
v0.AddArg(v1)
v.AddArg2(x, v0)
return true
}
break
}
return false
}
func rewriteValuegeneric_OpConstInterface(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types