// asmcheck

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package codegen
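
// Each function below is compiled and its assembly is matched against the
// architecture-tagged patterns in its comments: a check such as
// amd64:"CMOVQLT" requires the generated amd64 code for the enclosing
// function to match the quoted regexp, while a leading minus, as in
// amd64:-"CMOV", requires that it not match. On ppc64x, the ISEL
// immediate names the condition-register bit being tested ($0 for LT,
// $1 for GT, $2 for EQ).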
func cmovint(c int) int {
	x := c + 4
	if x < 0 {
		x = 182
	}
	// amd64:"CMOVQLT"
	// arm64:"CSEL\tLT"
	// ppc64x:"ISEL\t[$]0"
	// wasm:"Select"
	return x
}

func cmovchan(x, y chan int) chan int {
	if x != y {
		x = y
	}
	// amd64:"CMOVQNE"
	// arm64:"CSEL\tNE"
	// ppc64x:"ISEL\t[$]2"
	// wasm:"Select"
	return x
}

func cmovuintptr(x, y uintptr) uintptr {
	if x < y {
		x = -y
	}
	// amd64:"CMOVQ(HI|CS)"
	// arm64:"CSNEG\tLS"
	// ppc64x:"ISEL\t[$]1"
	// wasm:"Select"
	return x
}

func cmov32bit(x, y uint32) uint32 {
	if x < y {
		x = -y
	}
	// amd64:"CMOVL(HI|CS)"
	// arm64:"CSNEG\t(LS|HS)"
	// ppc64x:"ISEL\t[$]1"
	// wasm:"Select"
	return x
}

func cmov16bit(x, y uint16) uint16 {
	if x < y {
		x = -y
	}
	// amd64:"CMOVW(HI|CS)"
	// arm64:"CSNEG\t(LS|HS)"
	// ppc64x:"ISEL\t[$][01]"
	// wasm:"Select"
	return x
}

// Floating point comparison. For EQ/NE, we must
// generate special code to handle NaNs.
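// On amd64 this is why cmovfloateq below expects two conditional moves:
// the comparison reports unordered (NaN) operands through the parity
// flag, so the result of x == y depends on PF as well as ZF.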
func cmovfloateq(x, y float64) int {
	a := 128
	if x == y {
		a = 256
	}
	// amd64:"CMOVQNE","CMOVQPC"
	// arm64:"CSEL\tEQ"
	// ppc64x:"ISEL\t[$]2"
	// wasm:"Select"
	return a
}

func cmovfloatne(x, y float64) int {
	a := 128
	if x != y {
		a = 256
	}
	// amd64:"CMOVQNE","CMOVQPS"
	// arm64:"CSEL\tNE"
	// ppc64x:"ISEL\t[$]2"
	// wasm:"Select"
	return a
}
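
// frexp and ldexp are minimal stand-ins for math.Frexp and math.Ldexp;
// the //go:noinline directives keep them as opaque calls so that
// cmovfloatint2 below keeps the shape of real floating-point code.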
//go:noinline
func frexp(f float64) (frac float64, exp int) {
	return 1.0, 4
}

//go:noinline
func ldexp(frac float64, exp int) float64 {
	return 1.0
}

// Generate a CMOV with a floating comparison and integer move.
func cmovfloatint2(x, y float64) float64 {
	yfr, yexp := 4.0, 5

	r := x
	for r >= y {
		rfr, rexp := frexp(r)
		if rfr < yfr {
			rexp = rexp - 42
		}
		// amd64:"CMOVQHI"
		// arm64:"CSEL\tMI"
		// ppc64x:"ISEL\t[$]0"
		// wasm:"Select"
		r = r - ldexp(y, rexp-yexp)
	}
	return r
}

func cmovloaded(x [4]int, y int) int {
	if x[2] != 0 {
		y = x[2]
	} else {
		y = y >> 2
	}
	// amd64:"CMOVQNE"
	// arm64:"CSEL\tNE"
	// ppc64x:"ISEL\t[$]2"
	// wasm:"Select"
	return y
}

func cmovuintptr2(x, y uintptr) uintptr {
	a := x * 2
	if a == 0 {
		a = 256
	}
	// amd64:"CMOVQEQ"
	// arm64:"CSEL\tEQ"
	// ppc64x:"ISEL\t[$]2"
	// wasm:"Select"
	return a
}

// Floating point CMOVs are not supported by amd64/arm64/ppc64x.
func cmovfloatmove(x, y int) float64 {
	a := 1.0
	if x <= y {
		a = 2.0
	}
	// amd64:-"CMOV"
	// arm64:-"CSEL"
	// ppc64x:-"ISEL"
	// wasm:-"Select"
	return a
}

// On amd64, the following patterns trigger comparison inversion.
// Test that we correctly invert the CMOV condition.
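// When the comparison is emitted with its operands swapped (here one
// side is a load of the global), the CMOV condition must be inverted
// along with them: a source-level x < gsink is expected to compile to
// CMOVQGT rather than CMOVQLT.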
var gsink int64
var gusink uint64

func cmovinvert1(x, y int64) int64 {
	if x < gsink {
		y = -y
	}
	// amd64:"CMOVQGT"
	return y
}

func cmovinvert2(x, y int64) int64 {
	if x <= gsink {
		y = -y
	}
	// amd64:"CMOVQGE"
	return y
}

func cmovinvert3(x, y int64) int64 {
	if x == gsink {
		y = -y
	}
	// amd64:"CMOVQEQ"
	return y
}

func cmovinvert4(x, y int64) int64 {
	if x != gsink {
		y = -y
	}
	// amd64:"CMOVQNE"
	return y
}

func cmovinvert5(x, y uint64) uint64 {
	if x > gusink {
		y = -y
	}
	// amd64:"CMOVQCS"
	return y
}

func cmovinvert6(x, y uint64) uint64 {
	if x >= gusink {
		y = -y
	}
	// amd64:"CMOVQLS"
	return y
}

func cmovload(a []int, i int, b bool) int {
	if b {
		i += 42
	}
	// See issue 26306
	// amd64:-"CMOVQNE"
	return a[i]
}

func cmovstore(a []int, i int, b bool) {
	if b {
		i += 42
	}
	// amd64:"CMOVQNE"
	a[i] = 7
}

var r0, r1, r2, r3, r4, r5 int
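
// On arm64, selecting between a value and a simple function of the other
// operand needs no separate CSEL: CSINC picks a or b+1, CSINV picks a or
// ^b, CSNEG picks a or -b, and CSETM materializes -1 or 0. Each saves the
// instruction that would otherwise compute the second operand, e.g.
// "if cond { a++ }" becomes a compare plus a single CSINC.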
func cmovinc(cond bool, a, b, c int) {
	var x0, x1 int

	if cond {
		x0 = a
	} else {
		x0 = b + 1
	}
	// arm64:"CSINC\tNE", -"CSEL"
	r0 = x0

	if cond {
		x1 = b + 1
	} else {
		x1 = a
	}
	// arm64:"CSINC\tEQ", -"CSEL"
	r1 = x1

	if cond {
		c++
	}
	// arm64:"CSINC\tEQ", -"CSEL"
	r2 = c
}

func cmovinv(cond bool, a, b int) {
	var x0, x1 int

	if cond {
		x0 = a
	} else {
		x0 = ^b
	}
	// arm64:"CSINV\tNE", -"CSEL"
	r0 = x0

	if cond {
		x1 = ^b
	} else {
		x1 = a
	}
	// arm64:"CSINV\tEQ", -"CSEL"
	r1 = x1
}

func cmovneg(cond bool, a, b, c int) {
	var x0, x1 int

	if cond {
		x0 = a
	} else {
		x0 = -b
	}
	// arm64:"CSNEG\tNE", -"CSEL"
	r0 = x0

	if cond {
		x1 = -b
	} else {
		x1 = a
	}
	// arm64:"CSNEG\tEQ", -"CSEL"
	r1 = x1
}

func cmovsetm(cond bool, x int) {
	var x0, x1 int

	if cond {
		x0 = -1
	} else {
		x0 = 0
	}
	// arm64:"CSETM\tNE", -"CSEL"
	r0 = x0

	if cond {
		x1 = 0
	} else {
		x1 = -1
	}
	// arm64:"CSETM\tEQ", -"CSEL"
	r1 = x1
}
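
// The same selections driven by floating-point comparisons. The expected
// condition codes (MI, LS, PL, HI) are the unordered-aware forms, so a
// comparison involving NaN falls through to the "else" value rather than
// satisfying the condition.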
func cmovFcmp0(s, t float64, a, b int) {
	var x0, x1, x2, x3, x4, x5 int

	if s < t {
		x0 = a
	} else {
		x0 = b + 1
	}
	// arm64:"CSINC\tMI", -"CSEL"
	r0 = x0

	if s <= t {
		x1 = a
	} else {
		x1 = ^b
	}
	// arm64:"CSINV\tLS", -"CSEL"
	r1 = x1

	if s > t {
		x2 = a
	} else {
		x2 = -b
	}
	// arm64:"CSNEG\tMI", -"CSEL"
	r2 = x2

	if s >= t {
		x3 = -1
	} else {
		x3 = 0
	}
	// arm64:"CSETM\tLS", -"CSEL"
	r3 = x3

	if s == t {
		x4 = a
	} else {
		x4 = b + 1
	}
	// arm64:"CSINC\tEQ", -"CSEL"
	r4 = x4

	if s != t {
		x5 = a
	} else {
		x5 = b + 1
	}
	// arm64:"CSINC\tNE", -"CSEL"
	r5 = x5
}

func cmovFcmp1(s, t float64, a, b int) {
	var x0, x1, x2, x3, x4, x5 int

	if s < t {
		x0 = b + 1
	} else {
		x0 = a
	}
	// arm64:"CSINC\tPL", -"CSEL"
	r0 = x0

	if s <= t {
		x1 = ^b
	} else {
		x1 = a
	}
	// arm64:"CSINV\tHI", -"CSEL"
	r1 = x1

	if s > t {
		x2 = -b
	} else {
		x2 = a
	}
	// arm64:"CSNEG\tPL", -"CSEL"
	r2 = x2

	if s >= t {
		x3 = 0
	} else {
		x3 = -1
	}
	// arm64:"CSETM\tHI", -"CSEL"
	r3 = x3

	if s == t {
		x4 = b + 1
	} else {
		x4 = a
	}
	// arm64:"CSINC\tNE", -"CSEL"
	r4 = x4

	if s != t {
		x5 = b + 1
	} else {
		x5 = a
	}
	// arm64:"CSINC\tEQ", -"CSEL"
	r5 = x5
}
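
// On loong64, selecting between a value and zero takes a single mask
// instruction: MASKEQZ keeps the value when the condition is non-zero,
// MASKNEZ keeps it when the condition is zero, so no general conditional
// move is needed.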
func cmovzero1(c bool) int {
	var x int
	if c {
		x = 182
	}
	// loong64:"MASKEQZ", -"MASKNEZ"
	return x
}

func cmovzero2(c bool) int {
	var x int
	if !c {
		x = 182
	}
	// loong64:"MASKNEZ", -"MASKEQZ"
	return x
}

// Conditionally selecting between a value and 0 can be done without
// an extra load of 0 to a register on PPC64 by using R0 (which always
// holds the value $0) instead. Verify both cases where either arg1
// or arg2 is zero.
func cmovzeroreg0(a, b int) int {
	x := 0
	if a == b {
		x = a
	}
	// ppc64x:"ISEL\t[$]2, R[0-9]+, R0, R[0-9]+"
	return x
}

func cmovzeroreg1(a, b int) int {
	x := a
	if a == b {
		x = 0
	}
	// ppc64x:"ISEL\t[$]2, R0, R[0-9]+, R[0-9]+"
	return x
}
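
// These selects are rewritten into branch-free arithmetic rather than a
// conditional move: the boolean is zero-extended and added to (or
// subtracted from) a, so on amd64 "if b { a++ }" compiles to MOVBLZX
// plus ADDQ with no CMOV at all.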
func cmovmathadd(a uint, b bool) uint {
	if b {
		a++
	}
	// amd64:"ADDQ", -"CMOV"
	// arm64:"CSINC", -"CSEL"
	// ppc64x:"ADD", -"ISEL"
	// wasm:"Add", -"Select"
	return a
}

func cmovmathsub(a uint, b bool) uint {
	if b {
		a--
	}
	// amd64:"SUBQ", -"CMOV"
	// arm64:"SUB", -"CSEL"
	// ppc64x:"SUB", -"ISEL"
	// wasm:"Sub", -"Select"
	return a
}