cmd/compile: remove flags → bool → flags roundtrips on amd64

Fixes #76056
Fixes #76060

If we modify the issue's fieldReduceOnce2 function to:

  // fieldReduceOnce reduces a value a < 2q.
  func fieldReduceOnce2(a uint32) fieldElement {
    x, b := bits.Sub(uint(a), uint(q), 0)
    return fieldElement(subtle.ConstantTimeSelect(int(b), int(a), int(x)))
  }

We get the wanted assembly*:
  MOVL AX, CX
  MOVL AX, DX
  SUBQ $8380417, CX
  CMOVQCS DX, CX
  MOVQ CX, AX ; not ideal code size but handled by the register renaming unit
  RET

Changes made to fieldReduceOnce2:
- fixed a bug where a and x arguments to subtle.ConstantTimeSelect were swapped.
  we should use a when the sub underflows and x otherwise.
- use bits.Sub, which is intrinsified, rather than bits.Sub32, which is not.

*we use CMOVQCS + MOVQ because the CMOV randomly gets generated backward.
I believe this would be fixed if we taught regalloc to commute CMOV
(by swapping the two register args and inverting the condition).

Change-Id: I01eca545d3c5c8a1c1f5a107e0089f715359dfc6
Reviewed-on: https://go-review.googlesource.com/c/go/+/778141
LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Jorropo <jorropo.pgm@gmail.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
This commit is contained in:
Jorropo 2026-05-15 10:38:34 +02:00 committed by Gopher Robot
parent c6eaf03788
commit 9e0467b174
4 changed files with 14087 additions and 10 deletions

View file

@ -1812,3 +1812,55 @@
(VPMOVMToVec32x16 (VCMPPS512 [3] x y))
(VPORD512 (VPMOVMToVec64x8 (VCMPPD512 [3] x x)) (VPMOVMToVec64x8 (VCMPPD512 [3] y y))) =>
(VPMOVMToVec64x8 (VCMPPD512 [3] x y))
// remove flags → bool → flags roundtrip
// Only do it if the flag-generating instruction is in the same block; otherwise flagalloc will most likely undo this optimization and make things worse.
(NE t:(TESTQ x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x) yes no) && t.Block == s.Block => ((EQ|NE|LT|GT|LE|GE|UGT|ULT|UGE|ULE|EQF|NEF|UGE|UGT) flags yes no)
(NE t:(TESTL x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x) yes no) && t.Block == s.Block => ((EQ|NE|LT|GT|LE|GE|UGT|ULT|UGE|ULE|EQF|NEF|UGE|UGT) flags yes no)
(NE t:(TESTW x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x) yes no) && t.Block == s.Block => ((EQ|NE|LT|GT|LE|GE|UGT|ULT|UGE|ULE|EQF|NEF|UGE|UGT) flags yes no)
(NE t:(TESTB s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags) s) yes no) && t.Block == s.Block => ((EQ|NE|LT|GT|LE|GE|UGT|ULT|UGE|ULE|EQF|NEF|UGE|UGT) flags yes no)
(CMOVQNE yes no t:(TESTQ x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x)) && t.Block == s.Block => (CMOVQ(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) yes no flags)
(CMOVQNE yes no t:(TESTL x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x)) && t.Block == s.Block => (CMOVQ(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) yes no flags)
(CMOVQNE yes no t:(TESTW x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x)) && t.Block == s.Block => (CMOVQ(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) yes no flags)
(CMOVQNE yes no t:(TESTB s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags) s)) && t.Block == s.Block => (CMOVQ(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) yes no flags)
(CMOVLNE yes no t:(TESTQ x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x)) && t.Block == s.Block => (CMOVL(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) yes no flags)
(CMOVLNE yes no t:(TESTL x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x)) && t.Block == s.Block => (CMOVL(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) yes no flags)
(CMOVLNE yes no t:(TESTW x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x)) && t.Block == s.Block => (CMOVL(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) yes no flags)
(CMOVLNE yes no t:(TESTB s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags) s)) && t.Block == s.Block => (CMOVL(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) yes no flags)
(CMOVWNE yes no t:(TESTQ x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x)) && t.Block == s.Block => (CMOVW(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) yes no flags)
(CMOVWNE yes no t:(TESTL x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x)) && t.Block == s.Block => (CMOVW(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) yes no flags)
(CMOVWNE yes no t:(TESTW x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x)) && t.Block == s.Block => (CMOVW(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) yes no flags)
(CMOVWNE yes no t:(TESTB s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags) s)) && t.Block == s.Block => (CMOVW(EQ|NE|LT|GT|LE|GE|HI|CS|CC|LS|EQF|NEF|GTF|GEF) yes no flags)
(SETNE t:(TESTQ x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x)) && t.Block == s.Block => s
(SETNE t:(TESTL x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x)) && t.Block == s.Block => s
(SETNE t:(TESTW x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags)) x)) && t.Block == s.Block => s
(SETNE t:(TESTB s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE|EQF|NEF|GF|GEF) flags) s)) && t.Block == s.Block => s
(EQ t:(TESTQ x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x) yes no) && t.Block == s.Block => ((NE|EQ|GE|LE|GT|LT|ULE|UGE|ULT|UGT) flags yes no)
(EQ t:(TESTL x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x) yes no) && t.Block == s.Block => ((NE|EQ|GE|LE|GT|LT|ULE|UGE|ULT|UGT) flags yes no)
(EQ t:(TESTW x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x) yes no) && t.Block == s.Block => ((NE|EQ|GE|LE|GT|LT|ULE|UGE|ULT|UGT) flags yes no)
(EQ t:(TESTB s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags) s) yes no) && t.Block == s.Block => ((NE|EQ|GE|LE|GT|LT|ULE|UGE|ULT|UGT) flags yes no)
(CMOVQEQ yes no t:(TESTQ x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x)) && t.Block == s.Block => (CMOVQ(NE|EQ|GE|LE|GT|LT|LS|CC|CS|HI) yes no flags)
(CMOVQEQ yes no t:(TESTL x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x)) && t.Block == s.Block => (CMOVQ(NE|EQ|GE|LE|GT|LT|LS|CC|CS|HI) yes no flags)
(CMOVQEQ yes no t:(TESTW x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x)) && t.Block == s.Block => (CMOVQ(NE|EQ|GE|LE|GT|LT|LS|CC|CS|HI) yes no flags)
(CMOVQEQ yes no t:(TESTB s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags) s)) && t.Block == s.Block => (CMOVQ(NE|EQ|GE|LE|GT|LT|LS|CC|CS|HI) yes no flags)
(CMOVLEQ yes no t:(TESTQ x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x)) && t.Block == s.Block => (CMOVL(NE|EQ|GE|LE|GT|LT|LS|CC|CS|HI) yes no flags)
(CMOVLEQ yes no t:(TESTL x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x)) && t.Block == s.Block => (CMOVL(NE|EQ|GE|LE|GT|LT|LS|CC|CS|HI) yes no flags)
(CMOVLEQ yes no t:(TESTW x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x)) && t.Block == s.Block => (CMOVL(NE|EQ|GE|LE|GT|LT|LS|CC|CS|HI) yes no flags)
(CMOVLEQ yes no t:(TESTB s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags) s)) && t.Block == s.Block => (CMOVL(NE|EQ|GE|LE|GT|LT|LS|CC|CS|HI) yes no flags)
(CMOVWEQ yes no t:(TESTQ x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x)) && t.Block == s.Block => (CMOVW(NE|EQ|GE|LE|GT|LT|LS|CC|CS|HI) yes no flags)
(CMOVWEQ yes no t:(TESTL x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x)) && t.Block == s.Block => (CMOVW(NE|EQ|GE|LE|GT|LT|LS|CC|CS|HI) yes no flags)
(CMOVWEQ yes no t:(TESTW x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x)) && t.Block == s.Block => (CMOVW(NE|EQ|GE|LE|GT|LT|LS|CC|CS|HI) yes no flags)
(CMOVWEQ yes no t:(TESTB s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags) s)) && t.Block == s.Block => (CMOVW(NE|EQ|GE|LE|GT|LT|LS|CC|CS|HI) yes no flags)
(SETEQ t:(TESTQ x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x)) && t.Block == s.Block => (SET(NE|EQ|GE|LE|G|L|BE|AE|B|A) flags)
(SETEQ t:(TESTL x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x)) && t.Block == s.Block => (SET(NE|EQ|GE|LE|G|L|BE|AE|B|A) flags)
(SETEQ t:(TESTW x:(MOVBQZX s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags)) x)) && t.Block == s.Block => (SET(NE|EQ|GE|LE|G|L|BE|AE|B|A) flags)
(SETEQ t:(TESTB s:(SET(EQ|NE|L|G|LE|GE|A|B|AE|BE) flags) s)) && t.Block == s.Block => (SET(NE|EQ|GE|LE|G|L|BE|AE|B|A) flags)

View file

@ -2720,7 +2720,7 @@ var invertEqNeqOp = map[Op]Op{
// simplifyBlock simplifies some constant values in b and evaluates
// branches to non-uniquely dominated successors of b.
func simplifyBlock(sdom SparseTree, ft *factsTable, b *Block) {
for iv, v := range b.Values {
for _, v := range b.Values {
switch v.Op {
case OpStaticLECall:
if b.Func.pass.debug > 0 && len(v.Args) == 2 {
@ -2874,14 +2874,6 @@ func simplifyBlock(sdom SparseTree, ft *factsTable, b *Block) {
v.reset(OpCondSelect)
v.AddArg3(y, zero, check)
// FIXME: workaround for go.dev/issues/76060
// we need to schedule the Neq before the CondSelect even tho
// scheduling is meaningless until we reach the schedule pass.
if b.Values[len(b.Values)-1] != check {
panic("unreachable; failed sanity check, new value isn't at the end of the block")
}
b.Values[iv], b.Values[len(b.Values)-1] = b.Values[len(b.Values)-1], b.Values[iv]
if b.Func.pass.debug > 0 {
b.Func.Warnl(v.Pos, "Rewrote Mul %v into CondSelect; %v is bool", v, x)
}

File diff suppressed because it is too large Load diff

View file

@ -6,7 +6,10 @@
package codegen
import "crypto/subtle"
import (
"crypto/subtle"
"math/bits"
)
func cmovint(c int) int {
x := c + 4
@ -808,3 +811,33 @@ func constantTimeSelect(v, x, y int) int {
// riscv64/rva23u64:"CZERONEZ" "CZEROEQZ" "OR" -"SNEZ" -"NEG" -"AND"
return subtle.ConstantTimeSelect(v, x, y)
}
// issue76056fieldReduceOnceSub32 subtracts q from a with bits.Sub32 and then
// adds q back multiplied by the borrow, so the result is a-q when a >= q and
// a when a < q (the subtraction wraps and the borrow is 1).
// It carries no assembly checks yet; the FIXMEs below record why.
func issue76056fieldReduceOnceSub32(a uint32) uint32 {
const q = 8380417 // 2²³ - 2¹³ + 1
// FIXME: the compiler struggles with Sub32 since it's not intrinsified.
x, b := bits.Sub32(a, q, 0)
// FIXME: prove doesn't rewrite this multiply to a condselect because it doesn't know that b is always 0 or 1.
return x + b*q
}
// issue76056fieldReduceOnce2Sub32 subtracts q from a with bits.Sub32 and uses
// subtle.ConstantTimeSelect on the borrow to pick the result: a when the
// subtraction borrowed (b == 1), otherwise the difference x.
// It carries no assembly checks yet; the FIXME below records why.
func issue76056fieldReduceOnce2Sub32(a uint32) uint32 {
const q = 8380417 // 2²³ - 2¹³ + 1
// FIXME: the compiler struggles with Sub32 since it's not intrinsified.
x, b := bits.Sub32(a, q, 0)
return uint32(subtle.ConstantTimeSelect(int(b), int(a), int(x)))
}
// issue76056fieldReduceOnceSub64 is the 64-bit variant of the multiply-based
// reduction: a is widened to uint64, q is subtracted with bits.Sub64, and q
// times the borrow is added back after truncating both values to uint32.
// It carries no assembly checks yet; the FIXME below records why.
func issue76056fieldReduceOnceSub64(a uint32) uint32 {
const q = 8380417 // 2²³ - 2¹³ + 1
x, b := bits.Sub64(uint64(a), q, 0)
// FIXME: prove doesn't rewrite this multiply to a condselect because it doesn't know that b is always 0 or 1.
return uint32(x) + uint32(b)*q
}
// issue76056fieldReduceOnce2Sub64 is the 64-bit variant of the select-based
// reduction: a is widened to uint64, q is subtracted with bits.Sub64, and
// subtle.ConstantTimeSelect on the borrow picks a when the subtraction
// borrowed (b == 1), otherwise the difference x.
// The amd64 directives below require a SUB and a CMOV, and forbid TEST and
// SBB on both annotated statements.
func issue76056fieldReduceOnce2Sub64(a uint32) uint32 {
const q = 8380417 // 2²³ - 2¹³ + 1
// amd64:"SUB" -"TEST" -"SBB"
x, b := bits.Sub64(uint64(a), q, 0)
// amd64:"CMOV" -"TEST" -"SBB"
return uint32(subtle.ConstantTimeSelect(int(b), int(a), int(x)))
}