crypto/subtle,cmd/compile: add intrinsics for ConstantTimeSelect and *Eq

Targeting crypto/subtle rather than
crypto/internal/fips140/subtle after discussion with Filippo.

goos: linux
goarch: amd64
pkg: crypto/subtle
cpu: AMD Ryzen 5 3600 6-Core Processor
                        │ /tmp/old.logs │            /tmp/new.logs             │
                        │    sec/op     │    sec/op     vs base                │
ConstantTimeSelect-12      0.5246n ± 1%   0.5217n ± 2%        ~ (p=0.118 n=10)
ConstantTimeByteEq-12      1.0415n ± 1%   0.5202n ± 2%  -50.05% (p=0.000 n=10)
ConstantTimeEq-12          0.7813n ± 2%   0.7819n ± 0%        ~ (p=0.897 n=10)
ConstantTimeLessOrEq-12    1.0415n ± 3%   0.7813n ± 1%  -24.98% (p=0.000 n=10)
geomean                    0.8166n        0.6381n       -21.86%

The last three will become 1 latency cycle (0.25ns) faster once #76066 is fixed.

That Select was already this fast with the old code is really impressive.
I am pretty sure this happens because my CPU has BMI1 and BMI2 support and
a fusing unit able to translate non-BMI code into BMI code.
This benchmark doesn't capture the cache gains from the shorter assembly.

It currently compiles as:
v17 = TESTQ <flags> v31 v31 // v != 0
v20 = CMOVQNE <int> v32 v33 v17 (y[int])

It is possible to remove the `TESTQ` by fusing it at compile time with the
compare in a pattern like this:
subtle.ConstantTimeSelect(subtle.ConstantTimeLessOrEq(left, right), right, left)

That would save 2 latency cycles (1 once #76066 is fixed).

Updates #76056

Change-Id: I61a1df99e97a1506f75dae13db529f43846d8f1e
Reviewed-on: https://go-review.googlesource.com/c/go/+/715045
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Keith Randall <khr@google.com>
This commit is contained in:
Jorropo 2025-10-26 22:19:30 +01:00
parent 73d7635fae
commit 2c91c33e88
4 changed files with 88 additions and 5 deletions

View file

@ -1602,6 +1602,36 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
},
sys.AMD64)
/******** crypto/subtle ********/
// We implement a superset of the ConstantTimeSelect promise:
// ConstantTimeSelect returns x if v != 0 and y if v == 0.
add("crypto/subtle", "ConstantTimeSelect",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// Arguments arrive in call order: the selector v, then the two candidates.
v, x, y := args[0], args[1], args[2]
var checkOp ssa.Op
var zero *ssa.Value
// The zero constant is built with type int, so pick the 64- or 32-bit
// not-equal op and constant constructor from s.config.PtrSize.
switch s.config.PtrSize {
case 8:
checkOp = ssa.OpNeq64
zero = s.constInt64(types.Types[types.TINT], 0)
case 4:
checkOp = ssa.OpNeq32
zero = s.constInt32(types.Types[types.TINT], 0)
default:
// Only pointer sizes 8 and 4 are handled above.
panic("unreachable")
}
// check is the boolean v != 0.
check := s.newValue2(checkOp, types.Types[types.TBOOL], zero, v)
// Choose between x and y based on check; per the comment on this
// intrinsic, the result is x when v != 0 and y when v == 0.
return s.newValue3(ssa.OpCondSelect, types.Types[types.TINT], x, y, check)
},
sys.ArchAMD64, sys.ArchARM64, sys.ArchLoong64, sys.ArchPPC64, sys.ArchPPC64LE, sys.ArchWasm) // all with CMOV support.
// constantTimeBoolToUint8 is built as a single OpCvtBoolToUint8 applied to
// its bool argument, and is registered for every architecture listed in all.
add("crypto/subtle", "constantTimeBoolToUint8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCvtBoolToUint8, types.Types[types.TUINT8], args[0])
},
all...)
}
// findIntrinsic returns a function which builds the SSA equivalent of the

View file

@ -41,6 +41,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"386", "math/bits", "TrailingZeros8"}: struct{}{},
{"386", "runtime", "KeepAlive"}: struct{}{},
{"386", "runtime", "slicebytetostringtmp"}: struct{}{},
{"386", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
{"amd64", "internal/runtime/atomic", "And"}: struct{}{},
{"amd64", "internal/runtime/atomic", "And32"}: struct{}{},
{"amd64", "internal/runtime/atomic", "And64"}: struct{}{},
@ -187,6 +188,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"amd64", "sync/atomic", "SwapUint32"}: struct{}{},
{"amd64", "sync/atomic", "SwapUint64"}: struct{}{},
{"amd64", "sync/atomic", "SwapUintptr"}: struct{}{},
{"amd64", "crypto/subtle", "ConstantTimeSelect"}: struct{}{},
{"amd64", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
{"arm", "internal/runtime/sys", "Bswap32"}: struct{}{},
{"arm", "internal/runtime/sys", "Bswap64"}: struct{}{},
{"arm", "internal/runtime/sys", "GetCallerPC"}: struct{}{},
@ -214,6 +217,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"arm", "math/bits", "TrailingZeros8"}: struct{}{},
{"arm", "runtime", "KeepAlive"}: struct{}{},
{"arm", "runtime", "slicebytetostringtmp"}: struct{}{},
{"arm", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
{"arm64", "internal/runtime/atomic", "And"}: struct{}{},
{"arm64", "internal/runtime/atomic", "And32"}: struct{}{},
{"arm64", "internal/runtime/atomic", "And64"}: struct{}{},
@ -358,6 +362,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"arm64", "sync/atomic", "SwapUint32"}: struct{}{},
{"arm64", "sync/atomic", "SwapUint64"}: struct{}{},
{"arm64", "sync/atomic", "SwapUintptr"}: struct{}{},
{"arm64", "crypto/subtle", "ConstantTimeSelect"}: struct{}{},
{"arm64", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
{"loong64", "internal/runtime/atomic", "And"}: struct{}{},
{"loong64", "internal/runtime/atomic", "And32"}: struct{}{},
{"loong64", "internal/runtime/atomic", "And64"}: struct{}{},
@ -504,6 +510,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"loong64", "sync/atomic", "SwapUint32"}: struct{}{},
{"loong64", "sync/atomic", "SwapUint64"}: struct{}{},
{"loong64", "sync/atomic", "SwapUintptr"}: struct{}{},
{"loong64", "crypto/subtle", "ConstantTimeSelect"}: struct{}{},
{"loong64", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
{"mips", "internal/runtime/atomic", "And"}: struct{}{},
{"mips", "internal/runtime/atomic", "And8"}: struct{}{},
{"mips", "internal/runtime/atomic", "Cas"}: struct{}{},
@ -574,6 +582,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"mips", "sync/atomic", "SwapInt32"}: struct{}{},
{"mips", "sync/atomic", "SwapUint32"}: struct{}{},
{"mips", "sync/atomic", "SwapUintptr"}: struct{}{},
{"mips", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
{"mips64", "internal/runtime/atomic", "And"}: struct{}{},
{"mips64", "internal/runtime/atomic", "And8"}: struct{}{},
{"mips64", "internal/runtime/atomic", "Cas"}: struct{}{},
@ -662,6 +671,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"mips64", "sync/atomic", "SwapUint32"}: struct{}{},
{"mips64", "sync/atomic", "SwapUint64"}: struct{}{},
{"mips64", "sync/atomic", "SwapUintptr"}: struct{}{},
{"mips64", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
{"mips64le", "internal/runtime/atomic", "And"}: struct{}{},
{"mips64le", "internal/runtime/atomic", "And8"}: struct{}{},
{"mips64le", "internal/runtime/atomic", "Cas"}: struct{}{},
@ -750,6 +760,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"mips64le", "sync/atomic", "SwapUint32"}: struct{}{},
{"mips64le", "sync/atomic", "SwapUint64"}: struct{}{},
{"mips64le", "sync/atomic", "SwapUintptr"}: struct{}{},
{"mips64le", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
{"mipsle", "internal/runtime/atomic", "And"}: struct{}{},
{"mipsle", "internal/runtime/atomic", "And8"}: struct{}{},
{"mipsle", "internal/runtime/atomic", "Cas"}: struct{}{},
@ -820,6 +831,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"mipsle", "sync/atomic", "SwapInt32"}: struct{}{},
{"mipsle", "sync/atomic", "SwapUint32"}: struct{}{},
{"mipsle", "sync/atomic", "SwapUintptr"}: struct{}{},
{"mipsle", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
{"ppc64", "internal/runtime/atomic", "And"}: struct{}{},
{"ppc64", "internal/runtime/atomic", "And8"}: struct{}{},
{"ppc64", "internal/runtime/atomic", "Cas"}: struct{}{},
@ -944,6 +956,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"ppc64", "sync/atomic", "SwapUint32"}: struct{}{},
{"ppc64", "sync/atomic", "SwapUint64"}: struct{}{},
{"ppc64", "sync/atomic", "SwapUintptr"}: struct{}{},
{"ppc64", "crypto/subtle", "ConstantTimeSelect"}: struct{}{},
{"ppc64", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
{"ppc64le", "internal/runtime/atomic", "And"}: struct{}{},
{"ppc64le", "internal/runtime/atomic", "And8"}: struct{}{},
{"ppc64le", "internal/runtime/atomic", "Cas"}: struct{}{},
@ -1068,6 +1082,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"ppc64le", "sync/atomic", "SwapUint32"}: struct{}{},
{"ppc64le", "sync/atomic", "SwapUint64"}: struct{}{},
{"ppc64le", "sync/atomic", "SwapUintptr"}: struct{}{},
{"ppc64le", "crypto/subtle", "ConstantTimeSelect"}: struct{}{},
{"ppc64le", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
{"riscv64", "internal/runtime/atomic", "And"}: struct{}{},
{"riscv64", "internal/runtime/atomic", "And8"}: struct{}{},
{"riscv64", "internal/runtime/atomic", "Cas"}: struct{}{},
@ -1188,6 +1204,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"riscv64", "sync/atomic", "SwapUint32"}: struct{}{},
{"riscv64", "sync/atomic", "SwapUint64"}: struct{}{},
{"riscv64", "sync/atomic", "SwapUintptr"}: struct{}{},
{"riscv64", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
{"s390x", "internal/runtime/atomic", "And"}: struct{}{},
{"s390x", "internal/runtime/atomic", "And8"}: struct{}{},
{"s390x", "internal/runtime/atomic", "Cas"}: struct{}{},
@ -1306,6 +1323,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"s390x", "sync/atomic", "SwapUint32"}: struct{}{},
{"s390x", "sync/atomic", "SwapUint64"}: struct{}{},
{"s390x", "sync/atomic", "SwapUintptr"}: struct{}{},
{"s390x", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
{"wasm", "internal/runtime/sys", "GetCallerPC"}: struct{}{},
{"wasm", "internal/runtime/sys", "GetCallerSP"}: struct{}{},
{"wasm", "internal/runtime/sys", "GetClosurePtr"}: struct{}{},
@ -1341,6 +1359,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"wasm", "math/bits", "TrailingZeros8"}: struct{}{},
{"wasm", "runtime", "KeepAlive"}: struct{}{},
{"wasm", "runtime", "slicebytetostringtmp"}: struct{}{},
{"wasm", "crypto/subtle", "ConstantTimeSelect"}: struct{}{},
{"wasm", "crypto/subtle", "constantTimeBoolToUint8"}: struct{}{},
}
func TestIntrinsics(t *testing.T) {

View file

@ -13,34 +13,56 @@ import "crypto/internal/fips140/subtle"
// is independent of the contents. If the lengths of x and y do not match it
// returns 0 immediately.
func ConstantTimeCompare(x, y []byte) int {
// NOTE(review): presumably the pre-change line of this diff view, with the
// statements below replacing it — confirm against the actual file.
return subtle.ConstantTimeCompare(x, y)
// Slices of different lengths are reported as unequal right away.
if len(x) != len(y) {
return 0
}
// Accumulate the XOR of every byte pair; v stays zero only if all pairs match.
var v byte
for i := 0; i < len(x); i++ {
v |= x[i] ^ y[i]
}
// Turn "v == 0" into the 1/0 result via ConstantTimeByteEq.
return ConstantTimeByteEq(v, 0)
}
// ConstantTimeSelect returns x if v == 1 and y if v == 0.
// Its behavior is undefined if v takes any other value.
func ConstantTimeSelect(v, x, y int) int {
// NOTE(review): presumably the pre-change line of this diff view, with the
// statements below replacing it — confirm against the actual file.
return subtle.ConstantTimeSelect(v, x, y)
// This is intrinsicified on arches with CMOV.
// It implements the following superset behavior:
// ConstantTimeSelect returns x if v != 0 and y if v == 0.
// Do the same here to avoid non-portable UB.
v = int(constantTimeBoolToUint8(v != 0))
// v is now 0 or 1, so v-1 is all ones (v == 0) or zero (v == 1):
// the mask keeps y in the first case and its complement keeps x in the second.
return ^(v-1)&x | (v-1)&y
}
// ConstantTimeByteEq returns 1 if x == y and 0 otherwise.
func ConstantTimeByteEq(x, y uint8) int {
// NOTE(review): presumably the pre-change line of this diff view, with the
// statement below replacing it — confirm against the actual file.
return subtle.ConstantTimeByteEq(x, y)
// constantTimeBoolToUint8 maps the comparison result to 1 (true) or 0 (false).
return int(constantTimeBoolToUint8(x == y))
}
// ConstantTimeEq returns 1 if x == y and 0 otherwise.
func ConstantTimeEq(x, y int32) int {
// NOTE(review): presumably the pre-change line of this diff view, with the
// statement below replacing it — confirm against the actual file.
return subtle.ConstantTimeEq(x, y)
// constantTimeBoolToUint8 maps the comparison result to 1 (true) or 0 (false).
return int(constantTimeBoolToUint8(x == y))
}
// ConstantTimeCopy copies the contents of y into x (a slice of equal length)
// if v == 1. If v == 0, x is left unchanged. Its behavior is undefined if v
// takes any other value.
func ConstantTimeCopy(v int, x, y []byte) {
// Forward this one since it gains nothing from compiler intrinsics.
// The call goes to ConstantTimeCopy in the imported
// crypto/internal/fips140/subtle package.
subtle.ConstantTimeCopy(v, x, y)
}
// ConstantTimeLessOrEq returns 1 if x <= y and 0 otherwise.
// Its behavior is undefined if x or y are negative or > 2**31 - 1.
func ConstantTimeLessOrEq(x, y int) int {
// NOTE(review): presumably the pre-change line of this diff view, with the
// statement below replacing it — confirm against the actual file.
return subtle.ConstantTimeLessOrEq(x, y)
// constantTimeBoolToUint8 maps the comparison result to 1 (true) or 0 (false).
return int(constantTimeBoolToUint8(x <= y))
}
// constantTimeBoolToUint8 is a compiler intrinsic.
// It returns 1 for true and 0 for false.
//
// The Go body below is only a placeholder: it panics if it is ever executed,
// since every call is expected to be replaced by the compiler.
func constantTimeBoolToUint8(b bool) uint8 {
panic("unreachable; must be intrinsicified")
}

View file

@ -128,6 +128,17 @@ func TestConstantTimeLessOrEq(t *testing.T) {
var benchmarkGlobal uint8
// BenchmarkConstantTimeSelect times ConstantTimeSelect in a loop where every
// call consumes the values produced by the previous iteration, so successive
// calls depend on one another. The selector starts from benchmarkGlobal and
// its final value is stored back there, presumably to keep the result live.
func BenchmarkConstantTimeSelect(b *testing.B) {
	var (
		sel           = int(benchmarkGlobal)
		first, second int
	)
	for i := 0; i < b.N; i++ {
		// Rotate the three values: the selected result becomes the first
		// candidate, and the old candidates shift into the other two slots.
		first, second, sel = ConstantTimeSelect(sel, first, second), first, second
	}
	benchmarkGlobal = uint8(sel)
}
func BenchmarkConstantTimeByteEq(b *testing.B) {
var x, y uint8