crypto/internal/fips140/edwards25519/field: speed up add chains

Repeated squaring forms the bulk of these add chains.
Doing it in a dedicated routine with local variables is faster.

Introduce SquareN for this, and switch to an add chain geared towards
maximizing runs of repeated squares.

Unrolling the SquareN loop 5x gains a few percentage points more;
this could be done as desired in a follow-up.

A planned follow-up will use this newfound speed to delete amd64 asm.


Microbenchmarks:

goos: darwin
goarch: arm64
pkg: crypto/internal/fips140/edwards25519
cpu: Apple M3 Max
                               │      a      │                  b                   │
                               │   sec/op    │   sec/op     vs base                 │
EncodingDecoding-16              5.842µ ± 0%   4.703µ ± 0%  -19.51% (n=100)
ScalarBaseMult-16                9.157µ ± 0%   9.178µ ± 0%   +0.23% (p=0.000 n=100)
ScalarMult-16                    29.28µ ± 0%   29.25µ ± 0%   -0.09% (n=100)
VarTimeDoubleScalarBaseMult-16   27.46µ ± 0%   27.46µ ± 0%   -0.02% (p=0.002 n=100)
geomean                          14.40µ        13.64µ        -5.25%

pkg: crypto/internal/fips140/edwards25519/field
            │      a      │                  b                   │
            │   sec/op    │   sec/op     vs base                 │
Add-16        3.364n ± 0%   3.350n ± 0%   -0.43% (p=0.000 n=100)
Multiply-16   14.15n ± 0%   14.15n ± 0%    0.00% (p=0.000 n=100)
Square-16     10.32n ± 0%   10.25n ± 0%   -0.68% (n=100)
Invert-16     2.734µ ± 0%   2.331µ ± 0%  -14.74% (n=100)
Mult32-16     5.067n ± 0%   4.926n ± 0%   -2.78% (n=100)
Bytes-16      4.595n ± 0%   4.580n ± 0%        ~ (p=0.052 n=100)
geomean       17.75n        17.16n        -3.31%


Macrobenchmarks:

goos: darwin
goarch: arm64
pkg: crypto/ed25519
cpu: Apple M3 Max
                  │   before    │               after                │
                  │   sec/op    │   sec/op     vs base               │
KeyGeneration-16    13.84µ ± 2%   12.92µ ± 1%  -6.65% (p=0.000 n=30)
NewKeyFromSeed-16   13.60µ ± 3%   12.91µ ± 1%  -5.09% (p=0.000 n=30)
Signing-16          16.14µ ± 3%   15.75µ ± 1%  -2.45% (p=0.000 n=30)
Verification-16     35.47µ ± 2%   34.84µ ± 0%  -1.79% (p=0.001 n=30)
geomean             18.12µ        17.39µ       -4.02%

Change-Id: I30d09b8d15fa9d1d64863a21d26c1c9ce4d8e9cc
Reviewed-on: https://go-review.googlesource.com/c/go/+/760760
LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
This commit is contained in:
Josh Bleecher Snyder 2026-03-24 22:43:50 -07:00
parent 1926d1d95d
commit a00bbab762
6 changed files with 190 additions and 95 deletions

View file

@ -115,65 +115,40 @@ func (v *Element) Negate(a *Element) *Element {
//
// If z == 0, Invert returns v = 0.
func (v *Element) Invert(z *Element) *Element {
// Inversion is implemented as exponentiation with exponent p − 2. It uses the
// same sequence of 254 squarings and 11 multiplications as [Curve25519].
var z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t Element
// Inversion is implemented as exponentiation with exponent p − 2.
// It uses 254 squarings and 11 multiplications, grouping squarings to use SquareN.
var z11, t0, t1, t2, t Element
z2.Square(z) // 2
t.Square(&z2) // 4
t.Square(&t) // 8
z9.Multiply(&t, z) // 9
z11.Multiply(&z9, &z2) // 11
t.Square(&z11) // 22
z2_5_0.Multiply(&t, &z9) // 31 = 2^5 - 2^0
t1.Square(z) // 2
t.Square(&t1) // 4
t.Square(&t) // 8
t2.Multiply(&t, z) // 9
z11.Multiply(&t2, &t1) // 11
t.Square(&z11) // 22
t0.Multiply(&t, &t2) // 31 = 2^5 - 2^0
t.Square(&z2_5_0) // 2^6 - 2^1
for i := 0; i < 4; i++ {
t.Square(&t) // 2^10 - 2^5
}
z2_10_0.Multiply(&t, &z2_5_0) // 2^10 - 2^0
t.SquareN(&t0, 5) // 2^10 - 2^5
t2.Multiply(&t, &t0) // 2^10 - 1
t.Square(&z2_10_0) // 2^11 - 2^1
for i := 0; i < 9; i++ {
t.Square(&t) // 2^20 - 2^10
}
z2_20_0.Multiply(&t, &z2_10_0) // 2^20 - 2^0
t.SquareN(&t2, 5) // 2^15 - 2^5
t0.Multiply(&t, &t0) // 2^15 - 1
t.Square(&z2_20_0) // 2^21 - 2^1
for i := 0; i < 19; i++ {
t.Square(&t) // 2^40 - 2^20
}
t.Multiply(&t, &z2_20_0) // 2^40 - 2^0
t.SquareN(&t0, 15) // 2^30 - 2^15
t1.Multiply(&t, &t0) // 2^30 - 1
t.Square(&t) // 2^41 - 2^1
for i := 0; i < 9; i++ {
t.Square(&t) // 2^50 - 2^10
}
z2_50_0.Multiply(&t, &z2_10_0) // 2^50 - 2^0
t.SquareN(&t1, 30) // 2^60 - 2^30
t0.Multiply(&t, &t1) // 2^60 - 1
t.Square(&z2_50_0) // 2^51 - 2^1
for i := 0; i < 49; i++ {
t.Square(&t) // 2^100 - 2^50
}
z2_100_0.Multiply(&t, &z2_50_0) // 2^100 - 2^0
t.SquareN(&t0, 60) // 2^120 - 2^60
t1.Multiply(&t, &t0) // 2^120 - 1
t.Square(&z2_100_0) // 2^101 - 2^1
for i := 0; i < 99; i++ {
t.Square(&t) // 2^200 - 2^100
}
t.Multiply(&t, &z2_100_0) // 2^200 - 2^0
t.SquareN(&t1, 120) // 2^240 - 2^120
t.Multiply(&t, &t1) // 2^240 - 1
t.Square(&t) // 2^201 - 2^1
for i := 0; i < 49; i++ {
t.Square(&t) // 2^250 - 2^50
}
t.Multiply(&t, &z2_50_0) // 2^250 - 2^0
t.SquareN(&t, 10) // 2^250 - 2^10
t.Multiply(&t, &t2) // 2^250 - 1
t.Square(&t) // 2^251 - 2^1
t.Square(&t) // 2^252 - 2^2
t.Square(&t) // 2^253 - 2^3
t.Square(&t) // 2^254 - 2^4
t.Square(&t) // 2^255 - 2^5
t.SquareN(&t, 5) // 2^255 - 2^5
return v.Multiply(&t, &z11) // 2^255 - 21
}
@ -311,6 +286,12 @@ func (v *Element) Square(x *Element) *Element {
return v
}
// SquareN sets v = x^(2^n), and returns v. n must be positive.
//
// The result is the same field element that n successive calls to Square
// would produce. v and x may be the same Element. All of the work is
// delegated to feSquareN; this method only returns the receiver for chaining.
func (v *Element) SquareN(x *Element, n int) *Element {
feSquareN(v, x, n)
return v
}
// Mult32 sets v = x * y, and returns v.
func (v *Element) Mult32(x *Element, y uint32) *Element {
x0lo, x0hi := mul51(x.l0, y)
@ -340,51 +321,37 @@ func mul51(a uint64, b uint32) (lo uint64, hi uint64) {
func (v *Element) Pow22523(x *Element) *Element {
var t0, t1, t2 Element
t0.Square(x) // x^2
t1.Square(&t0) // x^4
t1.Square(&t1) // x^8
t1.Multiply(x, &t1) // x^9
t0.Multiply(&t0, &t1) // x^11
t0.Square(&t0) // x^22
t0.Multiply(&t1, &t0) // x^31
t1.Square(&t0) // x^62
for i := 1; i < 5; i++ { // x^992
t1.Square(&t1)
}
t0.Multiply(&t1, &t0) // x^1023 -> 1023 = 2^10 - 1
t1.Square(&t0) // 2^11 - 2
for i := 1; i < 10; i++ { // 2^20 - 2^10
t1.Square(&t1)
}
t1.Multiply(&t1, &t0) // 2^20 - 1
t2.Square(&t1) // 2^21 - 2
for i := 1; i < 20; i++ { // 2^40 - 2^20
t2.Square(&t2)
}
t1.Multiply(&t2, &t1) // 2^40 - 1
t1.Square(&t1) // 2^41 - 2
for i := 1; i < 10; i++ { // 2^50 - 2^10
t1.Square(&t1)
}
t0.Multiply(&t1, &t0) // 2^50 - 1
t1.Square(&t0) // 2^51 - 2
for i := 1; i < 50; i++ { // 2^100 - 2^50
t1.Square(&t1)
}
t1.Multiply(&t1, &t0) // 2^100 - 1
t2.Square(&t1) // 2^101 - 2
for i := 1; i < 100; i++ { // 2^200 - 2^100
t2.Square(&t2)
}
t1.Multiply(&t2, &t1) // 2^200 - 1
t1.Square(&t1) // 2^201 - 2
for i := 1; i < 50; i++ { // 2^250 - 2^50
t1.Square(&t1)
}
t0.Multiply(&t1, &t0) // 2^250 - 1
t0.Square(&t0) // 2^251 - 2
t0.Square(&t0) // 2^252 - 4
return v.Multiply(&t0, x) // 2^252 - 3 -> x^(2^252-3)
t0.Square(x) // x^2
t1.Multiply(x, &t0) // x^3
t0.Square(&t1) // x^6
t0.Square(&t0) // x^12
t0.Multiply(&t1, &t0) // x^15
t0.Square(&t0) // x^30
t0.Multiply(x, &t0) // x^31 = 2^5 - 1
t1.SquareN(&t0, 5) // 2^10 - 2^5
t1.Multiply(&t1, &t0) // 2^10 - 1
t2.SquareN(&t1, 5) // 2^15 - 2^5
t0.Multiply(&t2, &t0) // 2^15 - 1
t2.SquareN(&t0, 15) // 2^30 - 2^15
t2.Multiply(&t2, &t0) // 2^30 - 1
t0.SquareN(&t2, 30) // 2^60 - 2^30
t0.Multiply(&t0, &t2) // 2^60 - 1
t2.SquareN(&t0, 60) // 2^120 - 2^60
t2.Multiply(&t2, &t0) // 2^120 - 1
t0.SquareN(&t2, 120) // 2^240 - 2^120
t0.Multiply(&t0, &t2) // 2^240 - 1
t0.SquareN(&t0, 10) // 2^250 - 2^10
t0.Multiply(&t0, &t1) // 2^250 - 1
t0.SquareN(&t0, 2) // 2^252 - 4
return v.Multiply(&t0, x) // 2^252 - 3
}
// sqrtM1 is 2^((p-1)/4), which squared is equal to -1 by Euler's Criterion.

View file

@ -96,6 +96,12 @@ func TestAliasing(t *testing.T) {
{name: "Negate", oneArgF: (*Element).Negate},
{name: "Set", oneArgF: (*Element).Set},
{name: "Square", oneArgF: (*Element).Square},
{
name: "SquareN",
oneArgF: func(v, x *Element) *Element {
return v.SquareN(x, 10)
},
},
{name: "Pow22523", oneArgF: (*Element).Pow22523},
{
name: "Mult32",

View file

@ -9,3 +9,5 @@ package field
// feMul forwards to the portable implementation feMulGeneric.
func feMul(v, x, y *Element) { feMulGeneric(v, x, y) }
// feSquare forwards to the portable implementation feSquareGeneric.
func feSquare(v, x *Element) { feSquareGeneric(v, x) }
// feSquareN forwards to the portable implementation feSquareNGeneric, which
// squares a n times and writes the result to v.
func feSquareN(v, a *Element, n int) { feSquareNGeneric(v, a, n) }

View file

@ -256,6 +256,62 @@ func feSquareGeneric(v, a *Element) {
v.l4 = rr4&maskLow51Bits + rr3>>51
}
// feSquareNGeneric squares a n times and writes the result to v.
// It uses local variables to keep limbs in registers.
//
// The five limbs of a are read into locals before the loop and v is written
// only after the loop, so v and a may point to the same Element. If n <= 0
// the loop body never runs and v receives a's limbs unchanged.
func feSquareNGeneric(v, a *Element, n int) {
l0 := a.l0
l1 := a.l1
l2 := a.l2
l3 := a.l3
l4 := a.l4
for range n {
// Column sums r0..r4: rk collects the limb products li*lj whose indices
// satisfy i+j ≡ k (mod 5). Products with i != j appear once with a doubled
// factor (l0*2, l1*2, or the 38 helper). Products with i+j >= 5 go through
// the 19/38 helpers.
// NOTE(review): addMul19/addMul38 presumably scale the product by 19 and
// 38 = 2*19 to apply 2²⁵⁵ ≡ 19 — confirm against the helper definitions.
r0 := mul(l0, l0)
r0 = addMul38(r0, l1, l4)
r0 = addMul38(r0, l2, l3)
r1 := mul(l0*2, l1)
r1 = addMul38(r1, l2, l4)
r1 = addMul19(r1, l3, l3)
r2 := mul(l0*2, l2)
r2 = addMul(r2, l1, l1)
r2 = addMul38(r2, l3, l4)
r3 := mul(l0*2, l3)
r3 = addMul(r3, l1*2, l2)
r3 = addMul19(r3, l4, l4)
r4 := mul(l0*2, l4)
r4 = addMul(r4, l1*2, l3)
r4 = addMul(r4, l2, l2)
// First carry pass: ck is taken from rk by shiftRightBy51 and added to the
// next column; c4 wraps around into column 0 through mul19.
c0 := shiftRightBy51(r0)
c1 := shiftRightBy51(r1)
c2 := shiftRightBy51(r2)
c3 := shiftRightBy51(r3)
c4 := shiftRightBy51(r4)
rr0 := r0.lo&maskLow51Bits + mul19(c4)
rr1 := r1.lo&maskLow51Bits + c0
rr2 := r2.lo&maskLow51Bits + c1
rr3 := r3.lo&maskLow51Bits + c2
rr4 := r4.lo&maskLow51Bits + c3
// Second carry pass: whatever rrk holds above bit 51 moves into the next
// limb, with rr4's excess wrapping into l0 through mul19. The results
// become the input limbs of the next iteration.
l0 = rr0&maskLow51Bits + mul19(rr4>>51)
l1 = rr1&maskLow51Bits + rr0>>51
l2 = rr2&maskLow51Bits + rr1>>51
l3 = rr3&maskLow51Bits + rr2>>51
l4 = rr4&maskLow51Bits + rr3>>51
}
// Store the final limbs; this is the only write to v.
v.l0 = l0
v.l1 = l1
v.l2 = l2
v.l3 = l3
v.l4 = l4
}
// carryPropagate brings the limbs below 52 bits by applying the reduction
// identity (a * 2²⁵⁵ + b = a * 19 + b) to the l4 carry.
func (v *Element) carryPropagate() *Element {

View file

@ -0,0 +1,14 @@
// Copyright (c) 2026 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && !purego
package field
// feSquareN squares a n times and writes the result to v, by calling the
// single-step feSquare n times. v and a may point to the same Element.
//
// Callers are expected to pass n > 0. For n <= 0 no squaring is performed and
// v is set to a, which is what feSquareNGeneric does when its loop body never
// runs; without this guard the first feSquare call below would square once
// even for n == 0 and the two backends would disagree.
func feSquareN(v, a *Element, n int) {
	if n <= 0 {
		*v = *a
		return
	}
	// The first step reads a; every later step squares v in place.
	feSquare(v, a)
	for range n - 1 {
		feSquare(v, v)
	}
}

View file

@ -489,6 +489,56 @@ func TestSqrtRatio(t *testing.T) {
}
}
// TestSquareN checks, on random field elements, that SquareN(x, n) equals the
// result of n successive Square calls and that the output limbs stay within
// bounds.
func TestSquareN(t *testing.T) {
	counts := []int{1, 2, 5, 10, 15, 50, 120}
	check := func(x Element) bool {
		for _, n := range counts {
			// Reference value: square a copy of x one step at a time.
			want := new(Element).Set(&x)
			for i := 0; i < n; i++ {
				want.Square(want)
			}
			got := new(Element).SquareN(&x, n)
			switch {
			case got.Equal(want) != 1:
				t.Logf("SquareN(%d) mismatch", n)
				return false
			case !isInBounds(got):
				t.Logf("SquareN(%d) out of bounds", n)
				return false
			}
		}
		return true
	}
	err := quick.Check(check, quickCheckConfig(1024))
	if err != nil {
		t.Error(err)
	}
}
// TestFeSquareN cross-checks the platform feSquareN against the portable
// feSquareNGeneric on random field elements. Each count is exercised in place
// (destination aliases the source) and out of place (fresh destination), and
// the in-place result must stay within the limb bounds.
func TestFeSquareN(t *testing.T) {
	asmLikeGeneric := func(a Element) bool {
		for _, n := range []int{1, 2, 5, 10, 15, 50, 120} {
			// In place: v and a are the same Element.
			t1 := a
			t2 := a
			feSquareNGeneric(&t1, &t1, n)
			feSquareN(&t2, &t2, n)
			if t1 != t2 {
				t.Logf("n=%d: got %#v, expected %#v", n, t2, t1)
				return false
			}
			if !isInBounds(&t2) {
				// Log the failure like TestSquareN does, so a bounds
				// regression is distinguishable from a value mismatch.
				t.Logf("n=%d: result %#v out of bounds", n, t2)
				return false
			}
			// Out of place: a zero destination distinct from the source must
			// yield the same limbs as the aliased call.
			var t3 Element
			feSquareN(&t3, &a, n)
			if t3 != t1 {
				t.Logf("n=%d: out-of-place got %#v, expected %#v", n, t3, t1)
				return false
			}
		}
		return true
	}
	if err := quick.Check(asmLikeGeneric, quickCheckConfig(1024)); err != nil {
		t.Error(err)
	}
}
func TestFeSquare(t *testing.T) {
asmLikeGeneric := func(a Element) bool {
t1 := a