mirror of
https://github.com/golang/go.git
synced 2026-06-28 03:40:37 +00:00
crypto/internal/fips140/edwards25519/field: speed up add chains
Repeated squaring forms the bulk of these add chains.
Doing it in a dedicated routine with local variables is faster.
Introduce SquareN for this, and switch to an add chain geared towards
maximizing runs of repeated squares.
Unrolling the SquareN loop 5x gains a few percentage points more;
this could be done as desired in a follow-up.
A planned follow-up will use this newfound speed to delete amd64 asm.
Microbenchmarks:
goos: darwin
goarch: arm64
pkg: crypto/internal/fips140/edwards25519
cpu: Apple M3 Max
│ a │ b │
│ sec/op │ sec/op vs base │
EncodingDecoding-16 5.842µ ± 0% 4.703µ ± 0% -19.51% (n=100)
ScalarBaseMult-16 9.157µ ± 0% 9.178µ ± 0% +0.23% (p=0.000 n=100)
ScalarMult-16 29.28µ ± 0% 29.25µ ± 0% -0.09% (n=100)
VarTimeDoubleScalarBaseMult-16 27.46µ ± 0% 27.46µ ± 0% -0.02% (p=0.002 n=100)
geomean 14.40µ 13.64µ -5.25%
pkg: crypto/internal/fips140/edwards25519/field
│ a │ b │
│ sec/op │ sec/op vs base │
Add-16 3.364n ± 0% 3.350n ± 0% -0.43% (p=0.000 n=100)
Multiply-16 14.15n ± 0% 14.15n ± 0% 0.00% (p=0.000 n=100)
Square-16 10.32n ± 0% 10.25n ± 0% -0.68% (n=100)
Invert-16 2.734µ ± 0% 2.331µ ± 0% -14.74% (n=100)
Mult32-16 5.067n ± 0% 4.926n ± 0% -2.78% (n=100)
Bytes-16 4.595n ± 0% 4.580n ± 0% ~ (p=0.052 n=100)
geomean 17.75n 17.16n -3.31%
Macrobenchmarks:
goos: darwin
goarch: arm64
pkg: crypto/ed25519
cpu: Apple M3 Max
│ before │ after │
│ sec/op │ sec/op vs base │
KeyGeneration-16 13.84µ ± 2% 12.92µ ± 1% -6.65% (p=0.000 n=30)
NewKeyFromSeed-16 13.60µ ± 3% 12.91µ ± 1% -5.09% (p=0.000 n=30)
Signing-16 16.14µ ± 3% 15.75µ ± 1% -2.45% (p=0.000 n=30)
Verification-16 35.47µ ± 2% 34.84µ ± 0% -1.79% (p=0.001 n=30)
geomean 18.12µ 17.39µ -4.02%
Change-Id: I30d09b8d15fa9d1d64863a21d26c1c9ce4d8e9cc
Reviewed-on: https://go-review.googlesource.com/c/go/+/760760
LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
This commit is contained in:
parent
1926d1d95d
commit
a00bbab762
6 changed files with 190 additions and 95 deletions
|
|
@ -115,65 +115,40 @@ func (v *Element) Negate(a *Element) *Element {
|
|||
//
|
||||
// If z == 0, Invert returns v = 0.
|
||||
func (v *Element) Invert(z *Element) *Element {
|
||||
// Inversion is implemented as exponentiation with exponent p − 2. It uses the
|
||||
// same sequence of 254 squarings and 11 multiplications as [Curve25519].
|
||||
var z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t Element
|
||||
// Inversion is implemented as exponentiation with exponent p − 2.
|
||||
// It uses 254 squarings and 11 multiplications, grouping squarings to use SquareN.
|
||||
var z11, t0, t1, t2, t Element
|
||||
|
||||
z2.Square(z) // 2
|
||||
t.Square(&z2) // 4
|
||||
t.Square(&t) // 8
|
||||
z9.Multiply(&t, z) // 9
|
||||
z11.Multiply(&z9, &z2) // 11
|
||||
t.Square(&z11) // 22
|
||||
z2_5_0.Multiply(&t, &z9) // 31 = 2^5 - 2^0
|
||||
t1.Square(z) // 2
|
||||
t.Square(&t1) // 4
|
||||
t.Square(&t) // 8
|
||||
t2.Multiply(&t, z) // 9
|
||||
z11.Multiply(&t2, &t1) // 11
|
||||
t.Square(&z11) // 22
|
||||
t0.Multiply(&t, &t2) // 31 = 2^5 - 2^0
|
||||
|
||||
t.Square(&z2_5_0) // 2^6 - 2^1
|
||||
for i := 0; i < 4; i++ {
|
||||
t.Square(&t) // 2^10 - 2^5
|
||||
}
|
||||
z2_10_0.Multiply(&t, &z2_5_0) // 2^10 - 2^0
|
||||
t.SquareN(&t0, 5) // 2^10 - 2^5
|
||||
t2.Multiply(&t, &t0) // 2^10 - 1
|
||||
|
||||
t.Square(&z2_10_0) // 2^11 - 2^1
|
||||
for i := 0; i < 9; i++ {
|
||||
t.Square(&t) // 2^20 - 2^10
|
||||
}
|
||||
z2_20_0.Multiply(&t, &z2_10_0) // 2^20 - 2^0
|
||||
t.SquareN(&t2, 5) // 2^15 - 2^5
|
||||
t0.Multiply(&t, &t0) // 2^15 - 1
|
||||
|
||||
t.Square(&z2_20_0) // 2^21 - 2^1
|
||||
for i := 0; i < 19; i++ {
|
||||
t.Square(&t) // 2^40 - 2^20
|
||||
}
|
||||
t.Multiply(&t, &z2_20_0) // 2^40 - 2^0
|
||||
t.SquareN(&t0, 15) // 2^30 - 2^15
|
||||
t1.Multiply(&t, &t0) // 2^30 - 1
|
||||
|
||||
t.Square(&t) // 2^41 - 2^1
|
||||
for i := 0; i < 9; i++ {
|
||||
t.Square(&t) // 2^50 - 2^10
|
||||
}
|
||||
z2_50_0.Multiply(&t, &z2_10_0) // 2^50 - 2^0
|
||||
t.SquareN(&t1, 30) // 2^60 - 2^30
|
||||
t0.Multiply(&t, &t1) // 2^60 - 1
|
||||
|
||||
t.Square(&z2_50_0) // 2^51 - 2^1
|
||||
for i := 0; i < 49; i++ {
|
||||
t.Square(&t) // 2^100 - 2^50
|
||||
}
|
||||
z2_100_0.Multiply(&t, &z2_50_0) // 2^100 - 2^0
|
||||
t.SquareN(&t0, 60) // 2^120 - 2^60
|
||||
t1.Multiply(&t, &t0) // 2^120 - 1
|
||||
|
||||
t.Square(&z2_100_0) // 2^101 - 2^1
|
||||
for i := 0; i < 99; i++ {
|
||||
t.Square(&t) // 2^200 - 2^100
|
||||
}
|
||||
t.Multiply(&t, &z2_100_0) // 2^200 - 2^0
|
||||
t.SquareN(&t1, 120) // 2^240 - 2^120
|
||||
t.Multiply(&t, &t1) // 2^240 - 1
|
||||
|
||||
t.Square(&t) // 2^201 - 2^1
|
||||
for i := 0; i < 49; i++ {
|
||||
t.Square(&t) // 2^250 - 2^50
|
||||
}
|
||||
t.Multiply(&t, &z2_50_0) // 2^250 - 2^0
|
||||
t.SquareN(&t, 10) // 2^250 - 2^10
|
||||
t.Multiply(&t, &t2) // 2^250 - 1
|
||||
|
||||
t.Square(&t) // 2^251 - 2^1
|
||||
t.Square(&t) // 2^252 - 2^2
|
||||
t.Square(&t) // 2^253 - 2^3
|
||||
t.Square(&t) // 2^254 - 2^4
|
||||
t.Square(&t) // 2^255 - 2^5
|
||||
t.SquareN(&t, 5) // 2^255 - 2^5
|
||||
|
||||
return v.Multiply(&t, &z11) // 2^255 - 21
|
||||
}
|
||||
|
|
@ -311,6 +286,12 @@ func (v *Element) Square(x *Element) *Element {
|
|||
return v
|
||||
}
|
||||
|
||||
// SquareN sets v = x^(2^n), and returns v. n must be positive.
|
||||
func (v *Element) SquareN(x *Element, n int) *Element {
|
||||
feSquareN(v, x, n)
|
||||
return v
|
||||
}
|
||||
|
||||
// Mult32 sets v = x * y, and returns v.
|
||||
func (v *Element) Mult32(x *Element, y uint32) *Element {
|
||||
x0lo, x0hi := mul51(x.l0, y)
|
||||
|
|
@ -340,51 +321,37 @@ func mul51(a uint64, b uint32) (lo uint64, hi uint64) {
|
|||
func (v *Element) Pow22523(x *Element) *Element {
|
||||
var t0, t1, t2 Element
|
||||
|
||||
t0.Square(x) // x^2
|
||||
t1.Square(&t0) // x^4
|
||||
t1.Square(&t1) // x^8
|
||||
t1.Multiply(x, &t1) // x^9
|
||||
t0.Multiply(&t0, &t1) // x^11
|
||||
t0.Square(&t0) // x^22
|
||||
t0.Multiply(&t1, &t0) // x^31
|
||||
t1.Square(&t0) // x^62
|
||||
for i := 1; i < 5; i++ { // x^992
|
||||
t1.Square(&t1)
|
||||
}
|
||||
t0.Multiply(&t1, &t0) // x^1023 -> 1023 = 2^10 - 1
|
||||
t1.Square(&t0) // 2^11 - 2
|
||||
for i := 1; i < 10; i++ { // 2^20 - 2^10
|
||||
t1.Square(&t1)
|
||||
}
|
||||
t1.Multiply(&t1, &t0) // 2^20 - 1
|
||||
t2.Square(&t1) // 2^21 - 2
|
||||
for i := 1; i < 20; i++ { // 2^40 - 2^20
|
||||
t2.Square(&t2)
|
||||
}
|
||||
t1.Multiply(&t2, &t1) // 2^40 - 1
|
||||
t1.Square(&t1) // 2^41 - 2
|
||||
for i := 1; i < 10; i++ { // 2^50 - 2^10
|
||||
t1.Square(&t1)
|
||||
}
|
||||
t0.Multiply(&t1, &t0) // 2^50 - 1
|
||||
t1.Square(&t0) // 2^51 - 2
|
||||
for i := 1; i < 50; i++ { // 2^100 - 2^50
|
||||
t1.Square(&t1)
|
||||
}
|
||||
t1.Multiply(&t1, &t0) // 2^100 - 1
|
||||
t2.Square(&t1) // 2^101 - 2
|
||||
for i := 1; i < 100; i++ { // 2^200 - 2^100
|
||||
t2.Square(&t2)
|
||||
}
|
||||
t1.Multiply(&t2, &t1) // 2^200 - 1
|
||||
t1.Square(&t1) // 2^201 - 2
|
||||
for i := 1; i < 50; i++ { // 2^250 - 2^50
|
||||
t1.Square(&t1)
|
||||
}
|
||||
t0.Multiply(&t1, &t0) // 2^250 - 1
|
||||
t0.Square(&t0) // 2^251 - 2
|
||||
t0.Square(&t0) // 2^252 - 4
|
||||
return v.Multiply(&t0, x) // 2^252 - 3 -> x^(2^252-3)
|
||||
t0.Square(x) // x^2
|
||||
t1.Multiply(x, &t0) // x^3
|
||||
t0.Square(&t1) // x^6
|
||||
t0.Square(&t0) // x^12
|
||||
t0.Multiply(&t1, &t0) // x^15
|
||||
t0.Square(&t0) // x^30
|
||||
t0.Multiply(x, &t0) // x^31 = 2^5 - 1
|
||||
|
||||
t1.SquareN(&t0, 5) // 2^10 - 2^5
|
||||
t1.Multiply(&t1, &t0) // 2^10 - 1
|
||||
|
||||
t2.SquareN(&t1, 5) // 2^15 - 2^5
|
||||
t0.Multiply(&t2, &t0) // 2^15 - 1
|
||||
|
||||
t2.SquareN(&t0, 15) // 2^30 - 2^15
|
||||
t2.Multiply(&t2, &t0) // 2^30 - 1
|
||||
|
||||
t0.SquareN(&t2, 30) // 2^60 - 2^30
|
||||
t0.Multiply(&t0, &t2) // 2^60 - 1
|
||||
|
||||
t2.SquareN(&t0, 60) // 2^120 - 2^60
|
||||
t2.Multiply(&t2, &t0) // 2^120 - 1
|
||||
|
||||
t0.SquareN(&t2, 120) // 2^240 - 2^120
|
||||
t0.Multiply(&t0, &t2) // 2^240 - 1
|
||||
|
||||
t0.SquareN(&t0, 10) // 2^250 - 2^10
|
||||
t0.Multiply(&t0, &t1) // 2^250 - 1
|
||||
|
||||
t0.SquareN(&t0, 2) // 2^252 - 4
|
||||
return v.Multiply(&t0, x) // 2^252 - 3
|
||||
}
|
||||
|
||||
// sqrtM1 is 2^((p-1)/4), which squared is equal to -1 by Euler's Criterion.
|
||||
|
|
|
|||
|
|
@ -96,6 +96,12 @@ func TestAliasing(t *testing.T) {
|
|||
{name: "Negate", oneArgF: (*Element).Negate},
|
||||
{name: "Set", oneArgF: (*Element).Set},
|
||||
{name: "Square", oneArgF: (*Element).Square},
|
||||
{
|
||||
name: "SquareN",
|
||||
oneArgF: func(v, x *Element) *Element {
|
||||
return v.SquareN(x, 10)
|
||||
},
|
||||
},
|
||||
{name: "Pow22523", oneArgF: (*Element).Pow22523},
|
||||
{
|
||||
name: "Mult32",
|
||||
|
|
|
|||
|
|
@ -9,3 +9,5 @@ package field
|
|||
// feMul forwards its arguments unchanged to feMulGeneric.
func feMul(v, x, y *Element) {
	feMulGeneric(v, x, y)
}
|
||||
|
||||
// feSquare forwards its arguments unchanged to feSquareGeneric.
func feSquare(v, x *Element) {
	feSquareGeneric(v, x)
}
|
||||
|
||||
// feSquareN forwards its arguments unchanged to feSquareNGeneric, which
// squares a n times and writes the result to v.
func feSquareN(v, a *Element, n int) {
	feSquareNGeneric(v, a, n)
}
|
||||
|
|
|
|||
|
|
@ -256,6 +256,62 @@ func feSquareGeneric(v, a *Element) {
|
|||
v.l4 = rr4&maskLow51Bits + rr3>>51
|
||||
}
|
||||
|
||||
// feSquareNGeneric squares a n times and writes the result to v.
|
||||
// It uses local variables to keep limbs in registers.
|
||||
func feSquareNGeneric(v, a *Element, n int) {
|
||||
l0 := a.l0
|
||||
l1 := a.l1
|
||||
l2 := a.l2
|
||||
l3 := a.l3
|
||||
l4 := a.l4
|
||||
|
||||
for range n {
|
||||
r0 := mul(l0, l0)
|
||||
r0 = addMul38(r0, l1, l4)
|
||||
r0 = addMul38(r0, l2, l3)
|
||||
|
||||
r1 := mul(l0*2, l1)
|
||||
r1 = addMul38(r1, l2, l4)
|
||||
r1 = addMul19(r1, l3, l3)
|
||||
|
||||
r2 := mul(l0*2, l2)
|
||||
r2 = addMul(r2, l1, l1)
|
||||
r2 = addMul38(r2, l3, l4)
|
||||
|
||||
r3 := mul(l0*2, l3)
|
||||
r3 = addMul(r3, l1*2, l2)
|
||||
r3 = addMul19(r3, l4, l4)
|
||||
|
||||
r4 := mul(l0*2, l4)
|
||||
r4 = addMul(r4, l1*2, l3)
|
||||
r4 = addMul(r4, l2, l2)
|
||||
|
||||
c0 := shiftRightBy51(r0)
|
||||
c1 := shiftRightBy51(r1)
|
||||
c2 := shiftRightBy51(r2)
|
||||
c3 := shiftRightBy51(r3)
|
||||
c4 := shiftRightBy51(r4)
|
||||
|
||||
rr0 := r0.lo&maskLow51Bits + mul19(c4)
|
||||
rr1 := r1.lo&maskLow51Bits + c0
|
||||
rr2 := r2.lo&maskLow51Bits + c1
|
||||
rr3 := r3.lo&maskLow51Bits + c2
|
||||
rr4 := r4.lo&maskLow51Bits + c3
|
||||
|
||||
l0 = rr0&maskLow51Bits + mul19(rr4>>51)
|
||||
l1 = rr1&maskLow51Bits + rr0>>51
|
||||
l2 = rr2&maskLow51Bits + rr1>>51
|
||||
l3 = rr3&maskLow51Bits + rr2>>51
|
||||
l4 = rr4&maskLow51Bits + rr3>>51
|
||||
}
|
||||
|
||||
v.l0 = l0
|
||||
v.l1 = l1
|
||||
v.l2 = l2
|
||||
v.l3 = l3
|
||||
v.l4 = l4
|
||||
}
|
||||
|
||||
// carryPropagate brings the limbs below 52 bits by applying the reduction
|
||||
// identity (a * 2²⁵⁵ + b = a * 19 + b) to the l4 carry.
|
||||
func (v *Element) carryPropagate() *Element {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,14 @@
|
|||
// Copyright (c) 2026 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build amd64 && !purego
|
||||
|
||||
package field
|
||||
|
||||
func feSquareN(v, a *Element, n int) {
|
||||
feSquare(v, a)
|
||||
for range n - 1 {
|
||||
feSquare(v, v)
|
||||
}
|
||||
}
|
||||
|
|
@ -489,6 +489,56 @@ func TestSqrtRatio(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestSquareN(t *testing.T) {
|
||||
squareNMatchesRepeatSquare := func(x Element) bool {
|
||||
for _, n := range []int{1, 2, 5, 10, 15, 50, 120} {
|
||||
got := new(Element).SquareN(&x, n)
|
||||
want := new(Element).Set(&x)
|
||||
for range n {
|
||||
want.Square(want)
|
||||
}
|
||||
if got.Equal(want) != 1 {
|
||||
t.Logf("SquareN(%d) mismatch", n)
|
||||
return false
|
||||
}
|
||||
if !isInBounds(got) {
|
||||
t.Logf("SquareN(%d) out of bounds", n)
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
if err := quick.Check(squareNMatchesRepeatSquare, quickCheckConfig(1024)); err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFeSquareN(t *testing.T) {
|
||||
asmLikeGeneric := func(a Element) bool {
|
||||
for _, n := range []int{1, 2, 5, 10, 15, 50, 120} {
|
||||
t1 := a
|
||||
t2 := a
|
||||
|
||||
feSquareNGeneric(&t1, &t1, n)
|
||||
feSquareN(&t2, &t2, n)
|
||||
|
||||
if t1 != t2 {
|
||||
t.Logf("n=%d: got %#v, expected %#v", n, t2, t1)
|
||||
return false
|
||||
}
|
||||
if !isInBounds(&t2) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
if err := quick.Check(asmLikeGeneric, quickCheckConfig(1024)); err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFeSquare(t *testing.T) {
|
||||
asmLikeGeneric := func(a Element) bool {
|
||||
t1 := a
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue