crypto/internal/fips140/edwards25519/field: delete Square amd64 assembly

The preceding commit made the compiler-generated code faster
than the assembly.

Since the generic/assembly split is gone, use nicer function names.
The fact that they are functions instead of methods is itself a vestige
of their assembly roots. But unwinding that makes for a large diff.

goos: linux
goarch: amd64
cpu: AMD Ryzen Threadripper PRO 7975WX 32-Cores

pkg: crypto/ed25519
                │   before    │               after                │
                │   sec/op    │   sec/op     vs base               │
KeyGeneration-64    12.70µ ± 1%   12.38µ ± 2%  -2.53% (p=0.000 n=30)
NewKeyFromSeed-64   12.52µ ± 0%   12.27µ ± 1%  -2.00% (p=0.000 n=30)
Signing-64          15.42µ ± 0%   14.81µ ± 0%  -3.97% (p=0.000 n=30)
Verification-64     34.84µ ± 0%   34.68µ ± 0%  -0.44% (p=0.006 n=30)
geomean             17.10µ        16.71µ       -2.24%

pkg: crypto/internal/fips140/edwards25519
                                │   before    │                after                │
                                │   sec/op    │   sec/op     vs base                │
EncodingDecoding-64              5.159µ ± 0%   4.589µ ± 1%  -11.05% (p=0.000 n=30)
ScalarBaseMult-64                9.761µ ± 0%   9.780µ ± 1%        ~ (p=0.965 n=30)
ScalarMult-64                    31.99µ ± 0%   32.46µ ± 0%   +1.47% (p=0.000 n=30)
VarTimeDoubleScalarBaseMult-64   29.82µ ± 0%   30.16µ ± 0%   +1.14% (p=0.000 n=30)
geomean                          14.80µ        14.48µ        -2.20%

pkg: crypto/internal/fips140/edwards25519/field
            │   before    │                after                │
            │   sec/op    │   sec/op     vs base                │
Add-64        2.571n ± 2%   2.573n ± 1%        ~ (p=0.460 n=30)
Multiply-64   10.67n ± 0%   10.62n ± 0%   -0.47% (p=0.001 n=30)
Square-64     8.849n ± 0%   8.412n ± 0%   -4.94% (p=0.000 n=30)
Invert-64     2.401µ ± 0%   2.156µ ± 2%  -10.20% (p=0.000 n=30)
Mult32-64     3.226n ± 0%   3.240n ± 0%   +0.47% (p=0.004 n=30)
Bytes-64      7.974n ± 1%   7.905n ± 1%   -0.87% (p=0.015 n=30)
geomean       15.70n        15.27n        -2.74%


Change-Id: I995209e72e202e7ca4e436615424120ef09e8b37
Reviewed-on: https://go-review.googlesource.com/c/go/+/778500
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
This commit is contained in:
Josh Bleecher Snyder 2026-05-14 10:06:04 -07:00
parent a00bbab762
commit acced3df03
7 changed files with 3 additions and 328 deletions

View file

@ -19,7 +19,6 @@ func main() {
Package("crypto/internal/fips140/edwards25519/field")
ConstraintExpr("!purego")
feMul()
feSquare()
Generate()
}
@ -37,95 +36,6 @@ type uint128 struct {
// String reports the name stored in c.
func (c uint128) String() string {
	return c.name
}
// feSquare emits the amd64 assembly for feSquare(out, a *Element), which sets
// out = a * a. It builds the five 128-bit column sums r0..r4 of the square,
// runs two carry passes over them, and stores the resulting limbs into out.
func feSquare() {
TEXT("feSquare", NOSPLIT, "func(out, a *Element)")
Doc("feSquare sets out = a * a. It works like feSquareGeneric.")
Pragma("noescape")
a := Dereference(Param("a"))
// Pair each limb of a with its field name.
l0 := namedComponent{a.Field("l0"), "l0"}
l1 := namedComponent{a.Field("l1"), "l1"}
l2 := namedComponent{a.Field("l2"), "l2"}
l3 := namedComponent{a.Field("l3"), "l3"}
l4 := namedComponent{a.Field("l4"), "l4"}
// Each rN below is a 128-bit accumulator held in two 64-bit registers.
// NOTE(review): the factors 19 and 38 (= 2×19) presumably come from reducing
// modulo 2^255 - 19; confirm against the Element documentation.
// r0 = l0×l0 + 19×2×(l1×l4 + l2×l3)
r0 := uint128{"r0", GP64(), GP64()}
mul64(r0, 1, l0, l0)
addMul64(r0, 38, l1, l4)
addMul64(r0, 38, l2, l3)
// r1 = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3
r1 := uint128{"r1", GP64(), GP64()}
mul64(r1, 2, l0, l1)
addMul64(r1, 38, l2, l4)
addMul64(r1, 19, l3, l3)
// r2 = 2×l0×l2 + l1×l1 + 19×2×l3×l4
r2 := uint128{"r2", GP64(), GP64()}
mul64(r2, 2, l0, l2)
addMul64(r2, 1, l1, l1)
addMul64(r2, 38, l3, l4)
// r3 = 2×l0×l3 + 2×l1×l2 + 19×l4×l4
r3 := uint128{"r3", GP64(), GP64()}
mul64(r3, 2, l0, l3)
addMul64(r3, 2, l1, l2)
addMul64(r3, 19, l4, l4)
// r4 = 2×l0×l4 + 2×l1×l3 + l2×l2
r4 := uint128{"r4", GP64(), GP64()}
mul64(r4, 2, l0, l4)
addMul64(r4, 2, l1, l3)
addMul64(r4, 1, l2, l2)
Comment("First reduction chain")
maskLow51Bits := GP64()
MOVQ(Imm((1<<51)-1), maskLow51Bits)
// Split every accumulator at bit 51: cN is the carry (rN >> 51) and rNlo is
// the low word that the mask is applied to below.
c0, r0lo := shiftRightBy51(&r0)
c1, r1lo := shiftRightBy51(&r1)
c2, r2lo := shiftRightBy51(&r2)
c3, r3lo := shiftRightBy51(&r3)
c4, r4lo := shiftRightBy51(&r4)
// Limb N keeps its low 51 bits and takes the carry from limb N-1; the carry
// out of the top limb wraps around into limb 0 multiplied by 19.
maskAndAdd(r0lo, maskLow51Bits, c4, 19)
maskAndAdd(r1lo, maskLow51Bits, c0, 1)
maskAndAdd(r2lo, maskLow51Bits, c1, 1)
maskAndAdd(r3lo, maskLow51Bits, c2, 1)
maskAndAdd(r4lo, maskLow51Bits, c3, 1)
Comment("Second reduction chain (carryPropagate)")
// The limbs are single 64-bit words now, so the carries are plain shifts
// and the cN registers from the first chain are reused.
// c0 = r0 >> 51
MOVQ(r0lo, c0)
SHRQ(Imm(51), c0)
// c1 = r1 >> 51
MOVQ(r1lo, c1)
SHRQ(Imm(51), c1)
// c2 = r2 >> 51
MOVQ(r2lo, c2)
SHRQ(Imm(51), c2)
// c3 = r3 >> 51
MOVQ(r3lo, c3)
SHRQ(Imm(51), c3)
// c4 = r4 >> 51
MOVQ(r4lo, c4)
SHRQ(Imm(51), c4)
// Same masking and carry pattern as in the first chain.
maskAndAdd(r0lo, maskLow51Bits, c4, 19)
maskAndAdd(r1lo, maskLow51Bits, c0, 1)
maskAndAdd(r2lo, maskLow51Bits, c1, 1)
maskAndAdd(r3lo, maskLow51Bits, c2, 1)
maskAndAdd(r4lo, maskLow51Bits, c3, 1)
Comment("Store output")
out := Dereference(Param("out"))
Store(r0lo, out.Field("l0"))
Store(r1lo, out.Field("l1"))
Store(r2lo, out.Field("l2"))
Store(r3lo, out.Field("l3"))
Store(r4lo, out.Field("l4"))
RET()
}
func feMul() {
TEXT("feMul", NOSPLIT, "func(out, a, b *Element)")
Doc("feMul sets out = a * b. It works like feMulGeneric.")

View file

@ -8,8 +8,3 @@ package field
//
//go:noescape
func feMul(out *Element, a *Element, b *Element)
// feSquare computes the square of a and stores it in out (out = a * a).
// Its result is meant to match feSquareGeneric.
//
//go:noescape
func feSquare(out, a *Element)

View file

@ -229,170 +229,3 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
MOVQ R13, 24(AX)
MOVQ R15, 32(AX)
RET
// func feSquare(out *Element, a *Element)
//
// Sets out = a * a. CX holds the pointer a for the whole routine and the
// limbs l0..l4 are read from offsets 0, 8, 16, 24 and 32. Each column sum is
// a 128-bit value kept in a low:high register pair:
// r0 = SI:BX, r1 = R8:DI, r2 = R10:R9, r3 = R12:R11, r4 = R14:R13.
TEXT ·feSquare(SB), NOSPLIT, $0-16
MOVQ a+8(FP), CX
// r0 = l0×l0
MOVQ (CX), AX
MULQ (CX)
MOVQ AX, SI
MOVQ DX, BX
// r0 += 38×l1×l4
// The two LEAQs compute 9×l1 and then l1 + 2×9×l1 = 19×l1; the shift
// doubles that to 38×l1 before the multiply. The same pattern (without the
// shift for a factor of 19) is used for every scaled term below.
MOVQ 8(CX), DX
LEAQ (DX)(DX*8), AX
LEAQ (DX)(AX*2), AX
SHLQ $0x01, AX
MULQ 32(CX)
ADDQ AX, SI
ADCQ DX, BX
// r0 += 38×l2×l3
MOVQ 16(CX), DX
LEAQ (DX)(DX*8), AX
LEAQ (DX)(AX*2), AX
SHLQ $0x01, AX
MULQ 24(CX)
ADDQ AX, SI
ADCQ DX, BX
// r1 = 2×l0×l1
MOVQ (CX), AX
SHLQ $0x01, AX
MULQ 8(CX)
MOVQ AX, R8
MOVQ DX, DI
// r1 += 38×l2×l4
MOVQ 16(CX), DX
LEAQ (DX)(DX*8), AX
LEAQ (DX)(AX*2), AX
SHLQ $0x01, AX
MULQ 32(CX)
ADDQ AX, R8
ADCQ DX, DI
// r1 += 19×l3×l3
MOVQ 24(CX), DX
LEAQ (DX)(DX*8), AX
LEAQ (DX)(AX*2), AX
MULQ 24(CX)
ADDQ AX, R8
ADCQ DX, DI
// r2 = 2×l0×l2
MOVQ (CX), AX
SHLQ $0x01, AX
MULQ 16(CX)
MOVQ AX, R10
MOVQ DX, R9
// r2 += l1×l1
MOVQ 8(CX), AX
MULQ 8(CX)
ADDQ AX, R10
ADCQ DX, R9
// r2 += 38×l3×l4
MOVQ 24(CX), DX
LEAQ (DX)(DX*8), AX
LEAQ (DX)(AX*2), AX
SHLQ $0x01, AX
MULQ 32(CX)
ADDQ AX, R10
ADCQ DX, R9
// r3 = 2×l0×l3
MOVQ (CX), AX
SHLQ $0x01, AX
MULQ 24(CX)
MOVQ AX, R12
MOVQ DX, R11
// r3 += 2×l1×l2
MOVQ 8(CX), AX
SHLQ $0x01, AX
MULQ 16(CX)
ADDQ AX, R12
ADCQ DX, R11
// r3 += 19×l4×l4
MOVQ 32(CX), DX
LEAQ (DX)(DX*8), AX
LEAQ (DX)(AX*2), AX
MULQ 32(CX)
ADDQ AX, R12
ADCQ DX, R11
// r4 = 2×l0×l4
MOVQ (CX), AX
SHLQ $0x01, AX
MULQ 32(CX)
MOVQ AX, R14
MOVQ DX, R13
// r4 += 2×l1×l3
MOVQ 8(CX), AX
SHLQ $0x01, AX
MULQ 24(CX)
ADDQ AX, R14
ADCQ DX, R13
// r4 += l2×l2
MOVQ 16(CX), AX
MULQ 16(CX)
ADDQ AX, R14
ADCQ DX, R13
// First reduction chain
// AX = 2^51 - 1, the mask for the low 51 bits of a limb.
MOVQ $0x0007ffffffffffff, AX
// Double-width shifts by 13: each high register ends up holding rN >> 51,
// the carry out of limb N.
SHLQ $0x0d, SI, BX
SHLQ $0x0d, R8, DI
SHLQ $0x0d, R10, R9
SHLQ $0x0d, R12, R11
SHLQ $0x0d, R14, R13
// limb0 = (r0 & mask) + 19×carry4; limbN = (rN & mask) + carry(N-1).
ANDQ AX, SI
IMUL3Q $0x13, R13, R13
ADDQ R13, SI
ANDQ AX, R8
ADDQ BX, R8
ANDQ AX, R10
ADDQ DI, R10
ANDQ AX, R12
ADDQ R9, R12
ANDQ AX, R14
ADDQ R11, R14
// Second reduction chain (carryPropagate)
// The limbs are single words now, so each carry is limbN >> 51 (0x33 = 51).
MOVQ SI, BX
SHRQ $0x33, BX
MOVQ R8, DI
SHRQ $0x33, DI
MOVQ R10, R9
SHRQ $0x33, R9
MOVQ R12, R11
SHRQ $0x33, R11
MOVQ R14, R13
SHRQ $0x33, R13
// Same mask-and-add pattern as in the first chain.
ANDQ AX, SI
IMUL3Q $0x13, R13, R13
ADDQ R13, SI
ANDQ AX, R8
ADDQ BX, R8
ANDQ AX, R10
ADDQ DI, R10
ANDQ AX, R12
ADDQ R9, R12
ANDQ AX, R14
ADDQ R11, R14
// Store output
// AX is reused here as the out pointer; the mask is no longer needed.
MOVQ out+0(FP), AX
MOVQ SI, (AX)
MOVQ R8, 8(AX)
MOVQ R10, 16(AX)
MOVQ R12, 24(AX)
MOVQ R14, 32(AX)
RET

View file

@ -7,7 +7,3 @@
package field
// feMul forwards its arguments unchanged to feMulGeneric.
func feMul(v, x, y *Element) {
	feMulGeneric(v, x, y)
}
// feSquare forwards its arguments unchanged to feSquareGeneric.
func feSquare(v, x *Element) {
	feSquareGeneric(v, x)
}
// feSquareN forwards its arguments unchanged to feSquareNGeneric.
func feSquareN(v, a *Element, n int) {
	feSquareNGeneric(v, a, n)
}

View file

@ -183,7 +183,7 @@ func feMulGeneric(v, a, b *Element) {
v.l4 = rr4&maskLow51Bits + rr3>>51
}
func feSquareGeneric(v, a *Element) {
func feSquare(v, a *Element) {
l0 := a.l0
l1 := a.l1
l2 := a.l2
@ -256,9 +256,9 @@ func feSquareGeneric(v, a *Element) {
v.l4 = rr4&maskLow51Bits + rr3>>51
}
// feSquareNGeneric squares a n times and writes the result to v.
// feSquareN squares a n times and writes the result to v.
// It uses local variables to keep limbs in registers.
func feSquareNGeneric(v, a *Element, n int) {
func feSquareN(v, a *Element, n int) {
l0 := a.l0
l1 := a.l1
l2 := a.l2

View file

@ -1,14 +0,0 @@
// Copyright (c) 2026 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64 && !purego
package field
// feSquareN writes the square of a to v and then squares v in place n-1 more
// times. The first squaring always runs, even when n is less than 1.
func feSquareN(v, a *Element, n int) {
	// Only the first call reads a; every later call squares v onto itself.
	feSquare(v, a)
	for remaining := n - 1; remaining > 0; remaining-- {
		feSquare(v, v)
	}
}

View file

@ -514,51 +514,6 @@ func TestSquareN(t *testing.T) {
}
}
// TestFeSquareN uses quick.Check to compare feSquareN against
// feSquareNGeneric on generated elements for several repetition counts, and
// also requires every feSquareN result to satisfy isInBounds.
func TestFeSquareN(t *testing.T) {
	counts := []int{1, 2, 5, 10, 15, 50, 120}

	matchesGeneric := func(a Element) bool {
		for _, n := range counts {
			// Both calls square in place, so each works on its own copy of a.
			want, got := a, a
			feSquareNGeneric(&want, &want, n)
			feSquareN(&got, &got, n)

			if got != want {
				t.Logf("n=%d: got %#v, expected %#v", n, got, want)
				return false
			}
			if !isInBounds(&got) {
				return false
			}
		}
		return true
	}

	err := quick.Check(matchesGeneric, quickCheckConfig(1024))
	if err != nil {
		t.Error(err)
	}
}
// TestFeSquare uses quick.Check to compare feSquare against feSquareGeneric
// on generated elements, and also requires the feSquare result to satisfy
// isInBounds.
func TestFeSquare(t *testing.T) {
	asmLikeGeneric := func(a Element) bool {
		// Both calls square in place, so each works on its own copy of a.
		want, got := a, a
		feSquareGeneric(&want, &want)
		feSquare(&got, &got)

		if got != want {
			// feSquare is the implementation under test, so its output is
			// "got" and the feSquareGeneric reference is "expected". The
			// previous message had the two values swapped.
			t.Logf("got: %#v,\nexpected: %#v", got, want)
		}
		return got == want && isInBounds(&got)
	}

	if err := quick.Check(asmLikeGeneric, quickCheckConfig(1024)); err != nil {
		t.Error(err)
	}
}
func TestFeMul(t *testing.T) {
asmLikeGeneric := func(a, b Element) bool {
a1 := a