mirror of
https://github.com/golang/go.git
synced 2026-06-28 03:40:37 +00:00
crypto/internal/fips140/edwards25519/field: delete Square amd64 assembly
The preceding commit made the compiler-generated code faster
than the assembly.
Since the generic/assembly split is gone, use nicer function names.
The fact that they are functions instead of methods is itself a vestige
of their assembly roots. But unwinding that makes for a large diff.
goos: linux
goarch: amd64
cpu: AMD Ryzen Threadripper PRO 7975WX 32-Cores
pkg: crypto/ed25519
│ before │ after │
│ sec/op │ sec/op vs base │
KeyGeneration-64 12.70µ ± 1% 12.38µ ± 2% -2.53% (p=0.000 n=30)
NewKeyFromSeed-64 12.52µ ± 0% 12.27µ ± 1% -2.00% (p=0.000 n=30)
Signing-64 15.42µ ± 0% 14.81µ ± 0% -3.97% (p=0.000 n=30)
Verification-64 34.84µ ± 0% 34.68µ ± 0% -0.44% (p=0.006 n=30)
geomean 17.10µ 16.71µ -2.24%
pkg: crypto/internal/fips140/edwards25519
│ before │ after │
│ sec/op │ sec/op vs base │
EncodingDecoding-64 5.159µ ± 0% 4.589µ ± 1% -11.05% (p=0.000 n=30)
ScalarBaseMult-64 9.761µ ± 0% 9.780µ ± 1% ~ (p=0.965 n=30)
ScalarMult-64 31.99µ ± 0% 32.46µ ± 0% +1.47% (p=0.000 n=30)
VarTimeDoubleScalarBaseMult-64 29.82µ ± 0% 30.16µ ± 0% +1.14% (p=0.000 n=30)
geomean 14.80µ 14.48µ -2.20%
pkg: crypto/internal/fips140/edwards25519/field
│ before │ after │
│ sec/op │ sec/op vs base │
Add-64 2.571n ± 2% 2.573n ± 1% ~ (p=0.460 n=30)
Multiply-64 10.67n ± 0% 10.62n ± 0% -0.47% (p=0.001 n=30)
Square-64 8.849n ± 0% 8.412n ± 0% -4.94% (p=0.000 n=30)
Invert-64 2.401µ ± 0% 2.156µ ± 2% -10.20% (p=0.000 n=30)
Mult32-64 3.226n ± 0% 3.240n ± 0% +0.47% (p=0.004 n=30)
Bytes-64 7.974n ± 1% 7.905n ± 1% -0.87% (p=0.015 n=30)
geomean 15.70n 15.27n -2.74%
Change-Id: I995209e72e202e7ca4e436615424120ef09e8b37
Reviewed-on: https://go-review.googlesource.com/c/go/+/778500
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
This commit is contained in:
parent
a00bbab762
commit
acced3df03
7 changed files with 3 additions and 328 deletions
|
|
@ -19,7 +19,6 @@ func main() {
|
|||
Package("crypto/internal/fips140/edwards25519/field")
|
||||
ConstraintExpr("!purego")
|
||||
feMul()
|
||||
feSquare()
|
||||
Generate()
|
||||
}
|
||||
|
||||
|
|
@ -37,95 +36,6 @@ type uint128 struct {
|
|||
|
||||
func (c uint128) String() string { return c.name }
|
||||
|
||||
func feSquare() {
|
||||
TEXT("feSquare", NOSPLIT, "func(out, a *Element)")
|
||||
Doc("feSquare sets out = a * a. It works like feSquareGeneric.")
|
||||
Pragma("noescape")
|
||||
|
||||
a := Dereference(Param("a"))
|
||||
l0 := namedComponent{a.Field("l0"), "l0"}
|
||||
l1 := namedComponent{a.Field("l1"), "l1"}
|
||||
l2 := namedComponent{a.Field("l2"), "l2"}
|
||||
l3 := namedComponent{a.Field("l3"), "l3"}
|
||||
l4 := namedComponent{a.Field("l4"), "l4"}
|
||||
|
||||
// r0 = l0×l0 + 19×2×(l1×l4 + l2×l3)
|
||||
r0 := uint128{"r0", GP64(), GP64()}
|
||||
mul64(r0, 1, l0, l0)
|
||||
addMul64(r0, 38, l1, l4)
|
||||
addMul64(r0, 38, l2, l3)
|
||||
|
||||
// r1 = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3
|
||||
r1 := uint128{"r1", GP64(), GP64()}
|
||||
mul64(r1, 2, l0, l1)
|
||||
addMul64(r1, 38, l2, l4)
|
||||
addMul64(r1, 19, l3, l3)
|
||||
|
||||
// r2 = 2×l0×l2 + l1×l1 + 19×2×l3×l4
|
||||
r2 := uint128{"r2", GP64(), GP64()}
|
||||
mul64(r2, 2, l0, l2)
|
||||
addMul64(r2, 1, l1, l1)
|
||||
addMul64(r2, 38, l3, l4)
|
||||
|
||||
// r3 = 2×l0×l3 + 2×l1×l2 + 19×l4×l4
|
||||
r3 := uint128{"r3", GP64(), GP64()}
|
||||
mul64(r3, 2, l0, l3)
|
||||
addMul64(r3, 2, l1, l2)
|
||||
addMul64(r3, 19, l4, l4)
|
||||
|
||||
// r4 = 2×l0×l4 + 2×l1×l3 + l2×l2
|
||||
r4 := uint128{"r4", GP64(), GP64()}
|
||||
mul64(r4, 2, l0, l4)
|
||||
addMul64(r4, 2, l1, l3)
|
||||
addMul64(r4, 1, l2, l2)
|
||||
|
||||
Comment("First reduction chain")
|
||||
maskLow51Bits := GP64()
|
||||
MOVQ(Imm((1<<51)-1), maskLow51Bits)
|
||||
c0, r0lo := shiftRightBy51(&r0)
|
||||
c1, r1lo := shiftRightBy51(&r1)
|
||||
c2, r2lo := shiftRightBy51(&r2)
|
||||
c3, r3lo := shiftRightBy51(&r3)
|
||||
c4, r4lo := shiftRightBy51(&r4)
|
||||
maskAndAdd(r0lo, maskLow51Bits, c4, 19)
|
||||
maskAndAdd(r1lo, maskLow51Bits, c0, 1)
|
||||
maskAndAdd(r2lo, maskLow51Bits, c1, 1)
|
||||
maskAndAdd(r3lo, maskLow51Bits, c2, 1)
|
||||
maskAndAdd(r4lo, maskLow51Bits, c3, 1)
|
||||
|
||||
Comment("Second reduction chain (carryPropagate)")
|
||||
// c0 = r0 >> 51
|
||||
MOVQ(r0lo, c0)
|
||||
SHRQ(Imm(51), c0)
|
||||
// c1 = r1 >> 51
|
||||
MOVQ(r1lo, c1)
|
||||
SHRQ(Imm(51), c1)
|
||||
// c2 = r2 >> 51
|
||||
MOVQ(r2lo, c2)
|
||||
SHRQ(Imm(51), c2)
|
||||
// c3 = r3 >> 51
|
||||
MOVQ(r3lo, c3)
|
||||
SHRQ(Imm(51), c3)
|
||||
// c4 = r4 >> 51
|
||||
MOVQ(r4lo, c4)
|
||||
SHRQ(Imm(51), c4)
|
||||
maskAndAdd(r0lo, maskLow51Bits, c4, 19)
|
||||
maskAndAdd(r1lo, maskLow51Bits, c0, 1)
|
||||
maskAndAdd(r2lo, maskLow51Bits, c1, 1)
|
||||
maskAndAdd(r3lo, maskLow51Bits, c2, 1)
|
||||
maskAndAdd(r4lo, maskLow51Bits, c3, 1)
|
||||
|
||||
Comment("Store output")
|
||||
out := Dereference(Param("out"))
|
||||
Store(r0lo, out.Field("l0"))
|
||||
Store(r1lo, out.Field("l1"))
|
||||
Store(r2lo, out.Field("l2"))
|
||||
Store(r3lo, out.Field("l3"))
|
||||
Store(r4lo, out.Field("l4"))
|
||||
|
||||
RET()
|
||||
}
|
||||
|
||||
func feMul() {
|
||||
TEXT("feMul", NOSPLIT, "func(out, a, b *Element)")
|
||||
Doc("feMul sets out = a * b. It works like feMulGeneric.")
|
||||
|
|
|
|||
|
|
@ -8,8 +8,3 @@ package field
|
|||
//
|
||||
//go:noescape
|
||||
func feMul(out *Element, a *Element, b *Element)
|
||||
|
||||
// feSquare sets out = a * a. It works like feSquareGeneric.
|
||||
//
|
||||
//go:noescape
|
||||
func feSquare(out *Element, a *Element)
|
||||
|
|
|
|||
|
|
@ -229,170 +229,3 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
|
|||
MOVQ R13, 24(AX)
|
||||
MOVQ R15, 32(AX)
|
||||
RET
|
||||
|
||||
// func feSquare(out *Element, a *Element)
|
||||
TEXT ·feSquare(SB), NOSPLIT, $0-16
|
||||
MOVQ a+8(FP), CX
|
||||
|
||||
// r0 = l0×l0
|
||||
MOVQ (CX), AX
|
||||
MULQ (CX)
|
||||
MOVQ AX, SI
|
||||
MOVQ DX, BX
|
||||
|
||||
// r0 += 38×l1×l4
|
||||
MOVQ 8(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, SI
|
||||
ADCQ DX, BX
|
||||
|
||||
// r0 += 38×l2×l3
|
||||
MOVQ 16(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, SI
|
||||
ADCQ DX, BX
|
||||
|
||||
// r1 = 2×l0×l1
|
||||
MOVQ (CX), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 8(CX)
|
||||
MOVQ AX, R8
|
||||
MOVQ DX, DI
|
||||
|
||||
// r1 += 38×l2×l4
|
||||
MOVQ 16(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, DI
|
||||
|
||||
// r1 += 19×l3×l3
|
||||
MOVQ 24(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, DI
|
||||
|
||||
// r2 = 2×l0×l2
|
||||
MOVQ (CX), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 16(CX)
|
||||
MOVQ AX, R10
|
||||
MOVQ DX, R9
|
||||
|
||||
// r2 += l1×l1
|
||||
MOVQ 8(CX), AX
|
||||
MULQ 8(CX)
|
||||
ADDQ AX, R10
|
||||
ADCQ DX, R9
|
||||
|
||||
// r2 += 38×l3×l4
|
||||
MOVQ 24(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R10
|
||||
ADCQ DX, R9
|
||||
|
||||
// r3 = 2×l0×l3
|
||||
MOVQ (CX), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 24(CX)
|
||||
MOVQ AX, R12
|
||||
MOVQ DX, R11
|
||||
|
||||
// r3 += 2×l1×l2
|
||||
MOVQ 8(CX), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 16(CX)
|
||||
ADDQ AX, R12
|
||||
ADCQ DX, R11
|
||||
|
||||
// r3 += 19×l4×l4
|
||||
MOVQ 32(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R12
|
||||
ADCQ DX, R11
|
||||
|
||||
// r4 = 2×l0×l4
|
||||
MOVQ (CX), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 32(CX)
|
||||
MOVQ AX, R14
|
||||
MOVQ DX, R13
|
||||
|
||||
// r4 += 2×l1×l3
|
||||
MOVQ 8(CX), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, R14
|
||||
ADCQ DX, R13
|
||||
|
||||
// r4 += l2×l2
|
||||
MOVQ 16(CX), AX
|
||||
MULQ 16(CX)
|
||||
ADDQ AX, R14
|
||||
ADCQ DX, R13
|
||||
|
||||
// First reduction chain
|
||||
MOVQ $0x0007ffffffffffff, AX
|
||||
SHLQ $0x0d, SI, BX
|
||||
SHLQ $0x0d, R8, DI
|
||||
SHLQ $0x0d, R10, R9
|
||||
SHLQ $0x0d, R12, R11
|
||||
SHLQ $0x0d, R14, R13
|
||||
ANDQ AX, SI
|
||||
IMUL3Q $0x13, R13, R13
|
||||
ADDQ R13, SI
|
||||
ANDQ AX, R8
|
||||
ADDQ BX, R8
|
||||
ANDQ AX, R10
|
||||
ADDQ DI, R10
|
||||
ANDQ AX, R12
|
||||
ADDQ R9, R12
|
||||
ANDQ AX, R14
|
||||
ADDQ R11, R14
|
||||
|
||||
// Second reduction chain (carryPropagate)
|
||||
MOVQ SI, BX
|
||||
SHRQ $0x33, BX
|
||||
MOVQ R8, DI
|
||||
SHRQ $0x33, DI
|
||||
MOVQ R10, R9
|
||||
SHRQ $0x33, R9
|
||||
MOVQ R12, R11
|
||||
SHRQ $0x33, R11
|
||||
MOVQ R14, R13
|
||||
SHRQ $0x33, R13
|
||||
ANDQ AX, SI
|
||||
IMUL3Q $0x13, R13, R13
|
||||
ADDQ R13, SI
|
||||
ANDQ AX, R8
|
||||
ADDQ BX, R8
|
||||
ANDQ AX, R10
|
||||
ADDQ DI, R10
|
||||
ANDQ AX, R12
|
||||
ADDQ R9, R12
|
||||
ANDQ AX, R14
|
||||
ADDQ R11, R14
|
||||
|
||||
// Store output
|
||||
MOVQ out+0(FP), AX
|
||||
MOVQ SI, (AX)
|
||||
MOVQ R8, 8(AX)
|
||||
MOVQ R10, 16(AX)
|
||||
MOVQ R12, 24(AX)
|
||||
MOVQ R14, 32(AX)
|
||||
RET
|
||||
|
|
|
|||
|
|
@ -7,7 +7,3 @@
|
|||
package field
|
||||
|
||||
func feMul(v, x, y *Element) { feMulGeneric(v, x, y) }
|
||||
|
||||
func feSquare(v, x *Element) { feSquareGeneric(v, x) }
|
||||
|
||||
func feSquareN(v, a *Element, n int) { feSquareNGeneric(v, a, n) }
|
||||
|
|
|
|||
|
|
@ -183,7 +183,7 @@ func feMulGeneric(v, a, b *Element) {
|
|||
v.l4 = rr4&maskLow51Bits + rr3>>51
|
||||
}
|
||||
|
||||
func feSquareGeneric(v, a *Element) {
|
||||
func feSquare(v, a *Element) {
|
||||
l0 := a.l0
|
||||
l1 := a.l1
|
||||
l2 := a.l2
|
||||
|
|
@ -256,9 +256,9 @@ func feSquareGeneric(v, a *Element) {
|
|||
v.l4 = rr4&maskLow51Bits + rr3>>51
|
||||
}
|
||||
|
||||
// feSquareNGeneric squares a n times and writes the result to v.
|
||||
// feSquareN squares a n times and writes the result to v.
|
||||
// It uses local variables to keep limbs in registers.
|
||||
func feSquareNGeneric(v, a *Element, n int) {
|
||||
func feSquareN(v, a *Element, n int) {
|
||||
l0 := a.l0
|
||||
l1 := a.l1
|
||||
l2 := a.l2
|
||||
|
|
|
|||
|
|
@ -1,14 +0,0 @@
|
|||
// Copyright (c) 2026 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build amd64 && !purego
|
||||
|
||||
package field
|
||||
|
||||
func feSquareN(v, a *Element, n int) {
|
||||
feSquare(v, a)
|
||||
for range n - 1 {
|
||||
feSquare(v, v)
|
||||
}
|
||||
}
|
||||
|
|
@ -514,51 +514,6 @@ func TestSquareN(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestFeSquareN(t *testing.T) {
|
||||
asmLikeGeneric := func(a Element) bool {
|
||||
for _, n := range []int{1, 2, 5, 10, 15, 50, 120} {
|
||||
t1 := a
|
||||
t2 := a
|
||||
|
||||
feSquareNGeneric(&t1, &t1, n)
|
||||
feSquareN(&t2, &t2, n)
|
||||
|
||||
if t1 != t2 {
|
||||
t.Logf("n=%d: got %#v, expected %#v", n, t2, t1)
|
||||
return false
|
||||
}
|
||||
if !isInBounds(&t2) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
if err := quick.Check(asmLikeGeneric, quickCheckConfig(1024)); err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFeSquare(t *testing.T) {
|
||||
asmLikeGeneric := func(a Element) bool {
|
||||
t1 := a
|
||||
t2 := a
|
||||
|
||||
feSquareGeneric(&t1, &t1)
|
||||
feSquare(&t2, &t2)
|
||||
|
||||
if t1 != t2 {
|
||||
t.Logf("got: %#v,\nexpected: %#v", t1, t2)
|
||||
}
|
||||
|
||||
return t1 == t2 && isInBounds(&t2)
|
||||
}
|
||||
|
||||
if err := quick.Check(asmLikeGeneric, quickCheckConfig(1024)); err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFeMul(t *testing.T) {
|
||||
asmLikeGeneric := func(a, b Element) bool {
|
||||
a1 := a
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue