crypto/internal/fips140/nistec: replace P-256 scalar assembly with fiat

Change-Id: I6a6a4656c554e26151dc73287b68d6665a824dc3
Reviewed-on: https://go-review.googlesource.com/c/go/+/669895
Auto-Submit: Filippo Valsorda <filippo@golang.org>
LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Neal Patel <neal@golang.org>
Reviewed-by: Neal Patel <nealpatel@google.com>
Reviewed-by: Roland Shoemaker <roland@golang.org>
This commit is contained in:
Filippo Valsorda 2025-05-05 12:09:13 +02:00 committed by Gopher Robot
parent 91a81e5ae1
commit be35de22f1
15 changed files with 1436 additions and 1669 deletions

View file

@ -103,7 +103,8 @@ func TestStmtLines(t *testing.T) {
if pkgname == "runtime" {
continue
}
if pkgname == "crypto/internal/fips140/nistec/fiat" {
if pkgname == "crypto/internal/fips140/nistec/fiat" ||
pkgname == "crypto/internal/fips140/nistec" {
continue // golang.org/issue/49372
}
if e.Val(dwarf.AttrStmtList) == nil {

View file

@ -142,6 +142,14 @@ func (x *Nat) Bits() []uint {
return x.limbs
}
// SetBits overwrites x with the value held in y and returns x.
//
// y is a slice of little-endian uint. x is resized to len(y) and the contents
// of y are copied into it, so later changes to y do not affect x. Nothing in
// this method compares the value against a modulus.
func (x *Nat) SetBits(y []uint) *Nat {
	n := len(y)
	x.reset(n)
	copy(x.limbs, y)
	return x
}
// Bytes returns x as a zero-extended big-endian byte slice. The size of the
// slice will match the size of m.
//

View file

@ -13,6 +13,7 @@ import (
"errors"
"hash"
"io"
"math/bits"
"sync"
)
@ -54,7 +55,7 @@ const (
type Curve[P Point[P]] struct {
curve curveID
newPoint func() P
ordInverse func([]byte) ([]byte, error)
ordInverse func(*[4]uint64)
N *bigmod.Modulus
nMinus2 []byte
}
@ -387,14 +388,11 @@ func signGeneric[P Point[P]](c *Curve[P], priv *PrivateKey, drbg *hmacDRBG, hash
// inverse sets kInv to the inverse of k modulo the order of the curve.
func inverse[P Point[P]](c *Curve[P], kInv, k *bigmod.Nat) {
if c.ordInverse != nil {
kBytes, err := c.ordInverse(k.Bytes(c.N))
// Some platforms don't implement ordInverse, and always return an error.
if err == nil {
_, err := kInv.SetBytes(kBytes, c.N)
if err != nil {
panic("ecdsa: internal error: ordInverse produced an invalid value")
}
if c.ordInverse != nil && bits.UintSize == 64 {
if kb := k.Bits(); len(kb) == 4 {
k64 := [4]uint64{uint64(kb[0]), uint64(kb[1]), uint64(kb[2]), uint64(kb[3])}
c.ordInverse(&k64)
kInv.SetBits([]uint{uint(k64[0]), uint(k64[1]), uint(k64[2]), uint(k64[3])})
return
}
}

View file

@ -1,11 +1,11 @@
module crypto/internal/fips140/nistec/_asm
go 1.24
go 1.26.0
require github.com/mmcloughlin/avo v0.6.0
require (
golang.org/x/mod v0.20.0 // indirect
golang.org/x/sync v0.8.0 // indirect
golang.org/x/tools v0.24.0 // indirect
golang.org/x/mod v0.33.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/tools v0.42.0 // indirect
)

View file

@ -2,7 +2,13 @@ github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY
github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8=
golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=

View file

@ -52,8 +52,6 @@ func main() {
p256FromMont()
p256Select()
p256SelectAffine()
p256OrdMul()
p256OrdSqr()
p256SubInternal()
p256MulInternal()
p256SqrInternal()
@ -832,583 +830,6 @@ func p256SelectAffine() {
RET()
}
// Implements:
//
// func p256OrdMul(res, in1, in2 *p256OrdElement)
//
// p256OrdMul emits the instruction stream for the routine above. The emitted
// code walks the four 64-bit limbs of in2 (y); for each limb it adds x*y[i]
// into a window of accumulators and then runs a "reduction step" that
// multiplies the lowest live accumulator by p256ordK0 and adds that multiple
// of the four p256ord words. After the fourth round the four remaining limbs
// are reduced once more by a conditional subtraction of p256ord and written
// to res.
//
// acc0..acc5 are used as a rotating window: an accumulator consumed by a
// reduction step is reused afterwards as a high carry word, which relies on
// the reduction step having left it at zero.
func p256OrdMul() {
Implement("p256OrdMul")
Attributes(NOSPLIT)
Load(Param("res"), res_ptr)
Load(Param("in1"), x_ptr)
Load(Param("in2"), y_ptr)
Comment("x * y[0]")
// The first partial product initialises acc0..acc4 directly.
MOVQ(Mem{Base: y_ptr}.Offset(8*0), t0_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
MULQ(t0_v1)
MOVQ(RAX, acc0_v1)
MOVQ(RDX, acc1_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc1_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, acc2_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc2_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, acc3_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc3_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, acc4_v1)
// acc5 starts at zero and collects the carry out of the reduction below.
XORQ(acc5_v1, acc5_v1)
Comment("First reduction step")
// t0 = low 64 bits of acc0 * p256ordK0; t0 * p256ord is then added limb by
// limb starting at acc0.
MOVQ(acc0_v1, RAX)
p256ordK0 := p256ordK0_DATA()
MULQ(p256ordK0)
MOVQ(RAX, t0_v1)
p256ord := p256ord_DATA()
MOVQ(p256ord.Offset(0x00), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc0_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x08), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc1_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc1_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x10), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc2_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc2_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x18), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc3_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc3_v1)
ADCQ(RDX, acc4_v1)
ADCQ(Imm(0), acc5_v1)
Comment("x * y[1]")
// From here on partial products are added into the existing accumulators,
// shifted up by one limb per round.
MOVQ(Mem{Base: y_ptr}.Offset(8*1), t0_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc1_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc2_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc2_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc3_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc3_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc4_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc4_v1)
ADCQ(RDX, acc5_v1)
ADCQ(Imm(0), acc0_v1)
Comment("Second reduction step")
MOVQ(acc1_v1, RAX)
MULQ(p256ordK0)
MOVQ(RAX, t0_v1)
MOVQ(p256ord.Offset(0x00), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc1_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x08), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc2_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc2_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x10), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc3_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc3_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x18), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc4_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc4_v1)
ADCQ(RDX, acc5_v1)
ADCQ(Imm(0), acc0_v1)
Comment("x * y[2]")
MOVQ(Mem{Base: y_ptr}.Offset(8*2), t0_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc2_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc3_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc3_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc4_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc4_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc5_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc5_v1)
ADCQ(RDX, acc0_v1)
ADCQ(Imm(0), acc1_v1)
Comment("Third reduction step")
MOVQ(acc2_v1, RAX)
MULQ(p256ordK0)
MOVQ(RAX, t0_v1)
MOVQ(p256ord.Offset(0x00), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc2_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x08), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc3_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc3_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x10), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc4_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc4_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x18), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc5_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc5_v1)
ADCQ(RDX, acc0_v1)
ADCQ(Imm(0), acc1_v1)
Comment("x * y[3]")
MOVQ(Mem{Base: y_ptr}.Offset(8*3), t0_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc3_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc4_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc4_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc5_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc5_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc0_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc0_v1)
ADCQ(RDX, acc1_v1)
ADCQ(Imm(0), acc2_v1)
Comment("Last reduction step")
MOVQ(acc3_v1, RAX)
MULQ(p256ordK0)
MOVQ(RAX, t0_v1)
MOVQ(p256ord.Offset(0x00), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc3_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x08), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc4_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc4_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x10), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc5_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc5_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x18), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc0_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc0_v1)
ADCQ(RDX, acc1_v1)
ADCQ(Imm(0), acc2_v1)
Comment("Copy result [255:0]")
// The result now lives in acc4, acc5, acc0, acc1 with acc2 as the overflow
// word. The inputs are no longer read, so x_ptr doubles as scratch for the
// saved copy.
MOVQ(acc4_v1, x_ptr)
MOVQ(acc5_v1, acc3_v1)
MOVQ(acc0_v1, t0_v1)
MOVQ(acc1_v1, t1_v1)
Comment("Subtract p256")
// Despite the label, the value subtracted is the four words of p256ord.
SUBQ(p256ord.Offset(0x00), acc4_v1)
SBBQ(p256ord.Offset(0x08), acc5_v1)
SBBQ(p256ord.Offset(0x10), acc0_v1)
SBBQ(p256ord.Offset(0x18), acc1_v1)
SBBQ(Imm(0), acc2_v1)
// If the subtraction borrowed, restore the saved copy. Conditional moves are
// used for the selection instead of a branch.
CMOVQCS(x_ptr, acc4_v1)
CMOVQCS(acc3_v1, acc5_v1)
CMOVQCS(t0_v1, acc0_v1)
CMOVQCS(t1_v1, acc1_v1)
MOVQ(acc4_v1, Mem{Base: res_ptr}.Offset(8*0))
MOVQ(acc5_v1, Mem{Base: res_ptr}.Offset(8*1))
MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*2))
MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*3))
RET()
}
// Implements:
//
// func p256OrdSqr(res, in *p256OrdElement, n int)
//
// p256OrdSqr emits the instruction stream for the routine above. The emitted
// code is a loop labelled ordSqrLoop that squares the four-limb value at
// x_ptr, reduces it with four reduction steps using p256ordK0 and p256ord,
// conditionally subtracts p256ord once, and stores the result to res. At the
// bottom of the loop x_ptr is pointed at res, so each further pass squares
// the previous result; the loop runs n times in total.
//
// The loop body executes before the counter is decremented and tested, so
// the emitted code expects n >= 1.
func p256OrdSqr() {
Implement("p256OrdSqr")
Attributes(NOSPLIT)
Load(Param("res"), res_ptr)
Load(Param("in"), x_ptr)
Load(Param("n"), RBX)
Label("ordSqrLoop")
// The cross products y[i]*y[j] for i < j are accumulated once in
// acc1..acc5 and y_ptr; they are doubled further down.
Comment("y[1:] * y[0]")
MOVQ(Mem{Base: x_ptr}.Offset(8*0), t0_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
MULQ(t0_v1)
MOVQ(RAX, acc1_v1)
MOVQ(RDX, acc2_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc2_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, acc3_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc3_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, acc4_v1)
Comment("y[2:] * y[1]")
MOVQ(Mem{Base: x_ptr}.Offset(8*1), t0_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc3_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc4_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc4_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, acc5_v1)
Comment("y[3] * y[2]")
MOVQ(Mem{Base: x_ptr}.Offset(8*2), t0_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc5_v1)
ADCQ(Imm(0), RDX)
// This routine has no second input, so y_ptr is used as a seventh
// accumulator limb.
MOVQ(RDX, y_ptr)
XORQ(t1_v1, t1_v1)
Comment("*2")
// Double the cross products; t1 receives the bit shifted out of the top.
ADDQ(acc1_v1, acc1_v1)
ADCQ(acc2_v1, acc2_v1)
ADCQ(acc3_v1, acc3_v1)
ADCQ(acc4_v1, acc4_v1)
ADCQ(acc5_v1, acc5_v1)
ADCQ(y_ptr, y_ptr)
ADCQ(Imm(0), t1_v1)
Comment("Missing products")
// Add the diagonal terms y[i]*y[i].
MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
MULQ(RAX)
MOVQ(RAX, acc0_v1)
MOVQ(RDX, t0_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
MULQ(RAX)
ADDQ(t0_v1, acc1_v1)
ADCQ(RAX, acc2_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t0_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
MULQ(RAX)
ADDQ(t0_v1, acc3_v1)
ADCQ(RAX, acc4_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t0_v1)
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
MULQ(RAX)
ADDQ(t0_v1, acc5_v1)
ADCQ(RAX, y_ptr)
ADCQ(RDX, t1_v1)
// All reads of the input are done; x_ptr now carries the top limb of the
// 512-bit square until it is reloaded from res_ptr at the end of the pass.
MOVQ(t1_v1, x_ptr)
Comment("First reduction step")
// t0 = low 64 bits of acc0 * p256ordK0. The two low words of p256ord are
// multiplied with MULQ. The two high words are 0xffffffffffffffff and
// 0xffffffff00000000, so their products with t0 are formed from t0 itself
// using the subtractions and 32-bit shifts below instead of two more MULQs.
MOVQ(acc0_v1, RAX)
p256ordK0 := p256ordK0_DATA()
MULQ(p256ordK0)
MOVQ(RAX, t0_v1)
p256ord := p256ord_DATA()
MOVQ(p256ord.Offset(0x00), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc0_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x08), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc1_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc1_v1)
MOVQ(t0_v1, t1_v1)
ADCQ(RDX, acc2_v1)
ADCQ(Imm(0), t1_v1)
SUBQ(t0_v1, acc2_v1)
SBBQ(Imm(0), t1_v1)
MOVQ(t0_v1, RAX)
MOVQ(t0_v1, RDX)
MOVQ(t0_v1, acc0_v1)
SHLQ(Imm(32), RAX)
SHRQ(Imm(32), RDX)
ADDQ(t1_v1, acc3_v1)
ADCQ(Imm(0), acc0_v1)
SUBQ(RAX, acc3_v1)
SBBQ(RDX, acc0_v1)
Comment("Second reduction step")
MOVQ(acc1_v1, RAX)
MULQ(p256ordK0)
MOVQ(RAX, t0_v1)
MOVQ(p256ord.Offset(0x00), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc1_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x08), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc2_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc2_v1)
MOVQ(t0_v1, t1_v1)
ADCQ(RDX, acc3_v1)
ADCQ(Imm(0), t1_v1)
SUBQ(t0_v1, acc3_v1)
SBBQ(Imm(0), t1_v1)
MOVQ(t0_v1, RAX)
MOVQ(t0_v1, RDX)
MOVQ(t0_v1, acc1_v1)
SHLQ(Imm(32), RAX)
SHRQ(Imm(32), RDX)
ADDQ(t1_v1, acc0_v1)
ADCQ(Imm(0), acc1_v1)
SUBQ(RAX, acc0_v1)
SBBQ(RDX, acc1_v1)
Comment("Third reduction step")
MOVQ(acc2_v1, RAX)
MULQ(p256ordK0)
MOVQ(RAX, t0_v1)
MOVQ(p256ord.Offset(0x00), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc2_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x08), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc3_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc3_v1)
MOVQ(t0_v1, t1_v1)
ADCQ(RDX, acc0_v1)
ADCQ(Imm(0), t1_v1)
SUBQ(t0_v1, acc0_v1)
SBBQ(Imm(0), t1_v1)
MOVQ(t0_v1, RAX)
MOVQ(t0_v1, RDX)
MOVQ(t0_v1, acc2_v1)
SHLQ(Imm(32), RAX)
SHRQ(Imm(32), RDX)
ADDQ(t1_v1, acc1_v1)
ADCQ(Imm(0), acc2_v1)
SUBQ(RAX, acc1_v1)
SBBQ(RDX, acc2_v1)
Comment("Last reduction step")
MOVQ(acc3_v1, RAX)
MULQ(p256ordK0)
MOVQ(RAX, t0_v1)
MOVQ(p256ord.Offset(0x00), RAX)
MULQ(t0_v1)
ADDQ(RAX, acc3_v1)
ADCQ(Imm(0), RDX)
MOVQ(RDX, t1_v1)
MOVQ(p256ord.Offset(0x08), RAX)
MULQ(t0_v1)
ADDQ(t1_v1, acc0_v1)
ADCQ(Imm(0), RDX)
ADDQ(RAX, acc0_v1)
ADCQ(Imm(0), RDX)
// NOTE(review): unlike the three earlier steps, the carry is folded into RDX
// here, and the store to t1 on the next line is overwritten immediately by
// the MOVQ after it. Presumably kept to match the reference assembly; confirm
// before removing.
MOVQ(RDX, t1_v1)
MOVQ(t0_v1, t1_v1)
ADCQ(RDX, acc1_v1)
ADCQ(Imm(0), t1_v1)
SUBQ(t0_v1, acc1_v1)
SBBQ(Imm(0), t1_v1)
MOVQ(t0_v1, RAX)
MOVQ(t0_v1, RDX)
MOVQ(t0_v1, acc3_v1)
SHLQ(Imm(32), RAX)
SHRQ(Imm(32), RDX)
ADDQ(t1_v1, acc2_v1)
ADCQ(Imm(0), acc3_v1)
SUBQ(RAX, acc2_v1)
SBBQ(RDX, acc3_v1)
// XORQ zeroes t0 and also clears the carry flag, so the first ADCQ below
// adds without an incoming carry.
XORQ(t0_v1, t0_v1)
Comment("Add bits [511:256] of the sqr result")
ADCQ(acc4_v1, acc0_v1)
ADCQ(acc5_v1, acc1_v1)
ADCQ(y_ptr, acc2_v1)
ADCQ(x_ptr, acc3_v1)
ADCQ(Imm(0), t0_v1)
// Keep a copy of the unsubtracted value for the conditional select below.
MOVQ(acc0_v1, acc4_v1)
MOVQ(acc1_v1, acc5_v1)
MOVQ(acc2_v1, y_ptr)
MOVQ(acc3_v1, t1_v1)
Comment("Subtract p256")
// Despite the label, the value subtracted is the four words of p256ord.
SUBQ(p256ord.Offset(0x00), acc0_v1)
SBBQ(p256ord.Offset(0x08), acc1_v1)
SBBQ(p256ord.Offset(0x10), acc2_v1)
SBBQ(p256ord.Offset(0x18), acc3_v1)
SBBQ(Imm(0), t0_v1)
// If the subtraction borrowed, restore the saved copy.
CMOVQCS(acc4_v1, acc0_v1)
CMOVQCS(acc5_v1, acc1_v1)
CMOVQCS(y_ptr, acc2_v1)
CMOVQCS(t1_v1, acc3_v1)
MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*0))
MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*1))
MOVQ(acc2_v1, Mem{Base: res_ptr}.Offset(8*2))
MOVQ(acc3_v1, Mem{Base: res_ptr}.Offset(8*3))
// Later passes read their input from res.
MOVQ(res_ptr, x_ptr)
DECQ(RBX)
JNE(LabelRef("ordSqrLoop"))
RET()
}
// These variables have been versioned as they get redefined in the reference implementation.
// This is done to produce a minimal semantic diff.
var (
@ -2595,7 +2016,7 @@ func p256PointDoubleAsm() {
// #----------------------------DATA SECTION-----------------------------------##
// Pointers for memoizing Data section symbols
var p256const0_ptr, p256const1_ptr, p256ordK0_ptr, p256ord_ptr, p256one_ptr *Mem
var p256const0_ptr, p256const1_ptr, p256one_ptr *Mem
func p256const0_DATA() Mem {
if p256const0_ptr != nil {
@ -2619,39 +2040,6 @@ func p256const1_DATA() Mem {
return p256const1
}
// p256ordK0_DATA returns the memory operand for the "p256ordK0" data symbol.
//
// The first call declares the symbol, emits its single 64-bit word and caches
// the operand in p256ordK0_ptr; every later call returns the cached operand
// without emitting anything.
func p256ordK0_DATA() Mem {
	if p256ordK0_ptr == nil {
		sym := GLOBL("p256ordK0", 8)
		p256ordK0_ptr = &sym
		DATA(0, U64(0xccd1c8aaee00bc4f))
	}
	return *p256ordK0_ptr
}
// p256ordConstants lists the four 64-bit words emitted into the "p256ord"
// data symbol, in increasing offset order.
var p256ordConstants = [4]uint64{
	0xf3b9cac2fc632551,
	0xbce6faada7179e84,
	0xffffffffffffffff,
	0xffffffff00000000,
}

// p256ord_DATA returns the memory operand for the "p256ord" data symbol.
//
// The first call declares the symbol, emits the words of p256ordConstants at
// consecutive 8-byte offsets and caches the operand in p256ord_ptr; every
// later call returns the cached operand without emitting anything.
func p256ord_DATA() Mem {
	if p256ord_ptr == nil {
		sym := GLOBL("p256ord", 8)
		p256ord_ptr = &sym
		for idx := range p256ordConstants {
			DATA(idx*8, U64(p256ordConstants[idx]))
		}
	}
	return *p256ord_ptr
}
var p256oneConstants = [4]uint64{
0x0000000000000001,
0xffffffff00000000,

View file

@ -392,8 +392,8 @@ func (q *P256Point) Select(p1, p2 *P256Point, cond int) *P256Point {
return q
}
// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the
// Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order.
// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1]
// as four uint64 limbs in little-endian order.
type p256OrdElement [4]uint64
// SetBytes sets s to the big-endian value of x, reducing it as necessary.

View file

@ -76,9 +76,10 @@ const p256CompressedLength = 1 + p256ElementLength
// the curve, it returns nil and an error, and the receiver is unchanged.
// Otherwise, it returns p.
func (p *P256Point) SetBytes(b []byte) (*P256Point, error) {
// p256Mul operates in the Montgomery domain with R = 2²⁵⁶ mod p. Thus rr
// here is R in the Montgomery domain, or R×R mod p. See comment in
// P256OrdInverse about how this is used.
// This implementation operates in the Montgomery domain with R = 2²⁵⁶ mod
// p. Elements in the Montgomery domain take the form a×R and p256Mul
// calculates (a × b × R⁻¹) mod p. rr is R in the domain, or R×R mod p, thus
// p256Mul(e, RR) gives e×R, i.e. converts e into the Montgomery domain.
rr := p256Element{0x0000000000000003, 0xfffffffbffffffff,
0xfffffffffffffffe, 0x00000004fffffffd}
@ -282,7 +283,7 @@ func p256Mul(res, in1, in2 *p256Element)
//go:noescape
func p256Sqr(res, in *p256Element, n int)
// Montgomery multiplication by R⁻¹, or 1 outside the domain.
// Montgomery multiplication by R⁻¹, or 1 outside the domain, as R⁻¹×R = 1.
// Sets res = in * R⁻¹, bringing res out of the Montgomery domain.
//
//go:noescape
@ -365,8 +366,8 @@ func p256PointAddAsm(res, in1, in2 *P256Point) int
//go:noescape
func p256PointDoubleAsm(res, in *P256Point)
// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the
// Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order.
// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1]
// as four uint64 limbs in little-endian order.
type p256OrdElement [4]uint64
// p256OrdReduce ensures s is in the range [0, ord(G)-1].

View file

@ -640,509 +640,6 @@ loop_select_base:
MOVOU X3, 48(DX)
RET
// func p256OrdMul(res *p256OrdElement, in1 *p256OrdElement, in2 *p256OrdElement)
// Requires: CMOV
//
// Register use: DI = res, SI = in1 (x), CX = in2 (y). R8-R13 form a rotating
// window of accumulators, R14 holds the current multiplier and R15 the carry
// word between limbs.
//
// For each limb y[i] the routine adds x*y[i] into the accumulators and then
// runs a reduction step: R14 = low 64 bits of (lowest accumulator * p256ordK0),
// and R14 * p256ord is added in. After four rounds the result is reduced once
// more by a conditional subtraction of p256ord and stored to res.
TEXT ·p256OrdMul(SB), NOSPLIT, $0-24
MOVQ res+0(FP), DI
MOVQ in1+8(FP), SI
MOVQ in2+16(FP), CX
// x * y[0]
MOVQ (CX), R14
MOVQ (SI), AX
MULQ R14
MOVQ AX, R8
MOVQ DX, R9
MOVQ 8(SI), AX
MULQ R14
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R10
MOVQ 16(SI), AX
MULQ R14
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R11
MOVQ 24(SI), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R12
XORQ R13, R13
// First reduction step
MOVQ R8, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R8
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R9
ADCQ $0x00, DX
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+16(SB), AX
MULQ R14
ADDQ R15, R10
ADCQ $0x00, DX
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+24(SB), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ DX, R12
ADCQ $0x00, R13
// x * y[1]
MOVQ 8(CX), R14
MOVQ (SI), AX
MULQ R14
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 8(SI), AX
MULQ R14
ADDQ R15, R10
ADCQ $0x00, DX
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 16(SI), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 24(SI), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ DX, R13
ADCQ $0x00, R8
// Second reduction step
MOVQ R9, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R10
ADCQ $0x00, DX
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+16(SB), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+24(SB), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ DX, R13
ADCQ $0x00, R8
// x * y[2]
MOVQ 16(CX), R14
MOVQ (SI), AX
MULQ R14
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 8(SI), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 16(SI), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 24(SI), AX
MULQ R14
ADDQ R15, R13
ADCQ $0x00, DX
ADDQ AX, R13
ADCQ DX, R8
ADCQ $0x00, R9
// Third reduction step
MOVQ R10, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+16(SB), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+24(SB), AX
MULQ R14
ADDQ R15, R13
ADCQ $0x00, DX
ADDQ AX, R13
ADCQ DX, R8
ADCQ $0x00, R9
// x * y[3]
MOVQ 24(CX), R14
MOVQ (SI), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 8(SI), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 16(SI), AX
MULQ R14
ADDQ R15, R13
ADCQ $0x00, DX
ADDQ AX, R13
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 24(SI), AX
MULQ R14
ADDQ R15, R8
ADCQ $0x00, DX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0x00, R10
// Last reduction step
MOVQ R11, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+16(SB), AX
MULQ R14
ADDQ R15, R13
ADCQ $0x00, DX
ADDQ AX, R13
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+24(SB), AX
MULQ R14
ADDQ R15, R8
ADCQ $0x00, DX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0x00, R10
// Copy result [255:0]
// (SI is free from here on and holds part of the saved copy.)
MOVQ R12, SI
MOVQ R13, R11
MOVQ R8, R14
MOVQ R9, R15
// Subtract p256
// (the value subtracted is p256ord; a borrow restores the saved copy)
SUBQ p256ord<>+0(SB), R12
SBBQ p256ord<>+8(SB), R13
SBBQ p256ord<>+16(SB), R8
SBBQ p256ord<>+24(SB), R9
SBBQ $0x00, R10
CMOVQCS SI, R12
CMOVQCS R11, R13
CMOVQCS R14, R8
CMOVQCS R15, R9
MOVQ R12, (DI)
MOVQ R13, 8(DI)
MOVQ R8, 16(DI)
MOVQ R9, 24(DI)
RET
// p256ordK0 is the single 64-bit multiplier used at the start of every
// reduction step in p256OrdMul and p256OrdSqr.
DATA p256ordK0<>+0(SB)/8, $0xccd1c8aaee00bc4f
GLOBL p256ordK0<>(SB), RODATA, $8
// p256ord is the 256-bit value added in by each reduction step and
// conditionally subtracted at the end, stored as four 64-bit words with the
// least significant word at offset 0.
DATA p256ord<>+0(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+8(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+16(SB)/8, $0xffffffffffffffff
DATA p256ord<>+24(SB)/8, $0xffffffff00000000
GLOBL p256ord<>(SB), RODATA, $32
// func p256OrdSqr(res *p256OrdElement, in *p256OrdElement, n int)
// Requires: CMOV
//
// Register use: DI = res, SI = current input, BX = remaining iterations.
// R8-R13 and CX hold the limbs of the square; SI is reused for the top limb
// once the input has been read and is reloaded from DI at the end of each
// pass, so later passes square the previous result.
//
// The loop body runs before BX is decremented and tested, so n must be at
// least 1.
//
// In each reduction step the two low words of p256ord are multiplied with
// MULQ; the products with the two high words (0xffffffffffffffff and
// 0xffffffff00000000) are formed from R14 with subtractions and 32-bit shifts.
TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24
MOVQ res+0(FP), DI
MOVQ in+8(FP), SI
MOVQ n+16(FP), BX
ordSqrLoop:
// y[1:] * y[0]
MOVQ (SI), R14
MOVQ 8(SI), AX
MULQ R14
MOVQ AX, R9
MOVQ DX, R10
MOVQ 16(SI), AX
MULQ R14
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R11
MOVQ 24(SI), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R12
// y[2:] * y[1]
MOVQ 8(SI), R14
MOVQ 16(SI), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ 24(SI), AX
MULQ R14
ADDQ R15, R12
ADCQ $0x00, DX
ADDQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R13
// y[3] * y[2]
MOVQ 16(SI), R14
MOVQ 24(SI), AX
MULQ R14
ADDQ AX, R13
ADCQ $0x00, DX
MOVQ DX, CX
XORQ R15, R15
// *2
ADDQ R9, R9
ADCQ R10, R10
ADCQ R11, R11
ADCQ R12, R12
ADCQ R13, R13
ADCQ CX, CX
ADCQ $0x00, R15
// Missing products
MOVQ (SI), AX
MULQ AX
MOVQ AX, R8
MOVQ DX, R14
MOVQ 8(SI), AX
MULQ AX
ADDQ R14, R9
ADCQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R14
MOVQ 16(SI), AX
MULQ AX
ADDQ R14, R11
ADCQ AX, R12
ADCQ $0x00, DX
MOVQ DX, R14
MOVQ 24(SI), AX
MULQ AX
ADDQ R14, R13
ADCQ AX, CX
ADCQ DX, R15
MOVQ R15, SI
// First reduction step
MOVQ R8, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R8
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R9
ADCQ $0x00, DX
ADDQ AX, R9
MOVQ R14, R15
ADCQ DX, R10
ADCQ $0x00, R15
SUBQ R14, R10
SBBQ $0x00, R15
MOVQ R14, AX
MOVQ R14, DX
MOVQ R14, R8
SHLQ $0x20, AX
SHRQ $0x20, DX
ADDQ R15, R11
ADCQ $0x00, R8
SUBQ AX, R11
SBBQ DX, R8
// Second reduction step
MOVQ R9, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R9
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R10
ADCQ $0x00, DX
ADDQ AX, R10
MOVQ R14, R15
ADCQ DX, R11
ADCQ $0x00, R15
SUBQ R14, R11
SBBQ $0x00, R15
MOVQ R14, AX
MOVQ R14, DX
MOVQ R14, R9
SHLQ $0x20, AX
SHRQ $0x20, DX
ADDQ R15, R8
ADCQ $0x00, R9
SUBQ AX, R8
SBBQ DX, R9
// Third reduction step
MOVQ R10, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R10
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R11
ADCQ $0x00, DX
ADDQ AX, R11
MOVQ R14, R15
ADCQ DX, R8
ADCQ $0x00, R15
SUBQ R14, R8
SBBQ $0x00, R15
MOVQ R14, AX
MOVQ R14, DX
MOVQ R14, R10
SHLQ $0x20, AX
SHRQ $0x20, DX
ADDQ R15, R9
ADCQ $0x00, R10
SUBQ AX, R9
SBBQ DX, R10
// Last reduction step
MOVQ R11, AX
MULQ p256ordK0<>+0(SB)
MOVQ AX, R14
MOVQ p256ord<>+0(SB), AX
MULQ R14
ADDQ AX, R11
ADCQ $0x00, DX
MOVQ DX, R15
MOVQ p256ord<>+8(SB), AX
MULQ R14
ADDQ R15, R8
ADCQ $0x00, DX
ADDQ AX, R8
ADCQ $0x00, DX
// NOTE(review): the store to R15 on the next line is overwritten by the
// MOVQ that follows it.
MOVQ DX, R15
MOVQ R14, R15
ADCQ DX, R9
ADCQ $0x00, R15
SUBQ R14, R9
SBBQ $0x00, R15
MOVQ R14, AX
MOVQ R14, DX
MOVQ R14, R11
SHLQ $0x20, AX
SHRQ $0x20, DX
ADDQ R15, R10
ADCQ $0x00, R11
SUBQ AX, R10
SBBQ DX, R11
// XORQ also clears the carry flag, so the first ADCQ below has no incoming carry.
XORQ R14, R14
// Add bits [511:256] of the sqr result
ADCQ R12, R8
ADCQ R13, R9
ADCQ CX, R10
ADCQ SI, R11
ADCQ $0x00, R14
MOVQ R8, R12
MOVQ R9, R13
MOVQ R10, CX
MOVQ R11, R15
// Subtract p256
// (the value subtracted is p256ord; a borrow restores the saved copy)
SUBQ p256ord<>+0(SB), R8
SBBQ p256ord<>+8(SB), R9
SBBQ p256ord<>+16(SB), R10
SBBQ p256ord<>+24(SB), R11
SBBQ $0x00, R14
CMOVQCS R12, R8
CMOVQCS R13, R9
CMOVQCS CX, R10
CMOVQCS R15, R11
MOVQ R8, (DI)
MOVQ R9, 8(DI)
MOVQ R10, 16(DI)
MOVQ R11, 24(DI)
MOVQ DI, SI
DECQ BX
JNE ordSqrLoop
RET
// func p256SubInternal()
// Requires: CMOV
TEXT p256SubInternal(SB), NOSPLIT, $0

View file

@ -45,24 +45,14 @@
#define y2 R25
#define y3 R26
#define const2 t2
#define const3 t3
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32
/* ---------------------------------------*/
@ -343,425 +333,10 @@ loop_select:
STP (y2, y3), 3*16(res_ptr)
RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in *p256OrdElement, n int)
//
// Squares the four-limb value at in, n times in a row, reducing after every
// squaring with p256ordK0 (held in hlp1) and the four words of p256ord (held
// in const0-const3). The running value stays in x0-x3 between passes and is
// stored to res once, after the loop.
//
// b_ptr is the loop counter. It is decremented at the top of the body and
// tested at the bottom, so n must be at least 1.
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
MOVD in+8(FP), a_ptr
MOVD n+16(FP), b_ptr
MOVD p256ordK0<>(SB), hlp1
LDP p256ord<>+0x00(SB), (const0, const1)
LDP p256ord<>+0x10(SB), (const2, const3)
LDP 0*16(a_ptr), (x0, x1)
LDP 1*16(a_ptr), (x2, x3)
ordSqrLoop:
SUB $1, b_ptr
// x[1:] * x[0]
MUL x0, x1, acc1
UMULH x0, x1, acc2
MUL x0, x2, t0
ADDS t0, acc2, acc2
UMULH x0, x2, acc3
MUL x0, x3, t0
ADCS t0, acc3, acc3
UMULH x0, x3, acc4
ADC $0, acc4, acc4
// x[2:] * x[1]
MUL x1, x2, t0
ADDS t0, acc3
UMULH x1, x2, t1
ADCS t1, acc4
ADC $0, ZR, acc5
MUL x1, x3, t0
ADDS t0, acc4
UMULH x1, x3, t1
ADC t1, acc5
// x[3] * x[2]
MUL x2, x3, t0
ADDS t0, acc5
UMULH x2, x3, acc6
ADC $0, acc6
MOVD $0, acc7
// *2
ADDS acc1, acc1
ADCS acc2, acc2
ADCS acc3, acc3
ADCS acc4, acc4
ADCS acc5, acc5
ADCS acc6, acc6
ADC $0, acc7
// Missing products
MUL x0, x0, acc0
UMULH x0, x0, t0
ADDS t0, acc1, acc1
MUL x1, x1, t0
ADCS t0, acc2, acc2
UMULH x1, x1, t1
ADCS t1, acc3, acc3
MUL x2, x2, t0
ADCS t0, acc4, acc4
UMULH x2, x2, t1
ADCS t1, acc5, acc5
MUL x3, x3, t0
ADCS t0, acc6, acc6
UMULH x3, x3, t1
ADC t1, acc7, acc7
// First reduction step
// hlp0 = low 64 bits of acc0 * K0.
// NOTE(review): the second MUL uses hlp1 (K0), not hlp0. Only the carry out
// of the ADDS that follows is consumed, and acc0 is overwritten by the
// UMULH const2 below; presumably this relies on const0*K0 being -1 mod 2^64.
// The same pattern repeats in every reduction step. Confirm.
MUL acc0, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc0, acc0
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc1, acc1
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc2, acc2
UMULH const2, hlp0, acc0
MUL const3, hlp0, t0
ADCS t0, acc3, acc3
UMULH const3, hlp0, hlp0
ADC $0, hlp0
ADDS t1, acc1, acc1
ADCS y0, acc2, acc2
ADCS acc0, acc3, acc3
ADC $0, hlp0, acc0
// Second reduction step
MUL acc1, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc1, acc1
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc2, acc2
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc3, acc3
UMULH const2, hlp0, acc1
MUL const3, hlp0, t0
ADCS t0, acc0, acc0
UMULH const3, hlp0, hlp0
ADC $0, hlp0
ADDS t1, acc2, acc2
ADCS y0, acc3, acc3
ADCS acc1, acc0, acc0
ADC $0, hlp0, acc1
// Third reduction step
MUL acc2, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc2, acc2
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc3, acc3
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc0, acc0
UMULH const2, hlp0, acc2
MUL const3, hlp0, t0
ADCS t0, acc1, acc1
UMULH const3, hlp0, hlp0
ADC $0, hlp0
ADDS t1, acc3, acc3
ADCS y0, acc0, acc0
ADCS acc2, acc1, acc1
ADC $0, hlp0, acc2
// Last reduction step
// Unlike the earlier steps, the carry out of the low-word chain is added to
// acc7 here rather than to hlp0.
MUL acc3, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc3, acc3
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc0, acc0
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc1, acc1
UMULH const2, hlp0, acc3
MUL const3, hlp0, t0
ADCS t0, acc2, acc2
UMULH const3, hlp0, hlp0
ADC $0, acc7
ADDS t1, acc0, acc0
ADCS y0, acc1, acc1
ADCS acc3, acc2, acc2
ADC $0, hlp0, acc3
// Add the upper four limbs of the square (acc4-acc7); acc4 then becomes the
// overflow word.
ADDS acc4, acc0, acc0
ADCS acc5, acc1, acc1
ADCS acc6, acc2, acc2
ADCS acc7, acc3, acc3
ADC $0, ZR, acc4
// Subtract p256ord into y0-y3. CSEL then picks y0-y3 when CS holds after the
// subtraction and acc0-acc3 otherwise; the result becomes the next x0-x3.
SUBS const0, acc0, y0
SBCS const1, acc1, y1
SBCS const2, acc2, y2
SBCS const3, acc3, y3
SBCS $0, acc4, acc4
CSEL CS, y0, acc0, x0
CSEL CS, y1, acc1, x1
CSEL CS, y2, acc2, x2
CSEL CS, y3, acc3, x3
CBNZ b_ptr, ordSqrLoop
MOVD res+0(FP), res_ptr
STP (x0, x1), 0*16(res_ptr)
STP (x2, x3), 1*16(res_ptr)
RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 *p256OrdElement)
//
// Multiplies the four-limb values at in1 (x0-x3) and in2 (y0-y3). Each
// partial product y[i] * x is followed by a reduction step that uses
// p256ordK0 (held in hlp1) and the four words of p256ord (held in
// const0-const3). The result is reduced once more by a conditional
// subtraction of p256ord and stored to res.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
MOVD in1+8(FP), a_ptr
MOVD in2+16(FP), b_ptr
MOVD p256ordK0<>(SB), hlp1
LDP p256ord<>+0x00(SB), (const0, const1)
LDP p256ord<>+0x10(SB), (const2, const3)
LDP 0*16(a_ptr), (x0, x1)
LDP 1*16(a_ptr), (x2, x3)
LDP 0*16(b_ptr), (y0, y1)
LDP 1*16(b_ptr), (y2, y3)
// y[0] * x
MUL y0, x0, acc0
UMULH y0, x0, acc1
MUL y0, x1, t0
ADDS t0, acc1
UMULH y0, x1, acc2
MUL y0, x2, t0
ADCS t0, acc2
UMULH y0, x2, acc3
MUL y0, x3, t0
ADCS t0, acc3
UMULH y0, x3, acc4
ADC $0, acc4
// First reduction step
// hlp0 = low 64 bits of acc0 * K0. y0 has been consumed above and is used as
// scratch from here on.
// NOTE(review): the second MUL uses hlp1 (K0), not hlp0. Only the carry out
// of the ADDS that follows is consumed, and acc0 is overwritten by the
// UMULH const2 below; presumably this relies on const0*K0 being -1 mod 2^64.
// The same pattern repeats in every reduction step. Confirm.
MUL acc0, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc0, acc0
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc1, acc1
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc2, acc2
UMULH const2, hlp0, acc0
MUL const3, hlp0, t0
ADCS t0, acc3, acc3
UMULH const3, hlp0, hlp0
ADC $0, acc4
ADDS t1, acc1, acc1
ADCS y0, acc2, acc2
ADCS acc0, acc3, acc3
ADC $0, hlp0, acc0
// y[1] * x
MUL y1, x0, t0
ADDS t0, acc1
UMULH y1, x0, t1
MUL y1, x1, t0
ADCS t0, acc2
UMULH y1, x1, hlp0
MUL y1, x2, t0
ADCS t0, acc3
UMULH y1, x2, y0
MUL y1, x3, t0
ADCS t0, acc4
UMULH y1, x3, y1
ADC $0, ZR, acc5
ADDS t1, acc2
ADCS hlp0, acc3
ADCS y0, acc4
ADC y1, acc5
// Second reduction step
MUL acc1, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc1, acc1
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc2, acc2
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc3, acc3
UMULH const2, hlp0, acc1
MUL const3, hlp0, t0
ADCS t0, acc0, acc0
UMULH const3, hlp0, hlp0
ADC $0, acc5
ADDS t1, acc2, acc2
ADCS y0, acc3, acc3
ADCS acc1, acc0, acc0
ADC $0, hlp0, acc1
// y[2] * x
// (y1 was overwritten above and now serves as scratch for the top word)
MUL y2, x0, t0
ADDS t0, acc2
UMULH y2, x0, t1
MUL y2, x1, t0
ADCS t0, acc3
UMULH y2, x1, hlp0
MUL y2, x2, t0
ADCS t0, acc4
UMULH y2, x2, y0
MUL y2, x3, t0
ADCS t0, acc5
UMULH y2, x3, y1
ADC $0, ZR, acc6
ADDS t1, acc3
ADCS hlp0, acc4
ADCS y0, acc5
ADC y1, acc6
// Third reduction step
MUL acc2, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc2, acc2
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc3, acc3
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc0, acc0
UMULH const2, hlp0, acc2
MUL const3, hlp0, t0
ADCS t0, acc1, acc1
UMULH const3, hlp0, hlp0
ADC $0, acc6
ADDS t1, acc3, acc3
ADCS y0, acc0, acc0
ADCS acc2, acc1, acc1
ADC $0, hlp0, acc2
// y[3] * x
MUL y3, x0, t0
ADDS t0, acc3
UMULH y3, x0, t1
MUL y3, x1, t0
ADCS t0, acc4
UMULH y3, x1, hlp0
MUL y3, x2, t0
ADCS t0, acc5
UMULH y3, x2, y0
MUL y3, x3, t0
ADCS t0, acc6
UMULH y3, x3, y1
ADC $0, ZR, acc7
ADDS t1, acc4
ADCS hlp0, acc5
ADCS y0, acc6
ADC y1, acc7
// Last reduction step
MUL acc3, hlp1, hlp0
MUL const0, hlp1, t0
ADDS t0, acc3, acc3
UMULH const0, hlp0, t1
MUL const1, hlp0, t0
ADCS t0, acc0, acc0
UMULH const1, hlp0, y0
MUL const2, hlp0, t0
ADCS t0, acc1, acc1
UMULH const2, hlp0, acc3
MUL const3, hlp0, t0
ADCS t0, acc2, acc2
UMULH const3, hlp0, hlp0
ADC $0, acc7
ADDS t1, acc0, acc0
ADCS y0, acc1, acc1
ADCS acc3, acc2, acc2
ADC $0, hlp0, acc3
// Add the upper limbs (acc4-acc7); acc4 then becomes the overflow word.
ADDS acc4, acc0, acc0
ADCS acc5, acc1, acc1
ADCS acc6, acc2, acc2
ADCS acc7, acc3, acc3
ADC $0, ZR, acc4
// Subtract p256ord into t0-t3. const2 and const3 are #define aliases of t2
// and t3, so the third and fourth subtractions overwrite those constants;
// they are not read again afterwards.
SUBS const0, acc0, t0
SBCS const1, acc1, t1
SBCS const2, acc2, t2
SBCS const3, acc3, t3
SBCS $0, acc4, acc4
// Pick t0-t3 when CS holds after the subtraction, acc0-acc3 otherwise.
CSEL CS, t0, acc0, acc0
CSEL CS, t1, acc1, acc1
CSEL CS, t2, acc2, acc2
CSEL CS, t3, acc3, acc3
MOVD res+0(FP), res_ptr
STP (acc0, acc1), 0*16(res_ptr)
STP (acc2, acc3), 1*16(res_ptr)
RET
/* ---------------------------------------*/
// input: x0-x3, y0-y3
// output: x0-x3
// uses: const0, const1
// clobbers: y0-y3, hlp0
TEXT p256SubInternal<>(SB),NOSPLIT,$0
SUBS x0, y0, acc0
SBCS x1, y1, acc1

File diff suppressed because it is too large Load diff

View file

@ -1,13 +0,0 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (!amd64 && !arm64) || purego
package nistec
import "errors"
// P256OrdInverse is the placeholder variant selected by this file's build
// constraint. It never looks at k: every call yields a nil result together
// with an "unimplemented" error, which callers are expected to treat as
// "not available in this build".
func P256OrdInverse(k []byte) ([]byte, error) {
	err := errors.New("unimplemented")
	return nil, err
}

View file

@ -0,0 +1,30 @@
// Copyright 2026 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build fips140v1.0 || fips140v1.26
package fipstest
import (
"crypto/internal/fips140/nistec"
"internal/goarch"
"testing"
)
// package nistec
// func P256OrdInverse(k []byte) ([]byte, error)
func p256OrdInverse(t *testing.T, k *[4]uint64) {
input := limbsToBytes(*k)
out, err := nistec.P256OrdInverse(input)
if err != nil {
switch goarch.GOARCH {
case "amd64", "arm64":
t.Fatal(err)
default:
t.Skip("this GOARCH didn't have P256OrdInverse in v1.0/v1.26")
}
}
*k = bytesToLimbs(out)
}

View file

@ -0,0 +1,16 @@
// Copyright 2026 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !(fips140v1.0 || fips140v1.26)
package fipstest
import (
"crypto/internal/fips140/nistec"
"testing"
)
// p256OrdInverse forwards k unchanged to nistec.P256OrdInverse. The t
// parameter is not used in this variant; presumably it is accepted only so
// the signature matches the variant built under the fips140v1.0 /
// fips140v1.26 tags — TODO confirm.
// NOTE(review): callers read k after this returns, so nistec.P256OrdInverse
// is presumably expected to overwrite k in place with the result — confirm
// against its definition.
func p256OrdInverse(t *testing.T, k *[4]uint64) {
nistec.P256OrdInverse(k)
}

View file

@ -2,80 +2,83 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (amd64 || arm64) && !purego
package fipstest
import (
"bytes"
"crypto/elliptic"
"crypto/internal/fips140/nistec"
"internal/byteorder"
"math/big"
"testing"
)
// bytesToLimbs reads b as four consecutive 64-bit words via
// byteorder.BEUint64 and returns them in reverse order: limb 0 is taken
// from b[24:], limb 1 from b[16:], limb 2 from b[8:] and limb 3 from b[0:].
// b is expected to hold at least 32 bytes.
func bytesToLimbs(b []byte) [4]uint64 {
	var limbs [4]uint64
	for i := range limbs {
		// Limb i starts 8*i bytes before the final 8-byte word of b.
		off := 24 - 8*i
		limbs[i] = byteorder.BEUint64(b[off:])
	}
	return limbs
}
// limbsToBytes is the counterpart of bytesToLimbs: it allocates a fresh
// 32-byte slice and writes each limb with byteorder.BEPutUint64, placing
// limb 0 at offset 24, limb 1 at 16, limb 2 at 8 and limb 3 at 0.
func limbsToBytes(l [4]uint64) []byte {
	out := make([]byte, 32)
	for i, limb := range l {
		// Limb i lands 8*i bytes before the final 8-byte word of out.
		byteorder.BEPutUint64(out[24-8*i:], limb)
	}
	return out
}
func TestP256OrdInverse(t *testing.T) {
N := elliptic.P256().Params().N
// inv(0) is expected to be 0.
zero := make([]byte, 32)
out, err := nistec.P256OrdInverse(zero)
if err != nil {
t.Fatal(err)
}
if !bytes.Equal(out, zero) {
k := bytesToLimbs(zero)
p256OrdInverse(t, &k)
if !bytes.Equal(limbsToBytes(k), zero) {
t.Error("unexpected output for inv(0)")
}
// inv(N) is also 0 mod N.
input := make([]byte, 32)
N.FillBytes(input)
out, err = nistec.P256OrdInverse(input)
if err != nil {
t.Fatal(err)
}
if !bytes.Equal(out, zero) {
k = bytesToLimbs(input)
p256OrdInverse(t, &k)
if !bytes.Equal(limbsToBytes(k), zero) {
t.Error("unexpected output for inv(N)")
}
if !bytes.Equal(input, N.Bytes()) {
t.Error("input was modified")
}
// Check inv(1) and inv(N+1) against math/big
exp := new(big.Int).ModInverse(big.NewInt(1), N).FillBytes(make([]byte, 32))
big.NewInt(1).FillBytes(input)
out, err = nistec.P256OrdInverse(input)
if err != nil {
t.Fatal(err)
}
if !bytes.Equal(out, exp) {
k = bytesToLimbs(input)
p256OrdInverse(t, &k)
if !bytes.Equal(limbsToBytes(k), exp) {
t.Error("unexpected output for inv(1)")
}
new(big.Int).Add(N, big.NewInt(1)).FillBytes(input)
out, err = nistec.P256OrdInverse(input)
if err != nil {
t.Fatal(err)
}
if !bytes.Equal(out, exp) {
k = bytesToLimbs(input)
p256OrdInverse(t, &k)
if !bytes.Equal(limbsToBytes(k), exp) {
t.Error("unexpected output for inv(N+1)")
}
// Check inv(20) and inv(N+20) against math/big
exp = new(big.Int).ModInverse(big.NewInt(20), N).FillBytes(make([]byte, 32))
big.NewInt(20).FillBytes(input)
out, err = nistec.P256OrdInverse(input)
if err != nil {
t.Fatal(err)
}
if !bytes.Equal(out, exp) {
k = bytesToLimbs(input)
p256OrdInverse(t, &k)
if !bytes.Equal(limbsToBytes(k), exp) {
t.Error("unexpected output for inv(20)")
}
new(big.Int).Add(N, big.NewInt(20)).FillBytes(input)
out, err = nistec.P256OrdInverse(input)
if err != nil {
t.Fatal(err)
}
if !bytes.Equal(out, exp) {
k = bytesToLimbs(input)
p256OrdInverse(t, &k)
if !bytes.Equal(limbsToBytes(k), exp) {
t.Error("unexpected output for inv(N+20)")
}
@ -84,11 +87,9 @@ func TestP256OrdInverse(t *testing.T) {
bigInput.Sub(bigInput, big.NewInt(1))
exp = new(big.Int).ModInverse(bigInput, N).FillBytes(make([]byte, 32))
bigInput.FillBytes(input)
out, err = nistec.P256OrdInverse(input)
if err != nil {
t.Fatal(err)
}
if !bytes.Equal(out, exp) {
k = bytesToLimbs(input)
p256OrdInverse(t, &k)
if !bytes.Equal(limbsToBytes(k), exp) {
t.Error("unexpected output for inv(2^256-1)")
}
}