mirror of
https://github.com/golang/go.git
synced 2026-06-28 03:40:37 +00:00
crypto/internal/fips140/nistec: replace P-256 scalar assembly with fiat
Change-Id: I6a6a4656c554e26151dc73287b68d6665a824dc3 Reviewed-on: https://go-review.googlesource.com/c/go/+/669895 Auto-Submit: Filippo Valsorda <filippo@golang.org> LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Neal Patel <neal@golang.org> Reviewed-by: Neal Patel <nealpatel@google.com> Reviewed-by: Roland Shoemaker <roland@golang.org>
This commit is contained in:
parent
91a81e5ae1
commit
be35de22f1
15 changed files with 1436 additions and 1669 deletions
|
|
@ -103,7 +103,8 @@ func TestStmtLines(t *testing.T) {
|
|||
if pkgname == "runtime" {
|
||||
continue
|
||||
}
|
||||
if pkgname == "crypto/internal/fips140/nistec/fiat" {
|
||||
if pkgname == "crypto/internal/fips140/nistec/fiat" ||
|
||||
pkgname == "crypto/internal/fips140/nistec" {
|
||||
continue // golang.org/issue/49372
|
||||
}
|
||||
if e.Val(dwarf.AttrStmtList) == nil {
|
||||
|
|
|
|||
|
|
@ -142,6 +142,14 @@ func (x *Nat) Bits() []uint {
|
|||
return x.limbs
|
||||
}
|
||||
|
||||
// SetBits assigns x = y, where y holds the value as uint limbs in little-endian
|
||||
// order. x is resized to len(y) limbs, the limbs are copied out of y (x does
// not alias y afterwards), and x is returned.
|
||||
func (x *Nat) SetBits(y []uint) *Nat {
|
||||
x.reset(len(y))
|
||||
copy(x.limbs, y)
|
||||
return x
|
||||
}
|
||||
|
||||
// Bytes returns x as a zero-extended big-endian byte slice. The size of the
|
||||
// slice will match the size of m.
|
||||
//
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ import (
|
|||
"errors"
|
||||
"hash"
|
||||
"io"
|
||||
"math/bits"
|
||||
"sync"
|
||||
)
|
||||
|
||||
|
|
@ -54,7 +55,7 @@ const (
|
|||
type Curve[P Point[P]] struct {
|
||||
curve curveID
|
||||
newPoint func() P
|
||||
ordInverse func([]byte) ([]byte, error)
|
||||
ordInverse func(*[4]uint64)
|
||||
N *bigmod.Modulus
|
||||
nMinus2 []byte
|
||||
}
|
||||
|
|
@ -387,14 +388,11 @@ func signGeneric[P Point[P]](c *Curve[P], priv *PrivateKey, drbg *hmacDRBG, hash
|
|||
|
||||
// inverse sets kInv to the inverse of k modulo the order of the curve.
|
||||
func inverse[P Point[P]](c *Curve[P], kInv, k *bigmod.Nat) {
|
||||
if c.ordInverse != nil {
|
||||
kBytes, err := c.ordInverse(k.Bytes(c.N))
|
||||
// Some platforms don't implement ordInverse, and always return an error.
|
||||
if err == nil {
|
||||
_, err := kInv.SetBytes(kBytes, c.N)
|
||||
if err != nil {
|
||||
panic("ecdsa: internal error: ordInverse produced an invalid value")
|
||||
}
|
||||
if c.ordInverse != nil && bits.UintSize == 64 {
|
||||
if kb := k.Bits(); len(kb) == 4 {
|
||||
k64 := [4]uint64{uint64(kb[0]), uint64(kb[1]), uint64(kb[2]), uint64(kb[3])}
|
||||
c.ordInverse(&k64)
|
||||
kInv.SetBits([]uint{uint(k64[0]), uint(k64[1]), uint(k64[2]), uint(k64[3])})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
module crypto/internal/fips140/nistec/_asm
|
||||
|
||||
go 1.24
|
||||
go 1.26.0
|
||||
|
||||
require github.com/mmcloughlin/avo v0.6.0
|
||||
|
||||
require (
|
||||
golang.org/x/mod v0.20.0 // indirect
|
||||
golang.org/x/sync v0.8.0 // indirect
|
||||
golang.org/x/tools v0.24.0 // indirect
|
||||
golang.org/x/mod v0.33.0 // indirect
|
||||
golang.org/x/sync v0.19.0 // indirect
|
||||
golang.org/x/tools v0.42.0 // indirect
|
||||
)
|
||||
|
|
|
|||
|
|
@ -2,7 +2,13 @@ github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY
|
|||
github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8=
|
||||
golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
|
||||
golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
|
||||
golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
|
||||
golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
|
||||
golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
|
||||
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
|
||||
golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
|
||||
golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=
|
||||
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||
|
|
|
|||
|
|
@ -52,8 +52,6 @@ func main() {
|
|||
p256FromMont()
|
||||
p256Select()
|
||||
p256SelectAffine()
|
||||
p256OrdMul()
|
||||
p256OrdSqr()
|
||||
p256SubInternal()
|
||||
p256MulInternal()
|
||||
p256SqrInternal()
|
||||
|
|
@ -832,583 +830,6 @@ func p256SelectAffine() {
|
|||
RET()
|
||||
}
|
||||
|
||||
// Implements:
|
||||
//
|
||||
// func p256OrdMul(res, in1, in2 *p256OrdElement)
|
||||
func p256OrdMul() {
|
||||
Implement("p256OrdMul")
|
||||
Attributes(NOSPLIT)
|
||||
|
||||
Load(Param("res"), res_ptr)
|
||||
Load(Param("in1"), x_ptr)
|
||||
Load(Param("in2"), y_ptr)
|
||||
|
||||
Comment("x * y[0]")
|
||||
MOVQ(Mem{Base: y_ptr}.Offset(8*0), t0_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
|
||||
MULQ(t0_v1)
|
||||
MOVQ(RAX, acc0_v1)
|
||||
MOVQ(RDX, acc1_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc1_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, acc2_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc2_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, acc3_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, acc4_v1)
|
||||
XORQ(acc5_v1, acc5_v1)
|
||||
|
||||
Comment("First reduction step")
|
||||
MOVQ(acc0_v1, RAX)
|
||||
p256ordK0 := p256ordK0_DATA()
|
||||
MULQ(p256ordK0)
|
||||
MOVQ(RAX, t0_v1)
|
||||
|
||||
p256ord := p256ord_DATA()
|
||||
MOVQ(p256ord.Offset(0x00), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc0_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x08), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc1_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc1_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x10), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc2_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc2_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x18), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc3_v1)
|
||||
ADCQ(RDX, acc4_v1)
|
||||
ADCQ(Imm(0), acc5_v1)
|
||||
|
||||
Comment("x * y[1]")
|
||||
MOVQ(Mem{Base: y_ptr}.Offset(8*1), t0_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc1_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc2_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc2_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc4_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc4_v1)
|
||||
ADCQ(RDX, acc5_v1)
|
||||
ADCQ(Imm(0), acc0_v1)
|
||||
|
||||
Comment("Second reduction step")
|
||||
MOVQ(acc1_v1, RAX)
|
||||
MULQ(p256ordK0)
|
||||
MOVQ(RAX, t0_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x00), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc1_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x08), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc2_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc2_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x10), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x18), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc4_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc4_v1)
|
||||
ADCQ(RDX, acc5_v1)
|
||||
ADCQ(Imm(0), acc0_v1)
|
||||
|
||||
Comment("x * y[2]")
|
||||
MOVQ(Mem{Base: y_ptr}.Offset(8*2), t0_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc2_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc4_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc4_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc5_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc5_v1)
|
||||
ADCQ(RDX, acc0_v1)
|
||||
ADCQ(Imm(0), acc1_v1)
|
||||
|
||||
Comment("Third reduction step")
|
||||
MOVQ(acc2_v1, RAX)
|
||||
MULQ(p256ordK0)
|
||||
MOVQ(RAX, t0_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x00), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc2_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x08), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x10), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc4_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc4_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x18), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc5_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc5_v1)
|
||||
ADCQ(RDX, acc0_v1)
|
||||
ADCQ(Imm(0), acc1_v1)
|
||||
|
||||
Comment("x * y[3]")
|
||||
MOVQ(Mem{Base: y_ptr}.Offset(8*3), t0_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc4_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc4_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc5_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc5_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc0_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc0_v1)
|
||||
ADCQ(RDX, acc1_v1)
|
||||
ADCQ(Imm(0), acc2_v1)
|
||||
|
||||
Comment("Last reduction step")
|
||||
MOVQ(acc3_v1, RAX)
|
||||
MULQ(p256ordK0)
|
||||
MOVQ(RAX, t0_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x00), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x08), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc4_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc4_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x10), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc5_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc5_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x18), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc0_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc0_v1)
|
||||
ADCQ(RDX, acc1_v1)
|
||||
ADCQ(Imm(0), acc2_v1)
|
||||
|
||||
Comment("Copy result [255:0]")
|
||||
MOVQ(acc4_v1, x_ptr)
|
||||
MOVQ(acc5_v1, acc3_v1)
|
||||
MOVQ(acc0_v1, t0_v1)
|
||||
MOVQ(acc1_v1, t1_v1)
|
||||
|
||||
Comment("Subtract p256")
|
||||
SUBQ(p256ord.Offset(0x00), acc4_v1)
|
||||
SBBQ(p256ord.Offset(0x08), acc5_v1)
|
||||
SBBQ(p256ord.Offset(0x10), acc0_v1)
|
||||
SBBQ(p256ord.Offset(0x18), acc1_v1)
|
||||
SBBQ(Imm(0), acc2_v1)
|
||||
|
||||
CMOVQCS(x_ptr, acc4_v1)
|
||||
CMOVQCS(acc3_v1, acc5_v1)
|
||||
CMOVQCS(t0_v1, acc0_v1)
|
||||
CMOVQCS(t1_v1, acc1_v1)
|
||||
|
||||
MOVQ(acc4_v1, Mem{Base: res_ptr}.Offset(8*0))
|
||||
MOVQ(acc5_v1, Mem{Base: res_ptr}.Offset(8*1))
|
||||
MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*2))
|
||||
MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*3))
|
||||
|
||||
RET()
|
||||
}
|
||||
|
||||
// Implements:
|
||||
//
|
||||
// func p256OrdSqr(res, in *p256OrdElement, n int)
|
||||
func p256OrdSqr() {
|
||||
Implement("p256OrdSqr")
|
||||
Attributes(NOSPLIT)
|
||||
|
||||
Load(Param("res"), res_ptr)
|
||||
Load(Param("in"), x_ptr)
|
||||
Load(Param("n"), RBX)
|
||||
|
||||
Label("ordSqrLoop")
|
||||
|
||||
Comment("y[1:] * y[0]")
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*0), t0_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
|
||||
MULQ(t0_v1)
|
||||
MOVQ(RAX, acc1_v1)
|
||||
MOVQ(RDX, acc2_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc2_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, acc3_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, acc4_v1)
|
||||
|
||||
Comment("y[2:] * y[1]")
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*1), t0_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc4_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc4_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, acc5_v1)
|
||||
|
||||
Comment("y[3] * y[2]")
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*2), t0_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc5_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, y_ptr)
|
||||
XORQ(t1_v1, t1_v1)
|
||||
|
||||
Comment("*2")
|
||||
ADDQ(acc1_v1, acc1_v1)
|
||||
ADCQ(acc2_v1, acc2_v1)
|
||||
ADCQ(acc3_v1, acc3_v1)
|
||||
ADCQ(acc4_v1, acc4_v1)
|
||||
ADCQ(acc5_v1, acc5_v1)
|
||||
ADCQ(y_ptr, y_ptr)
|
||||
ADCQ(Imm(0), t1_v1)
|
||||
|
||||
Comment("Missing products")
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*0), RAX)
|
||||
MULQ(RAX)
|
||||
MOVQ(RAX, acc0_v1)
|
||||
MOVQ(RDX, t0_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*1), RAX)
|
||||
MULQ(RAX)
|
||||
ADDQ(t0_v1, acc1_v1)
|
||||
ADCQ(RAX, acc2_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t0_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*2), RAX)
|
||||
MULQ(RAX)
|
||||
ADDQ(t0_v1, acc3_v1)
|
||||
ADCQ(RAX, acc4_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t0_v1)
|
||||
|
||||
MOVQ(Mem{Base: x_ptr}.Offset(8*3), RAX)
|
||||
MULQ(RAX)
|
||||
ADDQ(t0_v1, acc5_v1)
|
||||
ADCQ(RAX, y_ptr)
|
||||
ADCQ(RDX, t1_v1)
|
||||
MOVQ(t1_v1, x_ptr)
|
||||
|
||||
Comment("First reduction step")
|
||||
MOVQ(acc0_v1, RAX)
|
||||
p256ordK0 := p256ordK0_DATA()
|
||||
MULQ(p256ordK0)
|
||||
MOVQ(RAX, t0_v1)
|
||||
|
||||
p256ord := p256ord_DATA()
|
||||
MOVQ(p256ord.Offset(0x00), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc0_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x08), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc1_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc1_v1)
|
||||
|
||||
MOVQ(t0_v1, t1_v1)
|
||||
ADCQ(RDX, acc2_v1)
|
||||
ADCQ(Imm(0), t1_v1)
|
||||
SUBQ(t0_v1, acc2_v1)
|
||||
SBBQ(Imm(0), t1_v1)
|
||||
|
||||
MOVQ(t0_v1, RAX)
|
||||
MOVQ(t0_v1, RDX)
|
||||
MOVQ(t0_v1, acc0_v1)
|
||||
SHLQ(Imm(32), RAX)
|
||||
SHRQ(Imm(32), RDX)
|
||||
|
||||
ADDQ(t1_v1, acc3_v1)
|
||||
ADCQ(Imm(0), acc0_v1)
|
||||
SUBQ(RAX, acc3_v1)
|
||||
SBBQ(RDX, acc0_v1)
|
||||
|
||||
Comment("Second reduction step")
|
||||
MOVQ(acc1_v1, RAX)
|
||||
MULQ(p256ordK0)
|
||||
MOVQ(RAX, t0_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x00), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc1_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x08), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc2_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc2_v1)
|
||||
|
||||
MOVQ(t0_v1, t1_v1)
|
||||
ADCQ(RDX, acc3_v1)
|
||||
ADCQ(Imm(0), t1_v1)
|
||||
SUBQ(t0_v1, acc3_v1)
|
||||
SBBQ(Imm(0), t1_v1)
|
||||
|
||||
MOVQ(t0_v1, RAX)
|
||||
MOVQ(t0_v1, RDX)
|
||||
MOVQ(t0_v1, acc1_v1)
|
||||
SHLQ(Imm(32), RAX)
|
||||
SHRQ(Imm(32), RDX)
|
||||
|
||||
ADDQ(t1_v1, acc0_v1)
|
||||
ADCQ(Imm(0), acc1_v1)
|
||||
SUBQ(RAX, acc0_v1)
|
||||
SBBQ(RDX, acc1_v1)
|
||||
|
||||
Comment("Third reduction step")
|
||||
MOVQ(acc2_v1, RAX)
|
||||
MULQ(p256ordK0)
|
||||
MOVQ(RAX, t0_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x00), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc2_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x08), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc3_v1)
|
||||
|
||||
MOVQ(t0_v1, t1_v1)
|
||||
ADCQ(RDX, acc0_v1)
|
||||
ADCQ(Imm(0), t1_v1)
|
||||
SUBQ(t0_v1, acc0_v1)
|
||||
SBBQ(Imm(0), t1_v1)
|
||||
|
||||
MOVQ(t0_v1, RAX)
|
||||
MOVQ(t0_v1, RDX)
|
||||
MOVQ(t0_v1, acc2_v1)
|
||||
SHLQ(Imm(32), RAX)
|
||||
SHRQ(Imm(32), RDX)
|
||||
|
||||
ADDQ(t1_v1, acc1_v1)
|
||||
ADCQ(Imm(0), acc2_v1)
|
||||
SUBQ(RAX, acc1_v1)
|
||||
SBBQ(RDX, acc2_v1)
|
||||
|
||||
Comment("Last reduction step")
|
||||
MOVQ(acc3_v1, RAX)
|
||||
MULQ(p256ordK0)
|
||||
MOVQ(RAX, t0_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x00), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(RAX, acc3_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(p256ord.Offset(0x08), RAX)
|
||||
MULQ(t0_v1)
|
||||
ADDQ(t1_v1, acc0_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
ADDQ(RAX, acc0_v1)
|
||||
ADCQ(Imm(0), RDX)
|
||||
MOVQ(RDX, t1_v1)
|
||||
|
||||
MOVQ(t0_v1, t1_v1)
|
||||
ADCQ(RDX, acc1_v1)
|
||||
ADCQ(Imm(0), t1_v1)
|
||||
SUBQ(t0_v1, acc1_v1)
|
||||
SBBQ(Imm(0), t1_v1)
|
||||
|
||||
MOVQ(t0_v1, RAX)
|
||||
MOVQ(t0_v1, RDX)
|
||||
MOVQ(t0_v1, acc3_v1)
|
||||
SHLQ(Imm(32), RAX)
|
||||
SHRQ(Imm(32), RDX)
|
||||
|
||||
ADDQ(t1_v1, acc2_v1)
|
||||
ADCQ(Imm(0), acc3_v1)
|
||||
SUBQ(RAX, acc2_v1)
|
||||
SBBQ(RDX, acc3_v1)
|
||||
XORQ(t0_v1, t0_v1)
|
||||
|
||||
Comment("Add bits [511:256] of the sqr result")
|
||||
ADCQ(acc4_v1, acc0_v1)
|
||||
ADCQ(acc5_v1, acc1_v1)
|
||||
ADCQ(y_ptr, acc2_v1)
|
||||
ADCQ(x_ptr, acc3_v1)
|
||||
ADCQ(Imm(0), t0_v1)
|
||||
|
||||
MOVQ(acc0_v1, acc4_v1)
|
||||
MOVQ(acc1_v1, acc5_v1)
|
||||
MOVQ(acc2_v1, y_ptr)
|
||||
MOVQ(acc3_v1, t1_v1)
|
||||
|
||||
Comment("Subtract p256")
|
||||
SUBQ(p256ord.Offset(0x00), acc0_v1)
|
||||
SBBQ(p256ord.Offset(0x08), acc1_v1)
|
||||
SBBQ(p256ord.Offset(0x10), acc2_v1)
|
||||
SBBQ(p256ord.Offset(0x18), acc3_v1)
|
||||
SBBQ(Imm(0), t0_v1)
|
||||
|
||||
CMOVQCS(acc4_v1, acc0_v1)
|
||||
CMOVQCS(acc5_v1, acc1_v1)
|
||||
CMOVQCS(y_ptr, acc2_v1)
|
||||
CMOVQCS(t1_v1, acc3_v1)
|
||||
|
||||
MOVQ(acc0_v1, Mem{Base: res_ptr}.Offset(8*0))
|
||||
MOVQ(acc1_v1, Mem{Base: res_ptr}.Offset(8*1))
|
||||
MOVQ(acc2_v1, Mem{Base: res_ptr}.Offset(8*2))
|
||||
MOVQ(acc3_v1, Mem{Base: res_ptr}.Offset(8*3))
|
||||
MOVQ(res_ptr, x_ptr)
|
||||
DECQ(RBX)
|
||||
JNE(LabelRef("ordSqrLoop"))
|
||||
|
||||
RET()
|
||||
}
|
||||
|
||||
// These variables have been versioned as they get redefined in the reference implementation.
|
||||
// This is done to produce a minimal semantic diff.
|
||||
var (
|
||||
|
|
@ -2595,7 +2016,7 @@ func p256PointDoubleAsm() {
|
|||
// #----------------------------DATA SECTION-----------------------------------##
|
||||
|
||||
// Pointers for memoizing Data section symbols
|
||||
var p256const0_ptr, p256const1_ptr, p256ordK0_ptr, p256ord_ptr, p256one_ptr *Mem
|
||||
var p256const0_ptr, p256const1_ptr, p256one_ptr *Mem
|
||||
|
||||
func p256const0_DATA() Mem {
|
||||
if p256const0_ptr != nil {
|
||||
|
|
@ -2619,39 +2040,6 @@ func p256const1_DATA() Mem {
|
|||
return p256const1
|
||||
}
|
||||
|
||||
func p256ordK0_DATA() Mem {
|
||||
if p256ordK0_ptr != nil {
|
||||
return *p256ordK0_ptr
|
||||
}
|
||||
|
||||
p256ordK0 := GLOBL("p256ordK0", 8)
|
||||
p256ordK0_ptr = &p256ordK0
|
||||
DATA(0, U64(0xccd1c8aaee00bc4f))
|
||||
return p256ordK0
|
||||
}
|
||||
|
||||
var p256ordConstants = [4]uint64{
|
||||
0xf3b9cac2fc632551,
|
||||
0xbce6faada7179e84,
|
||||
0xffffffffffffffff,
|
||||
0xffffffff00000000,
|
||||
}
|
||||
|
||||
func p256ord_DATA() Mem {
|
||||
if p256ord_ptr != nil {
|
||||
return *p256ord_ptr
|
||||
}
|
||||
|
||||
p256ord := GLOBL("p256ord", 8)
|
||||
p256ord_ptr = &p256ord
|
||||
|
||||
for i, k := range p256ordConstants {
|
||||
DATA(i*8, U64(k))
|
||||
}
|
||||
|
||||
return p256ord
|
||||
}
|
||||
|
||||
var p256oneConstants = [4]uint64{
|
||||
0x0000000000000001,
|
||||
0xffffffff00000000,
|
||||
|
|
|
|||
|
|
@ -392,8 +392,8 @@ func (q *P256Point) Select(p1, p2 *P256Point, cond int) *P256Point {
|
|||
return q
|
||||
}
|
||||
|
||||
// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the
|
||||
// Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order.
|
||||
// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1]
|
||||
// as four uint64 limbs in little-endian order.
|
||||
type p256OrdElement [4]uint64
|
||||
|
||||
// SetBytes sets s to the big-endian value of x, reducing it as necessary.
|
||||
|
|
|
|||
|
|
@ -76,9 +76,10 @@ const p256CompressedLength = 1 + p256ElementLength
|
|||
// the curve, it returns nil and an error, and the receiver is unchanged.
|
||||
// Otherwise, it returns p.
|
||||
func (p *P256Point) SetBytes(b []byte) (*P256Point, error) {
|
||||
// p256Mul operates in the Montgomery domain with R = 2²⁵⁶ mod p. Thus rr
|
||||
// here is R in the Montgomery domain, or R×R mod p. See comment in
|
||||
// P256OrdInverse about how this is used.
|
||||
// This implementation operates in the Montgomery domain with R = 2²⁵⁶ mod
|
||||
// p. Elements in the Montgomery domain take the form a×R and p256Mul
|
||||
// calculates (a × b × R⁻¹) mod p. rr is R in the domain, or R×R mod p, thus
|
||||
// p256Mul(e, rr) gives e×R, i.e. converts e into the Montgomery domain.
|
||||
rr := p256Element{0x0000000000000003, 0xfffffffbffffffff,
|
||||
0xfffffffffffffffe, 0x00000004fffffffd}
|
||||
|
||||
|
|
@ -282,7 +283,7 @@ func p256Mul(res, in1, in2 *p256Element)
|
|||
//go:noescape
|
||||
func p256Sqr(res, in *p256Element, n int)
|
||||
|
||||
// Montgomery multiplication by R⁻¹, or 1 outside the domain.
|
||||
// Montgomery multiplication by R⁻¹, or 1 outside the domain, as R⁻¹×R = 1.
|
||||
// Sets res = in * R⁻¹, bringing res out of the Montgomery domain.
|
||||
//
|
||||
//go:noescape
|
||||
|
|
@ -365,8 +366,8 @@ func p256PointAddAsm(res, in1, in2 *P256Point) int
|
|||
//go:noescape
|
||||
func p256PointDoubleAsm(res, in *P256Point)
|
||||
|
||||
// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1] in the
|
||||
// Montgomery domain (with R 2²⁵⁶) as four uint64 limbs in little-endian order.
|
||||
// p256OrdElement is a P-256 scalar field element in [0, ord(G)-1]
|
||||
// as four uint64 limbs in little-endian order.
|
||||
type p256OrdElement [4]uint64
|
||||
|
||||
// p256OrdReduce ensures s is in the range [0, ord(G)-1].
|
||||
|
|
|
|||
|
|
@ -640,509 +640,6 @@ loop_select_base:
|
|||
MOVOU X3, 48(DX)
|
||||
RET
|
||||
|
||||
// func p256OrdMul(res *p256OrdElement, in1 *p256OrdElement, in2 *p256OrdElement)
|
||||
// Requires: CMOV
|
||||
TEXT ·p256OrdMul(SB), NOSPLIT, $0-24
|
||||
MOVQ res+0(FP), DI
|
||||
MOVQ in1+8(FP), SI
|
||||
MOVQ in2+16(FP), CX
|
||||
|
||||
// x * y[0]
|
||||
MOVQ (CX), R14
|
||||
MOVQ (SI), AX
|
||||
MULQ R14
|
||||
MOVQ AX, R8
|
||||
MOVQ DX, R9
|
||||
MOVQ 8(SI), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R9
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R10
|
||||
MOVQ 16(SI), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R10
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R11
|
||||
MOVQ 24(SI), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R11
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R12
|
||||
XORQ R13, R13
|
||||
|
||||
// First reduction step
|
||||
MOVQ R8, AX
|
||||
MULQ p256ordK0<>+0(SB)
|
||||
MOVQ AX, R14
|
||||
MOVQ p256ord<>+0(SB), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R8
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+8(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R9
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R9
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+16(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R10
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R10
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+24(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R11
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R11
|
||||
ADCQ DX, R12
|
||||
ADCQ $0x00, R13
|
||||
|
||||
// x * y[1]
|
||||
MOVQ 8(CX), R14
|
||||
MOVQ (SI), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R9
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ 8(SI), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R10
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R10
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ 16(SI), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R11
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R11
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ 24(SI), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R12
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R12
|
||||
ADCQ DX, R13
|
||||
ADCQ $0x00, R8
|
||||
|
||||
// Second reduction step
|
||||
MOVQ R9, AX
|
||||
MULQ p256ordK0<>+0(SB)
|
||||
MOVQ AX, R14
|
||||
MOVQ p256ord<>+0(SB), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R9
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+8(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R10
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R10
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+16(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R11
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R11
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+24(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R12
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R12
|
||||
ADCQ DX, R13
|
||||
ADCQ $0x00, R8
|
||||
|
||||
// x * y[2]
|
||||
MOVQ 16(CX), R14
|
||||
MOVQ (SI), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R10
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ 8(SI), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R11
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R11
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ 16(SI), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R12
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R12
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ 24(SI), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R13
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R13
|
||||
ADCQ DX, R8
|
||||
ADCQ $0x00, R9
|
||||
|
||||
// Third reduction step
|
||||
MOVQ R10, AX
|
||||
MULQ p256ordK0<>+0(SB)
|
||||
MOVQ AX, R14
|
||||
MOVQ p256ord<>+0(SB), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R10
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+8(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R11
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R11
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+16(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R12
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R12
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+24(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R13
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R13
|
||||
ADCQ DX, R8
|
||||
ADCQ $0x00, R9
|
||||
|
||||
// x * y[3]
|
||||
MOVQ 24(CX), R14
|
||||
MOVQ (SI), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R11
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ 8(SI), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R12
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R12
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ 16(SI), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R13
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R13
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ 24(SI), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R8
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, R9
|
||||
ADCQ $0x00, R10
|
||||
|
||||
// Last reduction step
|
||||
MOVQ R11, AX
|
||||
MULQ p256ordK0<>+0(SB)
|
||||
MOVQ AX, R14
|
||||
MOVQ p256ord<>+0(SB), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R11
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+8(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R12
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R12
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+16(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R13
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R13
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+24(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R8
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, R9
|
||||
ADCQ $0x00, R10
|
||||
|
||||
// Copy result [255:0]
|
||||
MOVQ R12, SI
|
||||
MOVQ R13, R11
|
||||
MOVQ R8, R14
|
||||
MOVQ R9, R15
|
||||
|
||||
// Subtract p256
|
||||
SUBQ p256ord<>+0(SB), R12
|
||||
SBBQ p256ord<>+8(SB), R13
|
||||
SBBQ p256ord<>+16(SB), R8
|
||||
SBBQ p256ord<>+24(SB), R9
|
||||
SBBQ $0x00, R10
|
||||
CMOVQCS SI, R12
|
||||
CMOVQCS R11, R13
|
||||
CMOVQCS R14, R8
|
||||
CMOVQCS R15, R9
|
||||
MOVQ R12, (DI)
|
||||
MOVQ R13, 8(DI)
|
||||
MOVQ R8, 16(DI)
|
||||
MOVQ R9, 24(DI)
|
||||
RET
|
||||
|
||||
DATA p256ordK0<>+0(SB)/8, $0xccd1c8aaee00bc4f
|
||||
GLOBL p256ordK0<>(SB), RODATA, $8
|
||||
|
||||
DATA p256ord<>+0(SB)/8, $0xf3b9cac2fc632551
|
||||
DATA p256ord<>+8(SB)/8, $0xbce6faada7179e84
|
||||
DATA p256ord<>+16(SB)/8, $0xffffffffffffffff
|
||||
DATA p256ord<>+24(SB)/8, $0xffffffff00000000
|
||||
GLOBL p256ord<>(SB), RODATA, $32
|
||||
|
||||
// func p256OrdSqr(res *p256OrdElement, in *p256OrdElement, n int)
|
||||
// Requires: CMOV
|
||||
TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24
|
||||
MOVQ res+0(FP), DI
|
||||
MOVQ in+8(FP), SI
|
||||
MOVQ n+16(FP), BX
|
||||
|
||||
ordSqrLoop:
|
||||
// y[1:] * y[0]
|
||||
MOVQ (SI), R14
|
||||
MOVQ 8(SI), AX
|
||||
MULQ R14
|
||||
MOVQ AX, R9
|
||||
MOVQ DX, R10
|
||||
MOVQ 16(SI), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R10
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R11
|
||||
MOVQ 24(SI), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R11
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R12
|
||||
|
||||
// y[2:] * y[1]
|
||||
MOVQ 8(SI), R14
|
||||
MOVQ 16(SI), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R11
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ 24(SI), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R12
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R12
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R13
|
||||
|
||||
// y[3] * y[2]
|
||||
MOVQ 16(SI), R14
|
||||
MOVQ 24(SI), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R13
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, CX
|
||||
XORQ R15, R15
|
||||
|
||||
// *2
|
||||
ADDQ R9, R9
|
||||
ADCQ R10, R10
|
||||
ADCQ R11, R11
|
||||
ADCQ R12, R12
|
||||
ADCQ R13, R13
|
||||
ADCQ CX, CX
|
||||
ADCQ $0x00, R15
|
||||
|
||||
// Missing products
|
||||
MOVQ (SI), AX
|
||||
MULQ AX
|
||||
MOVQ AX, R8
|
||||
MOVQ DX, R14
|
||||
MOVQ 8(SI), AX
|
||||
MULQ AX
|
||||
ADDQ R14, R9
|
||||
ADCQ AX, R10
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R14
|
||||
MOVQ 16(SI), AX
|
||||
MULQ AX
|
||||
ADDQ R14, R11
|
||||
ADCQ AX, R12
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R14
|
||||
MOVQ 24(SI), AX
|
||||
MULQ AX
|
||||
ADDQ R14, R13
|
||||
ADCQ AX, CX
|
||||
ADCQ DX, R15
|
||||
MOVQ R15, SI
|
||||
|
||||
// First reduction step
|
||||
MOVQ R8, AX
|
||||
MULQ p256ordK0<>+0(SB)
|
||||
MOVQ AX, R14
|
||||
MOVQ p256ord<>+0(SB), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R8
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+8(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R9
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R9
|
||||
MOVQ R14, R15
|
||||
ADCQ DX, R10
|
||||
ADCQ $0x00, R15
|
||||
SUBQ R14, R10
|
||||
SBBQ $0x00, R15
|
||||
MOVQ R14, AX
|
||||
MOVQ R14, DX
|
||||
MOVQ R14, R8
|
||||
SHLQ $0x20, AX
|
||||
SHRQ $0x20, DX
|
||||
ADDQ R15, R11
|
||||
ADCQ $0x00, R8
|
||||
SUBQ AX, R11
|
||||
SBBQ DX, R8
|
||||
|
||||
// Second reduction step
|
||||
MOVQ R9, AX
|
||||
MULQ p256ordK0<>+0(SB)
|
||||
MOVQ AX, R14
|
||||
MOVQ p256ord<>+0(SB), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R9
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+8(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R10
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R10
|
||||
MOVQ R14, R15
|
||||
ADCQ DX, R11
|
||||
ADCQ $0x00, R15
|
||||
SUBQ R14, R11
|
||||
SBBQ $0x00, R15
|
||||
MOVQ R14, AX
|
||||
MOVQ R14, DX
|
||||
MOVQ R14, R9
|
||||
SHLQ $0x20, AX
|
||||
SHRQ $0x20, DX
|
||||
ADDQ R15, R8
|
||||
ADCQ $0x00, R9
|
||||
SUBQ AX, R8
|
||||
SBBQ DX, R9
|
||||
|
||||
// Third reduction step
|
||||
MOVQ R10, AX
|
||||
MULQ p256ordK0<>+0(SB)
|
||||
MOVQ AX, R14
|
||||
MOVQ p256ord<>+0(SB), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R10
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+8(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R11
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R11
|
||||
MOVQ R14, R15
|
||||
ADCQ DX, R8
|
||||
ADCQ $0x00, R15
|
||||
SUBQ R14, R8
|
||||
SBBQ $0x00, R15
|
||||
MOVQ R14, AX
|
||||
MOVQ R14, DX
|
||||
MOVQ R14, R10
|
||||
SHLQ $0x20, AX
|
||||
SHRQ $0x20, DX
|
||||
ADDQ R15, R9
|
||||
ADCQ $0x00, R10
|
||||
SUBQ AX, R9
|
||||
SBBQ DX, R10
|
||||
|
||||
// Last reduction step
|
||||
MOVQ R11, AX
|
||||
MULQ p256ordK0<>+0(SB)
|
||||
MOVQ AX, R14
|
||||
MOVQ p256ord<>+0(SB), AX
|
||||
MULQ R14
|
||||
ADDQ AX, R11
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ p256ord<>+8(SB), AX
|
||||
MULQ R14
|
||||
ADDQ R15, R8
|
||||
ADCQ $0x00, DX
|
||||
ADDQ AX, R8
|
||||
ADCQ $0x00, DX
|
||||
MOVQ DX, R15
|
||||
MOVQ R14, R15
|
||||
ADCQ DX, R9
|
||||
ADCQ $0x00, R15
|
||||
SUBQ R14, R9
|
||||
SBBQ $0x00, R15
|
||||
MOVQ R14, AX
|
||||
MOVQ R14, DX
|
||||
MOVQ R14, R11
|
||||
SHLQ $0x20, AX
|
||||
SHRQ $0x20, DX
|
||||
ADDQ R15, R10
|
||||
ADCQ $0x00, R11
|
||||
SUBQ AX, R10
|
||||
SBBQ DX, R11
|
||||
XORQ R14, R14
|
||||
|
||||
// Add bits [511:256] of the sqr result
|
||||
ADCQ R12, R8
|
||||
ADCQ R13, R9
|
||||
ADCQ CX, R10
|
||||
ADCQ SI, R11
|
||||
ADCQ $0x00, R14
|
||||
MOVQ R8, R12
|
||||
MOVQ R9, R13
|
||||
MOVQ R10, CX
|
||||
MOVQ R11, R15
|
||||
|
||||
// Subtract p256
|
||||
SUBQ p256ord<>+0(SB), R8
|
||||
SBBQ p256ord<>+8(SB), R9
|
||||
SBBQ p256ord<>+16(SB), R10
|
||||
SBBQ p256ord<>+24(SB), R11
|
||||
SBBQ $0x00, R14
|
||||
CMOVQCS R12, R8
|
||||
CMOVQCS R13, R9
|
||||
CMOVQCS CX, R10
|
||||
CMOVQCS R15, R11
|
||||
MOVQ R8, (DI)
|
||||
MOVQ R9, 8(DI)
|
||||
MOVQ R10, 16(DI)
|
||||
MOVQ R11, 24(DI)
|
||||
MOVQ DI, SI
|
||||
DECQ BX
|
||||
JNE ordSqrLoop
|
||||
RET
|
||||
|
||||
// func p256SubInternal()
|
||||
// Requires: CMOV
|
||||
TEXT p256SubInternal(SB), NOSPLIT, $0
|
||||
|
|
|
|||
|
|
@ -45,24 +45,14 @@
|
|||
#define y2 R25
|
||||
#define y3 R26
|
||||
|
||||
#define const2 t2
|
||||
#define const3 t3
|
||||
|
||||
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
|
||||
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
|
||||
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
|
||||
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
|
||||
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
|
||||
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
|
||||
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
|
||||
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
|
||||
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
|
||||
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
|
||||
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
|
||||
GLOBL p256const0<>(SB), 8, $8
|
||||
GLOBL p256const1<>(SB), 8, $8
|
||||
GLOBL p256ordK0<>(SB), 8, $8
|
||||
GLOBL p256ord<>(SB), 8, $32
|
||||
GLOBL p256one<>(SB), 8, $32
|
||||
|
||||
/* ---------------------------------------*/
|
||||
|
|
@ -343,425 +333,10 @@ loop_select:
|
|||
STP (y2, y3), 3*16(res_ptr)
|
||||
RET
|
||||
/* ---------------------------------------*/
|
||||
// func p256OrdSqr(res, in *p256OrdElement, n int)
|
||||
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
|
||||
MOVD in+8(FP), a_ptr
|
||||
MOVD n+16(FP), b_ptr
|
||||
|
||||
MOVD p256ordK0<>(SB), hlp1
|
||||
LDP p256ord<>+0x00(SB), (const0, const1)
|
||||
LDP p256ord<>+0x10(SB), (const2, const3)
|
||||
|
||||
LDP 0*16(a_ptr), (x0, x1)
|
||||
LDP 1*16(a_ptr), (x2, x3)
|
||||
|
||||
ordSqrLoop:
|
||||
SUB $1, b_ptr
|
||||
|
||||
// x[1:] * x[0]
|
||||
MUL x0, x1, acc1
|
||||
UMULH x0, x1, acc2
|
||||
|
||||
MUL x0, x2, t0
|
||||
ADDS t0, acc2, acc2
|
||||
UMULH x0, x2, acc3
|
||||
|
||||
MUL x0, x3, t0
|
||||
ADCS t0, acc3, acc3
|
||||
UMULH x0, x3, acc4
|
||||
ADC $0, acc4, acc4
|
||||
// x[2:] * x[1]
|
||||
MUL x1, x2, t0
|
||||
ADDS t0, acc3
|
||||
UMULH x1, x2, t1
|
||||
ADCS t1, acc4
|
||||
ADC $0, ZR, acc5
|
||||
|
||||
MUL x1, x3, t0
|
||||
ADDS t0, acc4
|
||||
UMULH x1, x3, t1
|
||||
ADC t1, acc5
|
||||
// x[3] * x[2]
|
||||
MUL x2, x3, t0
|
||||
ADDS t0, acc5
|
||||
UMULH x2, x3, acc6
|
||||
ADC $0, acc6
|
||||
|
||||
MOVD $0, acc7
|
||||
// *2
|
||||
ADDS acc1, acc1
|
||||
ADCS acc2, acc2
|
||||
ADCS acc3, acc3
|
||||
ADCS acc4, acc4
|
||||
ADCS acc5, acc5
|
||||
ADCS acc6, acc6
|
||||
ADC $0, acc7
|
||||
// Missing products
|
||||
MUL x0, x0, acc0
|
||||
UMULH x0, x0, t0
|
||||
ADDS t0, acc1, acc1
|
||||
|
||||
MUL x1, x1, t0
|
||||
ADCS t0, acc2, acc2
|
||||
UMULH x1, x1, t1
|
||||
ADCS t1, acc3, acc3
|
||||
|
||||
MUL x2, x2, t0
|
||||
ADCS t0, acc4, acc4
|
||||
UMULH x2, x2, t1
|
||||
ADCS t1, acc5, acc5
|
||||
|
||||
MUL x3, x3, t0
|
||||
ADCS t0, acc6, acc6
|
||||
UMULH x3, x3, t1
|
||||
ADC t1, acc7, acc7
|
||||
// First reduction step
|
||||
MUL acc0, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc0, acc0
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc1, acc1
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc2, acc2
|
||||
UMULH const2, hlp0, acc0
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc3, acc3
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, hlp0
|
||||
|
||||
ADDS t1, acc1, acc1
|
||||
ADCS y0, acc2, acc2
|
||||
ADCS acc0, acc3, acc3
|
||||
ADC $0, hlp0, acc0
|
||||
// Second reduction step
|
||||
MUL acc1, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc1, acc1
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc2, acc2
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc3, acc3
|
||||
UMULH const2, hlp0, acc1
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc0, acc0
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, hlp0
|
||||
|
||||
ADDS t1, acc2, acc2
|
||||
ADCS y0, acc3, acc3
|
||||
ADCS acc1, acc0, acc0
|
||||
ADC $0, hlp0, acc1
|
||||
// Third reduction step
|
||||
MUL acc2, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc2, acc2
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc3, acc3
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc0, acc0
|
||||
UMULH const2, hlp0, acc2
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc1, acc1
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, hlp0
|
||||
|
||||
ADDS t1, acc3, acc3
|
||||
ADCS y0, acc0, acc0
|
||||
ADCS acc2, acc1, acc1
|
||||
ADC $0, hlp0, acc2
|
||||
|
||||
// Last reduction step
|
||||
MUL acc3, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc3, acc3
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc0, acc0
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc1, acc1
|
||||
UMULH const2, hlp0, acc3
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc2, acc2
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, acc7
|
||||
|
||||
ADDS t1, acc0, acc0
|
||||
ADCS y0, acc1, acc1
|
||||
ADCS acc3, acc2, acc2
|
||||
ADC $0, hlp0, acc3
|
||||
|
||||
ADDS acc4, acc0, acc0
|
||||
ADCS acc5, acc1, acc1
|
||||
ADCS acc6, acc2, acc2
|
||||
ADCS acc7, acc3, acc3
|
||||
ADC $0, ZR, acc4
|
||||
|
||||
SUBS const0, acc0, y0
|
||||
SBCS const1, acc1, y1
|
||||
SBCS const2, acc2, y2
|
||||
SBCS const3, acc3, y3
|
||||
SBCS $0, acc4, acc4
|
||||
|
||||
CSEL CS, y0, acc0, x0
|
||||
CSEL CS, y1, acc1, x1
|
||||
CSEL CS, y2, acc2, x2
|
||||
CSEL CS, y3, acc3, x3
|
||||
|
||||
CBNZ b_ptr, ordSqrLoop
|
||||
|
||||
MOVD res+0(FP), res_ptr
|
||||
STP (x0, x1), 0*16(res_ptr)
|
||||
STP (x2, x3), 1*16(res_ptr)
|
||||
|
||||
RET
|
||||
/* ---------------------------------------*/
|
||||
// func p256OrdMul(res, in1, in2 *p256OrdElement)
|
||||
TEXT ·p256OrdMul(SB),NOSPLIT,$0
|
||||
MOVD in1+8(FP), a_ptr
|
||||
MOVD in2+16(FP), b_ptr
|
||||
|
||||
MOVD p256ordK0<>(SB), hlp1
|
||||
LDP p256ord<>+0x00(SB), (const0, const1)
|
||||
LDP p256ord<>+0x10(SB), (const2, const3)
|
||||
|
||||
LDP 0*16(a_ptr), (x0, x1)
|
||||
LDP 1*16(a_ptr), (x2, x3)
|
||||
LDP 0*16(b_ptr), (y0, y1)
|
||||
LDP 1*16(b_ptr), (y2, y3)
|
||||
|
||||
// y[0] * x
|
||||
MUL y0, x0, acc0
|
||||
UMULH y0, x0, acc1
|
||||
|
||||
MUL y0, x1, t0
|
||||
ADDS t0, acc1
|
||||
UMULH y0, x1, acc2
|
||||
|
||||
MUL y0, x2, t0
|
||||
ADCS t0, acc2
|
||||
UMULH y0, x2, acc3
|
||||
|
||||
MUL y0, x3, t0
|
||||
ADCS t0, acc3
|
||||
UMULH y0, x3, acc4
|
||||
ADC $0, acc4
|
||||
// First reduction step
|
||||
MUL acc0, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc0, acc0
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc1, acc1
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc2, acc2
|
||||
UMULH const2, hlp0, acc0
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc3, acc3
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, acc4
|
||||
|
||||
ADDS t1, acc1, acc1
|
||||
ADCS y0, acc2, acc2
|
||||
ADCS acc0, acc3, acc3
|
||||
ADC $0, hlp0, acc0
|
||||
// y[1] * x
|
||||
MUL y1, x0, t0
|
||||
ADDS t0, acc1
|
||||
UMULH y1, x0, t1
|
||||
|
||||
MUL y1, x1, t0
|
||||
ADCS t0, acc2
|
||||
UMULH y1, x1, hlp0
|
||||
|
||||
MUL y1, x2, t0
|
||||
ADCS t0, acc3
|
||||
UMULH y1, x2, y0
|
||||
|
||||
MUL y1, x3, t0
|
||||
ADCS t0, acc4
|
||||
UMULH y1, x3, y1
|
||||
ADC $0, ZR, acc5
|
||||
|
||||
ADDS t1, acc2
|
||||
ADCS hlp0, acc3
|
||||
ADCS y0, acc4
|
||||
ADC y1, acc5
|
||||
// Second reduction step
|
||||
MUL acc1, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc1, acc1
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc2, acc2
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc3, acc3
|
||||
UMULH const2, hlp0, acc1
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc0, acc0
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, acc5
|
||||
|
||||
ADDS t1, acc2, acc2
|
||||
ADCS y0, acc3, acc3
|
||||
ADCS acc1, acc0, acc0
|
||||
ADC $0, hlp0, acc1
|
||||
// y[2] * x
|
||||
MUL y2, x0, t0
|
||||
ADDS t0, acc2
|
||||
UMULH y2, x0, t1
|
||||
|
||||
MUL y2, x1, t0
|
||||
ADCS t0, acc3
|
||||
UMULH y2, x1, hlp0
|
||||
|
||||
MUL y2, x2, t0
|
||||
ADCS t0, acc4
|
||||
UMULH y2, x2, y0
|
||||
|
||||
MUL y2, x3, t0
|
||||
ADCS t0, acc5
|
||||
UMULH y2, x3, y1
|
||||
ADC $0, ZR, acc6
|
||||
|
||||
ADDS t1, acc3
|
||||
ADCS hlp0, acc4
|
||||
ADCS y0, acc5
|
||||
ADC y1, acc6
|
||||
// Third reduction step
|
||||
MUL acc2, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc2, acc2
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc3, acc3
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc0, acc0
|
||||
UMULH const2, hlp0, acc2
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc1, acc1
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, acc6
|
||||
|
||||
ADDS t1, acc3, acc3
|
||||
ADCS y0, acc0, acc0
|
||||
ADCS acc2, acc1, acc1
|
||||
ADC $0, hlp0, acc2
|
||||
// y[3] * x
|
||||
MUL y3, x0, t0
|
||||
ADDS t0, acc3
|
||||
UMULH y3, x0, t1
|
||||
|
||||
MUL y3, x1, t0
|
||||
ADCS t0, acc4
|
||||
UMULH y3, x1, hlp0
|
||||
|
||||
MUL y3, x2, t0
|
||||
ADCS t0, acc5
|
||||
UMULH y3, x2, y0
|
||||
|
||||
MUL y3, x3, t0
|
||||
ADCS t0, acc6
|
||||
UMULH y3, x3, y1
|
||||
ADC $0, ZR, acc7
|
||||
|
||||
ADDS t1, acc4
|
||||
ADCS hlp0, acc5
|
||||
ADCS y0, acc6
|
||||
ADC y1, acc7
|
||||
// Last reduction step
|
||||
MUL acc3, hlp1, hlp0
|
||||
|
||||
MUL const0, hlp1, t0
|
||||
ADDS t0, acc3, acc3
|
||||
UMULH const0, hlp0, t1
|
||||
|
||||
MUL const1, hlp0, t0
|
||||
ADCS t0, acc0, acc0
|
||||
UMULH const1, hlp0, y0
|
||||
|
||||
MUL const2, hlp0, t0
|
||||
ADCS t0, acc1, acc1
|
||||
UMULH const2, hlp0, acc3
|
||||
|
||||
MUL const3, hlp0, t0
|
||||
ADCS t0, acc2, acc2
|
||||
|
||||
UMULH const3, hlp0, hlp0
|
||||
ADC $0, acc7
|
||||
|
||||
ADDS t1, acc0, acc0
|
||||
ADCS y0, acc1, acc1
|
||||
ADCS acc3, acc2, acc2
|
||||
ADC $0, hlp0, acc3
|
||||
|
||||
ADDS acc4, acc0, acc0
|
||||
ADCS acc5, acc1, acc1
|
||||
ADCS acc6, acc2, acc2
|
||||
ADCS acc7, acc3, acc3
|
||||
ADC $0, ZR, acc4
|
||||
|
||||
SUBS const0, acc0, t0
|
||||
SBCS const1, acc1, t1
|
||||
SBCS const2, acc2, t2
|
||||
SBCS const3, acc3, t3
|
||||
SBCS $0, acc4, acc4
|
||||
|
||||
CSEL CS, t0, acc0, acc0
|
||||
CSEL CS, t1, acc1, acc1
|
||||
CSEL CS, t2, acc2, acc2
|
||||
CSEL CS, t3, acc3, acc3
|
||||
|
||||
MOVD res+0(FP), res_ptr
|
||||
STP (acc0, acc1), 0*16(res_ptr)
|
||||
STP (acc2, acc3), 1*16(res_ptr)
|
||||
|
||||
RET
|
||||
/* ---------------------------------------*/
|
||||
// input: x0-x3, y0-y3
|
||||
// output: x0-x3
|
||||
// uses: const0, const1
|
||||
// clobbers: y0-y3, hlp0
|
||||
TEXT p256SubInternal<>(SB),NOSPLIT,$0
|
||||
SUBS x0, y0, acc0
|
||||
SBCS x1, y1, acc1
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -1,13 +0,0 @@
|
|||
// Copyright 2022 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build (!amd64 && !arm64) || purego
|
||||
|
||||
package nistec
|
||||
|
||||
import "errors"
|
||||
|
||||
// P256OrdInverse is the placeholder compiled when this file's build
// constraint, (!amd64 && !arm64) || purego, is satisfied. It does not read k:
// every call returns a nil slice together with an "unimplemented" error.
func P256OrdInverse(k []byte) ([]byte, error) {
|
||||
return nil, errors.New("unimplemented")
|
||||
}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
// Copyright 2026 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build fips140v1.0 || fips140v1.26
|
||||
|
||||
package fipstest
|
||||
|
||||
import (
|
||||
"crypto/internal/fips140/nistec"
|
||||
"internal/goarch"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// package nistec
|
||||
// func P256OrdInverse(k []byte) ([]byte, error)
|
||||
|
||||
func p256OrdInverse(t *testing.T, k *[4]uint64) {
|
||||
input := limbsToBytes(*k)
|
||||
out, err := nistec.P256OrdInverse(input)
|
||||
if err != nil {
|
||||
switch goarch.GOARCH {
|
||||
case "amd64", "arm64":
|
||||
t.Fatal(err)
|
||||
default:
|
||||
t.Skip("this GOARCH didn't have P256OrdInverse in v1.0/v1.26")
|
||||
}
|
||||
}
|
||||
*k = bytesToLimbs(out)
|
||||
}
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
// Copyright 2026 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build !(fips140v1.0 || fips140v1.26)
|
||||
|
||||
package fipstest
|
||||
|
||||
import (
|
||||
"crypto/internal/fips140/nistec"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// p256OrdInverse hands k straight to nistec.P256OrdInverse, which takes the
// limbs by pointer and returns nothing; callers read the result back from *k.
// t is not used here. The parameter is kept so that this variant, built when
// neither fips140v1.0 nor fips140v1.26 is set, has the same signature as the
// variant built under those tags.
func p256OrdInverse(t *testing.T, k *[4]uint64) {
|
||||
nistec.P256OrdInverse(k)
|
||||
}
|
||||
|
|
@ -2,80 +2,83 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build (amd64 || arm64) && !purego
|
||||
|
||||
package fipstest
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/elliptic"
|
||||
"crypto/internal/fips140/nistec"
|
||||
"internal/byteorder"
|
||||
"math/big"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func bytesToLimbs(b []byte) [4]uint64 {
|
||||
var l [4]uint64
|
||||
l[0] = byteorder.BEUint64(b[24:])
|
||||
l[1] = byteorder.BEUint64(b[16:])
|
||||
l[2] = byteorder.BEUint64(b[8:])
|
||||
l[3] = byteorder.BEUint64(b[:])
|
||||
return l
|
||||
}
|
||||
|
||||
func limbsToBytes(l [4]uint64) []byte {
|
||||
b := make([]byte, 32)
|
||||
byteorder.BEPutUint64(b[24:], l[0])
|
||||
byteorder.BEPutUint64(b[16:], l[1])
|
||||
byteorder.BEPutUint64(b[8:], l[2])
|
||||
byteorder.BEPutUint64(b[:], l[3])
|
||||
return b
|
||||
}
|
||||
|
||||
func TestP256OrdInverse(t *testing.T) {
|
||||
N := elliptic.P256().Params().N
|
||||
|
||||
// inv(0) is expected to be 0.
|
||||
zero := make([]byte, 32)
|
||||
out, err := nistec.P256OrdInverse(zero)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !bytes.Equal(out, zero) {
|
||||
k := bytesToLimbs(zero)
|
||||
p256OrdInverse(t, &k)
|
||||
if !bytes.Equal(limbsToBytes(k), zero) {
|
||||
t.Error("unexpected output for inv(0)")
|
||||
}
|
||||
|
||||
// inv(N) is also 0 mod N.
|
||||
input := make([]byte, 32)
|
||||
N.FillBytes(input)
|
||||
out, err = nistec.P256OrdInverse(input)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !bytes.Equal(out, zero) {
|
||||
k = bytesToLimbs(input)
|
||||
p256OrdInverse(t, &k)
|
||||
if !bytes.Equal(limbsToBytes(k), zero) {
|
||||
t.Error("unexpected output for inv(N)")
|
||||
}
|
||||
if !bytes.Equal(input, N.Bytes()) {
|
||||
t.Error("input was modified")
|
||||
}
|
||||
|
||||
// Check inv(1) and inv(N+1) against math/big
|
||||
exp := new(big.Int).ModInverse(big.NewInt(1), N).FillBytes(make([]byte, 32))
|
||||
big.NewInt(1).FillBytes(input)
|
||||
out, err = nistec.P256OrdInverse(input)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !bytes.Equal(out, exp) {
|
||||
k = bytesToLimbs(input)
|
||||
p256OrdInverse(t, &k)
|
||||
if !bytes.Equal(limbsToBytes(k), exp) {
|
||||
t.Error("unexpected output for inv(1)")
|
||||
}
|
||||
|
||||
new(big.Int).Add(N, big.NewInt(1)).FillBytes(input)
|
||||
out, err = nistec.P256OrdInverse(input)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !bytes.Equal(out, exp) {
|
||||
k = bytesToLimbs(input)
|
||||
p256OrdInverse(t, &k)
|
||||
if !bytes.Equal(limbsToBytes(k), exp) {
|
||||
t.Error("unexpected output for inv(N+1)")
|
||||
}
|
||||
|
||||
// Check inv(20) and inv(N+20) against math/big
|
||||
exp = new(big.Int).ModInverse(big.NewInt(20), N).FillBytes(make([]byte, 32))
|
||||
big.NewInt(20).FillBytes(input)
|
||||
out, err = nistec.P256OrdInverse(input)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !bytes.Equal(out, exp) {
|
||||
k = bytesToLimbs(input)
|
||||
p256OrdInverse(t, &k)
|
||||
if !bytes.Equal(limbsToBytes(k), exp) {
|
||||
t.Error("unexpected output for inv(20)")
|
||||
}
|
||||
|
||||
new(big.Int).Add(N, big.NewInt(20)).FillBytes(input)
|
||||
out, err = nistec.P256OrdInverse(input)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !bytes.Equal(out, exp) {
|
||||
k = bytesToLimbs(input)
|
||||
p256OrdInverse(t, &k)
|
||||
if !bytes.Equal(limbsToBytes(k), exp) {
|
||||
t.Error("unexpected output for inv(N+20)")
|
||||
}
|
||||
|
||||
|
|
@ -84,11 +87,9 @@ func TestP256OrdInverse(t *testing.T) {
|
|||
bigInput.Sub(bigInput, big.NewInt(1))
|
||||
exp = new(big.Int).ModInverse(bigInput, N).FillBytes(make([]byte, 32))
|
||||
bigInput.FillBytes(input)
|
||||
out, err = nistec.P256OrdInverse(input)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !bytes.Equal(out, exp) {
|
||||
k = bytesToLimbs(input)
|
||||
p256OrdInverse(t, &k)
|
||||
if !bytes.Equal(limbsToBytes(k), exp) {
|
||||
t.Error("unexpected output for inv(2^256-1)")
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue