From 2c659bb4db1ff06c819f8b4cd6decc71e4d5b36e Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Wed, 25 Feb 2026 14:12:15 +0100 Subject: [PATCH] crypto/internal/fips140/nistec: optimize P-256 scalar fiat implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reduces the regression of CL 669895 by about half on arm64. host: linux-amd64_c2s16 3227c963a3c 12743b9f2c8 ccb659cd879 9a11e9167c3 sec/op vs base vs base vs base Sign/P256-16 33.38µ +5.23% +5.04% +5.41% Verify/P256-16 74.11µ +2.54% +2.49% +2.69% GenerateKey/P256-16 14.61µ ~ ~ ~ geomean 33.06µ +2.50% +2.41% +2.63% B/op vs base vs base vs base Sign/P256-16 5.922Ki -0.53% -0.53% -0.53% Verify/P256-16 576.0 -5.56% -5.56% -5.56% GenerateKey/P256-16 984.0 ~ ~ ~ geomean 1.474Ki -2.06% -2.06% -2.06% allocs/op vs base vs base vs base Sign/P256-16 59.00 -1.69% -1.69% -1.69% Verify/P256-16 10.00 -10.00% -10.00% -10.00% GenerateKey/P256-16 16.00 ~ ~ ~ geomean 21.13 -4.00% -4.00% -4.00% host: linux-arm64_c4as16 3227c963a3c 12743b9f2c8 ccb659cd879 9a11e9167c3 sec/op vs base vs base vs base Sign/P256-16 29.29µ +8.88% +8.94% +5.41% Verify/P256-16 69.25µ +3.52% +3.48% +2.21% GenerateKey/P256-16 15.17µ ~ ~ ~ geomean 31.34µ +4.05% +3.97% +2.51% B/op vs base vs base vs base Sign/P256-16 5.922Ki -0.53% -0.53% -0.53% Verify/P256-16 576.0 -5.56% -5.56% -5.56% GenerateKey/P256-16 984.0 ~ ~ ~ geomean 1.474Ki -2.06% -2.06% -2.06% allocs/op vs base vs base vs base Sign/P256-16 59.00 -1.69% -1.69% -1.69% Verify/P256-16 10.00 -10.00% -10.00% -10.00% GenerateKey/P256-16 16.00 ~ ~ ~ geomean 21.13 -4.00% -4.00% -4.00% Change-Id: I69adc8175acf0082dca7c8a13d5f62046a6a6964 Reviewed-on: https://go-review.googlesource.com/c/go/+/749141 Auto-Submit: Filippo Valsorda Reviewed-by: Neal Patel Reviewed-by: Roland Shoemaker LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com Reviewed-by: Neal Patel --- .../internal/fips140/nistec/p256_ordinv.go | 419 
++++++------------ 1 file changed, 147 insertions(+), 272 deletions(-) diff --git a/src/crypto/internal/fips140/nistec/p256_ordinv.go b/src/crypto/internal/fips140/nistec/p256_ordinv.go index d1e58b202b..a65f27c814 100644 --- a/src/crypto/internal/fips140/nistec/p256_ordinv.go +++ b/src/crypto/internal/fips140/nistec/p256_ordinv.go @@ -71,14 +71,8 @@ func P256OrdInverse(k *[4]uint64) { p256OrdFromMontgomery(j, x) } -func p256OrdSqr(out1, arg1 *p256OrdMontElement, n int) { - p256OrdSquare(out1, arg1) - for range n - 1 { - p256OrdSquare(out1, out1) - } -} - -// The code below was generated by Fiat Cryptography v0.1.6-63-g92ee794c2. +// The code below was generated by Fiat Cryptography v0.1.6-63-g92ee794c2, and +// then manually formatted and optimized. // // word-by-word-montgomery --lang Go --no-wide-int // --relax-primitive-carry-to-bitwidth 32,64 --cmovznz-by-mul --static @@ -112,60 +106,12 @@ func p256OrdSqr(out1, arg1 *p256OrdMontElement, n int) { // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// -// Autogenerated: fiat_crypto.js word-by-word-montgomery --lang Go --no-wide-int --relax-primitive-carry-to-bitwidth 32,64 --cmovznz-by-mul --static --package-case flatcase --private-function-case camelCase --private-type-case camelCase --no-prefix-fiat --package-name nistec p256Ord 64 2^256-2^224+2^192-89188191075325690597107910205041859247 mul square from_montgomery to_montgomery -// -// curve description: p256Ord -// -// machine_wordsize = 64 (from "64") -// -// requested operations: mul, square, from_montgomery, to_montgomery -// -// m = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551 (from "2^256-2^224+2^192-89188191075325690597107910205041859247") -// -// -// -// NOTE: In addition to the bounds specified above each function, all -// -// functions synthesized for this Montgomery arithmetic require the -// -// input to be strictly less than the prime modulus (m), and also -// -// require the input to be in the unique saturated representation. -// -// All functions also ensure that these two properties are true of -// -// return values. 
-// -// -// -// Computed values: -// -// eval z = z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) -// -// bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248) -// -// twos_complement_eval z = let x1 := z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) in -// -// if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256 - -type p256OrdUint1 = uint64 // We use uint64 instead of a more narrow type for performance reasons; see https://github.com/mit-plv/fiat-crypto/pull/1006#issuecomment-892625927 - -// The type p256OrdMontElement is a field element in the Montgomery domain. -// -// Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]] +// p256OrdMontElement is a scalar field element in the Montgomery domain, as +// four uint64 limbs in little-endian order. It must be strictly less than +// ord(G) and in Montgomery form (with R = 2²⁵⁶). type p256OrdMontElement [4]uint64 -// The function p256OrdMul multiplies two field elements in the Montgomery domain. -// -// Preconditions: -// -// 0 ≤ eval arg1 < m -// 0 ≤ eval arg2 < m -// -// Postconditions: -// -// eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg2)) mod m -// 0 ≤ eval out1 < m +// p256OrdMul multiplies two field elements in the Montgomery domain. 
func p256OrdMul(out1 *p256OrdMontElement, arg1 *p256OrdMontElement, arg2 *p256OrdMontElement) { x1 := arg1[1] x2 := arg1[2] @@ -282,170 +228,144 @@ func p256OrdMul(out1 *p256OrdMontElement, arg1 *p256OrdMontElement, arg2 *p256Or x210, x211 := bits.Sub64(x201, 0xffffffffffffffff, x209) x212, x213 := bits.Sub64(x203, 0xffffffff00000000, x211) _, x215 := bits.Sub64(x205, 0, x213) - var x216 uint64 - p256OrdCmovznzU64(&x216, x215, x206, x197) - var x217 uint64 - p256OrdCmovznzU64(&x217, x215, x208, x199) - var x218 uint64 - p256OrdCmovznzU64(&x218, x215, x210, x201) - var x219 uint64 - p256OrdCmovznzU64(&x219, x215, x212, x203) - out1[0] = x216 - out1[1] = x217 - out1[2] = x218 - out1[3] = x219 + mask, _ := bits.Sub64(0, 0, x215) + out1[0] = x206&^mask | x197&mask + out1[1] = x208&^mask | x199&mask + out1[2] = x210&^mask | x201&mask + out1[3] = x212&^mask | x203&mask } -// The function p256OrdSquare squares a field element in the Montgomery domain. -// -// Preconditions: -// -// 0 ≤ eval arg1 < m -// -// Postconditions: -// -// eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg1)) mod m -// 0 ≤ eval out1 < m -func p256OrdSquare(out1 *p256OrdMontElement, arg1 *p256OrdMontElement) { +// p256OrdSqr squares n times a field element in the Montgomery domain. 
+func p256OrdSqr(out1 *p256OrdMontElement, arg1 *p256OrdMontElement, n int) { x1 := arg1[1] x2 := arg1[2] x3 := arg1[3] x4 := arg1[0] - x6, x5 := bits.Mul64(x4, arg1[3]) - x8, x7 := bits.Mul64(x4, arg1[2]) - x10, x9 := bits.Mul64(x4, arg1[1]) - x12, x11 := bits.Mul64(x4, arg1[0]) - x13, x14 := bits.Add64(x12, x9, 0) - x15, x16 := bits.Add64(x10, x7, x14) - x17, x18 := bits.Add64(x8, x5, x16) - x19 := x18 + x6 - _, x20 := bits.Mul64(x11, 0xccd1c8aaee00bc4f) - x23, x22 := bits.Mul64(x20, 0xffffffff00000000) - x25, x24 := bits.Mul64(x20, 0xffffffffffffffff) - x27, x26 := bits.Mul64(x20, 0xbce6faada7179e84) - x29, x28 := bits.Mul64(x20, 0xf3b9cac2fc632551) - x30, x31 := bits.Add64(x29, x26, 0) - x32, x33 := bits.Add64(x27, x24, x31) - x34, x35 := bits.Add64(x25, x22, x33) - x36 := x35 + x23 - _, x38 := bits.Add64(x11, x28, 0) - x39, x40 := bits.Add64(x13, x30, x38) - x41, x42 := bits.Add64(x15, x32, x40) - x43, x44 := bits.Add64(x17, x34, x42) - x45, x46 := bits.Add64(x19, x36, x44) - x48, x47 := bits.Mul64(x1, arg1[3]) - x50, x49 := bits.Mul64(x1, arg1[2]) - x52, x51 := bits.Mul64(x1, arg1[1]) - x54, x53 := bits.Mul64(x1, arg1[0]) - x55, x56 := bits.Add64(x54, x51, 0) - x57, x58 := bits.Add64(x52, x49, x56) - x59, x60 := bits.Add64(x50, x47, x58) - x61 := x60 + x48 - x62, x63 := bits.Add64(x39, x53, 0) - x64, x65 := bits.Add64(x41, x55, x63) - x66, x67 := bits.Add64(x43, x57, x65) - x68, x69 := bits.Add64(x45, x59, x67) - x70, x71 := bits.Add64(x46, x61, x69) - _, x72 := bits.Mul64(x62, 0xccd1c8aaee00bc4f) - x75, x74 := bits.Mul64(x72, 0xffffffff00000000) - x77, x76 := bits.Mul64(x72, 0xffffffffffffffff) - x79, x78 := bits.Mul64(x72, 0xbce6faada7179e84) - x81, x80 := bits.Mul64(x72, 0xf3b9cac2fc632551) - x82, x83 := bits.Add64(x81, x78, 0) - x84, x85 := bits.Add64(x79, x76, x83) - x86, x87 := bits.Add64(x77, x74, x85) - x88 := x87 + x75 - _, x90 := bits.Add64(x62, x80, 0) - x91, x92 := bits.Add64(x64, x82, x90) - x93, x94 := bits.Add64(x66, x84, x92) - x95, x96 := 
bits.Add64(x68, x86, x94) - x97, x98 := bits.Add64(x70, x88, x96) - x99 := x98 + x71 - x101, x100 := bits.Mul64(x2, arg1[3]) - x103, x102 := bits.Mul64(x2, arg1[2]) - x105, x104 := bits.Mul64(x2, arg1[1]) - x107, x106 := bits.Mul64(x2, arg1[0]) - x108, x109 := bits.Add64(x107, x104, 0) - x110, x111 := bits.Add64(x105, x102, x109) - x112, x113 := bits.Add64(x103, x100, x111) - x114 := x113 + x101 - x115, x116 := bits.Add64(x91, x106, 0) - x117, x118 := bits.Add64(x93, x108, x116) - x119, x120 := bits.Add64(x95, x110, x118) - x121, x122 := bits.Add64(x97, x112, x120) - x123, x124 := bits.Add64(x99, x114, x122) - _, x125 := bits.Mul64(x115, 0xccd1c8aaee00bc4f) - x128, x127 := bits.Mul64(x125, 0xffffffff00000000) - x130, x129 := bits.Mul64(x125, 0xffffffffffffffff) - x132, x131 := bits.Mul64(x125, 0xbce6faada7179e84) - x134, x133 := bits.Mul64(x125, 0xf3b9cac2fc632551) - x135, x136 := bits.Add64(x134, x131, 0) - x137, x138 := bits.Add64(x132, x129, x136) - x139, x140 := bits.Add64(x130, x127, x138) - x141 := x140 + x128 - _, x143 := bits.Add64(x115, x133, 0) - x144, x145 := bits.Add64(x117, x135, x143) - x146, x147 := bits.Add64(x119, x137, x145) - x148, x149 := bits.Add64(x121, x139, x147) - x150, x151 := bits.Add64(x123, x141, x149) - x152 := x151 + x124 - x154, x153 := bits.Mul64(x3, arg1[3]) - x156, x155 := bits.Mul64(x3, arg1[2]) - x158, x157 := bits.Mul64(x3, arg1[1]) - x160, x159 := bits.Mul64(x3, arg1[0]) - x161, x162 := bits.Add64(x160, x157, 0) - x163, x164 := bits.Add64(x158, x155, x162) - x165, x166 := bits.Add64(x156, x153, x164) - x167 := x166 + x154 - x168, x169 := bits.Add64(x144, x159, 0) - x170, x171 := bits.Add64(x146, x161, x169) - x172, x173 := bits.Add64(x148, x163, x171) - x174, x175 := bits.Add64(x150, x165, x173) - x176, x177 := bits.Add64(x152, x167, x175) - _, x178 := bits.Mul64(x168, 0xccd1c8aaee00bc4f) - x181, x180 := bits.Mul64(x178, 0xffffffff00000000) - x183, x182 := bits.Mul64(x178, 0xffffffffffffffff) - x185, x184 := bits.Mul64(x178, 
0xbce6faada7179e84) - x187, x186 := bits.Mul64(x178, 0xf3b9cac2fc632551) - x188, x189 := bits.Add64(x187, x184, 0) - x190, x191 := bits.Add64(x185, x182, x189) - x192, x193 := bits.Add64(x183, x180, x191) - x194 := x193 + x181 - _, x196 := bits.Add64(x168, x186, 0) - x197, x198 := bits.Add64(x170, x188, x196) - x199, x200 := bits.Add64(x172, x190, x198) - x201, x202 := bits.Add64(x174, x192, x200) - x203, x204 := bits.Add64(x176, x194, x202) - x205 := x204 + x177 - x206, x207 := bits.Sub64(x197, 0xf3b9cac2fc632551, 0) - x208, x209 := bits.Sub64(x199, 0xbce6faada7179e84, x207) - x210, x211 := bits.Sub64(x201, 0xffffffffffffffff, x209) - x212, x213 := bits.Sub64(x203, 0xffffffff00000000, x211) - _, x215 := bits.Sub64(x205, 0, x213) - var x216 uint64 - p256OrdCmovznzU64(&x216, x215, x206, x197) - var x217 uint64 - p256OrdCmovznzU64(&x217, x215, x208, x199) - var x218 uint64 - p256OrdCmovznzU64(&x218, x215, x210, x201) - var x219 uint64 - p256OrdCmovznzU64(&x219, x215, x212, x203) - out1[0] = x216 - out1[1] = x217 - out1[2] = x218 - out1[3] = x219 + for range n { + x6, x5 := bits.Mul64(x4, x3) + x8, x7 := bits.Mul64(x4, x2) + x10, x9 := bits.Mul64(x4, x1) + x12, x11 := bits.Mul64(x4, x4) + x13, x14 := bits.Add64(x12, x9, 0) + x15, x16 := bits.Add64(x10, x7, x14) + x17, x18 := bits.Add64(x8, x5, x16) + x19 := x18 + x6 + _, x20 := bits.Mul64(x11, 0xccd1c8aaee00bc4f) + x23, x22 := bits.Mul64(x20, 0xffffffff00000000) + x25, x24 := bits.Mul64(x20, 0xffffffffffffffff) + x27, x26 := bits.Mul64(x20, 0xbce6faada7179e84) + x29, x28 := bits.Mul64(x20, 0xf3b9cac2fc632551) + x30, x31 := bits.Add64(x29, x26, 0) + x32, x33 := bits.Add64(x27, x24, x31) + x34, x35 := bits.Add64(x25, x22, x33) + x36 := x35 + x23 + _, x38 := bits.Add64(x11, x28, 0) + x39, x40 := bits.Add64(x13, x30, x38) + x41, x42 := bits.Add64(x15, x32, x40) + x43, x44 := bits.Add64(x17, x34, x42) + x45, x46 := bits.Add64(x19, x36, x44) + x48, x47 := bits.Mul64(x1, x3) + x50, x49 := bits.Mul64(x1, x2) + x52, x51 := 
bits.Mul64(x1, x1) + x54, x53 := bits.Mul64(x1, x4) + x55, x56 := bits.Add64(x54, x51, 0) + x57, x58 := bits.Add64(x52, x49, x56) + x59, x60 := bits.Add64(x50, x47, x58) + x61 := x60 + x48 + x62, x63 := bits.Add64(x39, x53, 0) + x64, x65 := bits.Add64(x41, x55, x63) + x66, x67 := bits.Add64(x43, x57, x65) + x68, x69 := bits.Add64(x45, x59, x67) + x70, x71 := bits.Add64(x46, x61, x69) + _, x72 := bits.Mul64(x62, 0xccd1c8aaee00bc4f) + x75, x74 := bits.Mul64(x72, 0xffffffff00000000) + x77, x76 := bits.Mul64(x72, 0xffffffffffffffff) + x79, x78 := bits.Mul64(x72, 0xbce6faada7179e84) + x81, x80 := bits.Mul64(x72, 0xf3b9cac2fc632551) + x82, x83 := bits.Add64(x81, x78, 0) + x84, x85 := bits.Add64(x79, x76, x83) + x86, x87 := bits.Add64(x77, x74, x85) + x88 := x87 + x75 + _, x90 := bits.Add64(x62, x80, 0) + x91, x92 := bits.Add64(x64, x82, x90) + x93, x94 := bits.Add64(x66, x84, x92) + x95, x96 := bits.Add64(x68, x86, x94) + x97, x98 := bits.Add64(x70, x88, x96) + x99 := x98 + x71 + x101, x100 := bits.Mul64(x2, x3) + x103, x102 := bits.Mul64(x2, x2) + x105, x104 := bits.Mul64(x2, x1) + x107, x106 := bits.Mul64(x2, x4) + x108, x109 := bits.Add64(x107, x104, 0) + x110, x111 := bits.Add64(x105, x102, x109) + x112, x113 := bits.Add64(x103, x100, x111) + x114 := x113 + x101 + x115, x116 := bits.Add64(x91, x106, 0) + x117, x118 := bits.Add64(x93, x108, x116) + x119, x120 := bits.Add64(x95, x110, x118) + x121, x122 := bits.Add64(x97, x112, x120) + x123, x124 := bits.Add64(x99, x114, x122) + _, x125 := bits.Mul64(x115, 0xccd1c8aaee00bc4f) + x128, x127 := bits.Mul64(x125, 0xffffffff00000000) + x130, x129 := bits.Mul64(x125, 0xffffffffffffffff) + x132, x131 := bits.Mul64(x125, 0xbce6faada7179e84) + x134, x133 := bits.Mul64(x125, 0xf3b9cac2fc632551) + x135, x136 := bits.Add64(x134, x131, 0) + x137, x138 := bits.Add64(x132, x129, x136) + x139, x140 := bits.Add64(x130, x127, x138) + x141 := x140 + x128 + _, x143 := bits.Add64(x115, x133, 0) + x144, x145 := bits.Add64(x117, x135, x143) + 
x146, x147 := bits.Add64(x119, x137, x145) + x148, x149 := bits.Add64(x121, x139, x147) + x150, x151 := bits.Add64(x123, x141, x149) + x152 := x151 + x124 + x154, x153 := bits.Mul64(x3, x3) + x156, x155 := bits.Mul64(x3, x2) + x158, x157 := bits.Mul64(x3, x1) + x160, x159 := bits.Mul64(x3, x4) + x161, x162 := bits.Add64(x160, x157, 0) + x163, x164 := bits.Add64(x158, x155, x162) + x165, x166 := bits.Add64(x156, x153, x164) + x167 := x166 + x154 + x168, x169 := bits.Add64(x144, x159, 0) + x170, x171 := bits.Add64(x146, x161, x169) + x172, x173 := bits.Add64(x148, x163, x171) + x174, x175 := bits.Add64(x150, x165, x173) + x176, x177 := bits.Add64(x152, x167, x175) + _, x178 := bits.Mul64(x168, 0xccd1c8aaee00bc4f) + x181, x180 := bits.Mul64(x178, 0xffffffff00000000) + x183, x182 := bits.Mul64(x178, 0xffffffffffffffff) + x185, x184 := bits.Mul64(x178, 0xbce6faada7179e84) + x187, x186 := bits.Mul64(x178, 0xf3b9cac2fc632551) + x188, x189 := bits.Add64(x187, x184, 0) + x190, x191 := bits.Add64(x185, x182, x189) + x192, x193 := bits.Add64(x183, x180, x191) + x194 := x193 + x181 + _, x196 := bits.Add64(x168, x186, 0) + x197, x198 := bits.Add64(x170, x188, x196) + x199, x200 := bits.Add64(x172, x190, x198) + x201, x202 := bits.Add64(x174, x192, x200) + x203, x204 := bits.Add64(x176, x194, x202) + x205 := x204 + x177 + x206, x207 := bits.Sub64(x197, 0xf3b9cac2fc632551, 0) + x208, x209 := bits.Sub64(x199, 0xbce6faada7179e84, x207) + x210, x211 := bits.Sub64(x201, 0xffffffffffffffff, x209) + x212, x213 := bits.Sub64(x203, 0xffffffff00000000, x211) + _, x215 := bits.Sub64(x205, 0, x213) + mask, _ := bits.Sub64(0, 0, x215) + x4 = x206&^mask | x197&mask + x1 = x208&^mask | x199&mask + x2 = x210&^mask | x201&mask + x3 = x212&^mask | x203&mask + } + out1[0] = x4 + out1[1] = x1 + out1[2] = x2 + out1[3] = x3 } -// The function p256OrdFromMontgomery translates a field element out of the Montgomery domain. 
-// -// Preconditions: -// -// 0 ≤ eval arg1 < m -// -// Postconditions: -// -// eval out1 mod m = (eval arg1 * ((2^64)⁻¹ mod m)^4) mod m -// 0 ≤ eval out1 < m +// p256OrdFromMontgomery translates a field element out of the Montgomery domain. func p256OrdFromMontgomery(out1 *p256OrdElement, arg1 *p256OrdMontElement) { x1 := arg1[0] _, x2 := bits.Mul64(x1, 0xccd1c8aaee00bc4f) @@ -510,31 +430,14 @@ func p256OrdFromMontgomery(out1 *p256OrdElement, arg1 *p256OrdMontElement) { x119, x120 := bits.Sub64(x112, 0xbce6faada7179e84, x118) x121, x122 := bits.Sub64(x114, 0xffffffffffffffff, x120) x123, x124 := bits.Sub64(x116, 0xffffffff00000000, x122) - _, x126 := bits.Sub64(0, 0, x124) - var x127 uint64 - p256OrdCmovznzU64(&x127, x126, x117, x110) - var x128 uint64 - p256OrdCmovznzU64(&x128, x126, x119, x112) - var x129 uint64 - p256OrdCmovznzU64(&x129, x126, x121, x114) - var x130 uint64 - p256OrdCmovznzU64(&x130, x126, x123, x116) - out1[0] = x127 - out1[1] = x128 - out1[2] = x129 - out1[3] = x130 + mask, _ := bits.Sub64(0, 0, x124) + out1[0] = x117&^mask | x110&mask + out1[1] = x119&^mask | x112&mask + out1[2] = x121&^mask | x114&mask + out1[3] = x123&^mask | x116&mask } -// The function p256OrdToMontgomery translates a field element into the Montgomery domain. -// -// Preconditions: -// -// 0 ≤ eval arg1 < m -// -// Postconditions: -// -// eval (from_montgomery out1) mod m = eval arg1 mod m -// 0 ≤ eval out1 < m +// p256OrdToMontgomery translates a field element into the Montgomery domain. 
func p256OrdToMontgomery(out1 *p256OrdMontElement, arg1 *p256OrdElement) { x1 := arg1[1] x2 := arg1[2] @@ -637,37 +540,9 @@ func p256OrdToMontgomery(out1 *p256OrdMontElement, arg1 *p256OrdElement) { x193, x194 := bits.Sub64(x185, 0xffffffffffffffff, x192) x195, x196 := bits.Sub64(x187, 0xffffffff00000000, x194) _, x198 := bits.Sub64(x188, 0, x196) - var x199 uint64 - p256OrdCmovznzU64(&x199, x198, x189, x181) - var x200 uint64 - p256OrdCmovznzU64(&x200, x198, x191, x183) - var x201 uint64 - p256OrdCmovznzU64(&x201, x198, x193, x185) - var x202 uint64 - p256OrdCmovznzU64(&x202, x198, x195, x187) - out1[0] = x199 - out1[1] = x200 - out1[2] = x201 - out1[3] = x202 -} - -// The function p256OrdCmovznzU64 is a single-word conditional move. -// -// Postconditions: -// -// out1 = (if arg1 = 0 then arg2 else arg3) -// -// Input Bounds: -// -// arg1: [0x0 ~> 0x1] -// arg2: [0x0 ~> 0xffffffffffffffff] -// arg3: [0x0 ~> 0xffffffffffffffff] -// -// Output Bounds: -// -// out1: [0x0 ~> 0xffffffffffffffff] -func p256OrdCmovznzU64(out1 *uint64, arg1 uint64, arg2 uint64, arg3 uint64) { - x1 := arg1 * 0xffffffffffffffff - x2 := x1&arg3 | ^x1&arg2 - *out1 = x2 + mask, _ := bits.Sub64(0, 0, x198) + out1[0] = x189&^mask | x181&mask + out1[1] = x191&^mask | x183&mask + out1[2] = x193&^mask | x185&mask + out1[3] = x195&^mask | x187&mask }