crypto/internal/fips140/nistec: optimize P-256 scalar fiat implementation

This reduces the regression of CL 669895 by about half on arm64.

host: linux-amd64_c2s16
                      3227c963a3c   12743b9f2c8   ccb659cd879   9a11e9167c3
                        sec/op       vs base       vs base       vs base
Sign/P256-16            33.38µ       +5.23%        +5.04%        +5.41%
Verify/P256-16          74.11µ       +2.54%        +2.49%        +2.69%
GenerateKey/P256-16     14.61µ         ~             ~             ~
geomean                 33.06µ       +2.50%        +2.41%        +2.63%

                        B/op         vs base       vs base       vs base
Sign/P256-16            5.922Ki      -0.53%        -0.53%        -0.53%
Verify/P256-16            576.0      -5.56%        -5.56%        -5.56%
GenerateKey/P256-16       984.0        ~             ~             ~
geomean                 1.474Ki      -2.06%        -2.06%        -2.06%

                        allocs/op    vs base       vs base       vs base
Sign/P256-16              59.00      -1.69%        -1.69%        -1.69%
Verify/P256-16            10.00     -10.00%       -10.00%       -10.00%
GenerateKey/P256-16       16.00        ~             ~             ~
geomean                   21.13      -4.00%        -4.00%        -4.00%

host: linux-arm64_c4as16
                      3227c963a3c   12743b9f2c8   ccb659cd879   9a11e9167c3
                        sec/op       vs base       vs base       vs base
Sign/P256-16            29.29µ       +8.88%        +8.94%        +5.41%
Verify/P256-16          69.25µ       +3.52%        +3.48%        +2.21%
GenerateKey/P256-16     15.17µ         ~             ~             ~
geomean                 31.34µ       +4.05%        +3.97%        +2.51%

                        B/op         vs base       vs base       vs base
Sign/P256-16            5.922Ki      -0.53%        -0.53%        -0.53%
Verify/P256-16            576.0      -5.56%        -5.56%        -5.56%
GenerateKey/P256-16       984.0        ~             ~             ~
geomean                 1.474Ki      -2.06%        -2.06%        -2.06%

                        allocs/op    vs base       vs base       vs base
Sign/P256-16              59.00      -1.69%        -1.69%        -1.69%
Verify/P256-16            10.00     -10.00%       -10.00%       -10.00%
GenerateKey/P256-16       16.00        ~             ~             ~
geomean                   21.13      -4.00%        -4.00%        -4.00%

Change-Id: I69adc8175acf0082dca7c8a13d5f62046a6a6964
Reviewed-on: https://go-review.googlesource.com/c/go/+/749141
Auto-Submit: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Neal Patel <neal@golang.org>
Reviewed-by: Roland Shoemaker <roland@golang.org>
LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Neal Patel <nealpatel@google.com>
This commit is contained in:
Filippo Valsorda 2026-02-25 14:12:15 +01:00 committed by Gopher Robot
parent e4e6887cee
commit 2c659bb4db

View file

@ -71,14 +71,8 @@ func P256OrdInverse(k *[4]uint64) {
p256OrdFromMontgomery(j, x)
}
// p256OrdSqr squares arg1 into out1 with p256OrdSquare and then squares out1
// in place n-1 more times. When n is 1 or less, exactly one squaring is
// performed.
func p256OrdSqr(out1, arg1 *p256OrdMontElement, n int) {
	p256OrdSquare(out1, arg1)
	for remaining := n - 1; remaining > 0; remaining-- {
		p256OrdSquare(out1, out1)
	}
}
// The code below was generated by Fiat Cryptography v0.1.6-63-g92ee794c2.
// The code below was generated by Fiat Cryptography v0.1.6-63-g92ee794c2, and
// then manually formatted and optimized.
//
// word-by-word-montgomery --lang Go --no-wide-int
// --relax-primitive-carry-to-bitwidth 32,64 --cmovznz-by-mul --static
@ -112,60 +106,12 @@ func p256OrdSqr(out1, arg1 *p256OrdMontElement, n int) {
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Autogenerated: fiat_crypto.js word-by-word-montgomery --lang Go --no-wide-int --relax-primitive-carry-to-bitwidth 32,64 --cmovznz-by-mul --static --package-case flatcase --private-function-case camelCase --private-type-case camelCase --no-prefix-fiat --package-name nistec p256Ord 64 2^256-2^224+2^192-89188191075325690597107910205041859247 mul square from_montgomery to_montgomery
//
// curve description: p256Ord
//
// machine_wordsize = 64 (from "64")
//
// requested operations: mul, square, from_montgomery, to_montgomery
//
// m = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551 (from "2^256-2^224+2^192-89188191075325690597107910205041859247")
//
//
//
// NOTE: In addition to the bounds specified above each function, all
//
// functions synthesized for this Montgomery arithmetic require the
//
// input to be strictly less than the prime modulus (m), and also
//
// require the input to be in the unique saturated representation.
//
// All functions also ensure that these two properties are true of
//
// return values.
//
//
//
// Computed values:
//
// eval z = z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192)
//
// bytes_eval z = z[0] + (z[1] << 8) + (z[2] << 16) + (z[3] << 24) + (z[4] << 32) + (z[5] << 40) + (z[6] << 48) + (z[7] << 56) + (z[8] << 64) + (z[9] << 72) + (z[10] << 80) + (z[11] << 88) + (z[12] << 96) + (z[13] << 104) + (z[14] << 112) + (z[15] << 120) + (z[16] << 128) + (z[17] << 136) + (z[18] << 144) + (z[19] << 152) + (z[20] << 160) + (z[21] << 168) + (z[22] << 176) + (z[23] << 184) + (z[24] << 192) + (z[25] << 200) + (z[26] << 208) + (z[27] << 216) + (z[28] << 224) + (z[29] << 232) + (z[30] << 240) + (z[31] << 248)
//
// twos_complement_eval z = let x1 := z[0] + (z[1] << 64) + (z[2] << 128) + (z[3] << 192) in
//
// if x1 & (2^256-1) < 2^255 then x1 & (2^256-1) else (x1 & (2^256-1)) - 2^256
type p256OrdUint1 = uint64 // We use uint64 instead of a more narrow type for performance reasons; see https://github.com/mit-plv/fiat-crypto/pull/1006#issuecomment-892625927
// The type p256OrdMontElement is a field element in the Montgomery domain.
//
// Bounds: [[0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff], [0x0 ~> 0xffffffffffffffff]]
// p256OrdMontElement is a scalar field element in the Montgomery domain, as
// four uint64 limbs in little-endian order. It must be strictly less than
// ord(G) and in Montgomery form (with R = 2²⁵⁶).
type p256OrdMontElement [4]uint64
// The function p256OrdMul multiplies two field elements in the Montgomery domain.
//
// Preconditions:
//
// 0 ≤ eval arg1 < m
// 0 ≤ eval arg2 < m
//
// Postconditions:
//
// eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg2)) mod m
// 0 ≤ eval out1 < m
// p256OrdMul multiplies two field elements in the Montgomery domain.
func p256OrdMul(out1 *p256OrdMontElement, arg1 *p256OrdMontElement, arg2 *p256OrdMontElement) {
x1 := arg1[1]
x2 := arg1[2]
@ -282,170 +228,144 @@ func p256OrdMul(out1 *p256OrdMontElement, arg1 *p256OrdMontElement, arg2 *p256Or
x210, x211 := bits.Sub64(x201, 0xffffffffffffffff, x209)
x212, x213 := bits.Sub64(x203, 0xffffffff00000000, x211)
_, x215 := bits.Sub64(x205, 0, x213)
var x216 uint64
p256OrdCmovznzU64(&x216, x215, x206, x197)
var x217 uint64
p256OrdCmovznzU64(&x217, x215, x208, x199)
var x218 uint64
p256OrdCmovznzU64(&x218, x215, x210, x201)
var x219 uint64
p256OrdCmovznzU64(&x219, x215, x212, x203)
out1[0] = x216
out1[1] = x217
out1[2] = x218
out1[3] = x219
mask, _ := bits.Sub64(0, 0, x215)
out1[0] = x206&^mask | x197&mask
out1[1] = x208&^mask | x199&mask
out1[2] = x210&^mask | x201&mask
out1[3] = x212&^mask | x203&mask
}
// The function p256OrdSquare squares a field element in the Montgomery domain.
//
// Preconditions:
//
// 0 ≤ eval arg1 < m
//
// Postconditions:
//
// eval (from_montgomery out1) mod m = (eval (from_montgomery arg1) * eval (from_montgomery arg1)) mod m
// 0 ≤ eval out1 < m
func p256OrdSquare(out1 *p256OrdMontElement, arg1 *p256OrdMontElement) {
// p256OrdSqr squares a field element in the Montgomery domain n times.
func p256OrdSqr(out1 *p256OrdMontElement, arg1 *p256OrdMontElement, n int) {
// Load the four limbs of arg1 into locals. Note the naming: x4 holds limb 0,
// while x1, x2 and x3 hold limbs 1, 2 and 3.
x1 := arg1[1]
x2 := arg1[2]
x3 := arg1[3]
x4 := arg1[0]
x6, x5 := bits.Mul64(x4, arg1[3])
x8, x7 := bits.Mul64(x4, arg1[2])
x10, x9 := bits.Mul64(x4, arg1[1])
x12, x11 := bits.Mul64(x4, arg1[0])
x13, x14 := bits.Add64(x12, x9, 0)
x15, x16 := bits.Add64(x10, x7, x14)
x17, x18 := bits.Add64(x8, x5, x16)
x19 := x18 + x6
_, x20 := bits.Mul64(x11, 0xccd1c8aaee00bc4f)
x23, x22 := bits.Mul64(x20, 0xffffffff00000000)
x25, x24 := bits.Mul64(x20, 0xffffffffffffffff)
x27, x26 := bits.Mul64(x20, 0xbce6faada7179e84)
x29, x28 := bits.Mul64(x20, 0xf3b9cac2fc632551)
x30, x31 := bits.Add64(x29, x26, 0)
x32, x33 := bits.Add64(x27, x24, x31)
x34, x35 := bits.Add64(x25, x22, x33)
x36 := x35 + x23
_, x38 := bits.Add64(x11, x28, 0)
x39, x40 := bits.Add64(x13, x30, x38)
x41, x42 := bits.Add64(x15, x32, x40)
x43, x44 := bits.Add64(x17, x34, x42)
x45, x46 := bits.Add64(x19, x36, x44)
x48, x47 := bits.Mul64(x1, arg1[3])
x50, x49 := bits.Mul64(x1, arg1[2])
x52, x51 := bits.Mul64(x1, arg1[1])
x54, x53 := bits.Mul64(x1, arg1[0])
x55, x56 := bits.Add64(x54, x51, 0)
x57, x58 := bits.Add64(x52, x49, x56)
x59, x60 := bits.Add64(x50, x47, x58)
x61 := x60 + x48
x62, x63 := bits.Add64(x39, x53, 0)
x64, x65 := bits.Add64(x41, x55, x63)
x66, x67 := bits.Add64(x43, x57, x65)
x68, x69 := bits.Add64(x45, x59, x67)
x70, x71 := bits.Add64(x46, x61, x69)
_, x72 := bits.Mul64(x62, 0xccd1c8aaee00bc4f)
x75, x74 := bits.Mul64(x72, 0xffffffff00000000)
x77, x76 := bits.Mul64(x72, 0xffffffffffffffff)
x79, x78 := bits.Mul64(x72, 0xbce6faada7179e84)
x81, x80 := bits.Mul64(x72, 0xf3b9cac2fc632551)
x82, x83 := bits.Add64(x81, x78, 0)
x84, x85 := bits.Add64(x79, x76, x83)
x86, x87 := bits.Add64(x77, x74, x85)
x88 := x87 + x75
_, x90 := bits.Add64(x62, x80, 0)
x91, x92 := bits.Add64(x64, x82, x90)
x93, x94 := bits.Add64(x66, x84, x92)
x95, x96 := bits.Add64(x68, x86, x94)
x97, x98 := bits.Add64(x70, x88, x96)
x99 := x98 + x71
x101, x100 := bits.Mul64(x2, arg1[3])
x103, x102 := bits.Mul64(x2, arg1[2])
x105, x104 := bits.Mul64(x2, arg1[1])
x107, x106 := bits.Mul64(x2, arg1[0])
x108, x109 := bits.Add64(x107, x104, 0)
x110, x111 := bits.Add64(x105, x102, x109)
x112, x113 := bits.Add64(x103, x100, x111)
x114 := x113 + x101
x115, x116 := bits.Add64(x91, x106, 0)
x117, x118 := bits.Add64(x93, x108, x116)
x119, x120 := bits.Add64(x95, x110, x118)
x121, x122 := bits.Add64(x97, x112, x120)
x123, x124 := bits.Add64(x99, x114, x122)
_, x125 := bits.Mul64(x115, 0xccd1c8aaee00bc4f)
x128, x127 := bits.Mul64(x125, 0xffffffff00000000)
x130, x129 := bits.Mul64(x125, 0xffffffffffffffff)
x132, x131 := bits.Mul64(x125, 0xbce6faada7179e84)
x134, x133 := bits.Mul64(x125, 0xf3b9cac2fc632551)
x135, x136 := bits.Add64(x134, x131, 0)
x137, x138 := bits.Add64(x132, x129, x136)
x139, x140 := bits.Add64(x130, x127, x138)
x141 := x140 + x128
_, x143 := bits.Add64(x115, x133, 0)
x144, x145 := bits.Add64(x117, x135, x143)
x146, x147 := bits.Add64(x119, x137, x145)
x148, x149 := bits.Add64(x121, x139, x147)
x150, x151 := bits.Add64(x123, x141, x149)
x152 := x151 + x124
x154, x153 := bits.Mul64(x3, arg1[3])
x156, x155 := bits.Mul64(x3, arg1[2])
x158, x157 := bits.Mul64(x3, arg1[1])
x160, x159 := bits.Mul64(x3, arg1[0])
x161, x162 := bits.Add64(x160, x157, 0)
x163, x164 := bits.Add64(x158, x155, x162)
x165, x166 := bits.Add64(x156, x153, x164)
x167 := x166 + x154
x168, x169 := bits.Add64(x144, x159, 0)
x170, x171 := bits.Add64(x146, x161, x169)
x172, x173 := bits.Add64(x148, x163, x171)
x174, x175 := bits.Add64(x150, x165, x173)
x176, x177 := bits.Add64(x152, x167, x175)
_, x178 := bits.Mul64(x168, 0xccd1c8aaee00bc4f)
x181, x180 := bits.Mul64(x178, 0xffffffff00000000)
x183, x182 := bits.Mul64(x178, 0xffffffffffffffff)
x185, x184 := bits.Mul64(x178, 0xbce6faada7179e84)
x187, x186 := bits.Mul64(x178, 0xf3b9cac2fc632551)
x188, x189 := bits.Add64(x187, x184, 0)
x190, x191 := bits.Add64(x185, x182, x189)
x192, x193 := bits.Add64(x183, x180, x191)
x194 := x193 + x181
_, x196 := bits.Add64(x168, x186, 0)
x197, x198 := bits.Add64(x170, x188, x196)
x199, x200 := bits.Add64(x172, x190, x198)
x201, x202 := bits.Add64(x174, x192, x200)
x203, x204 := bits.Add64(x176, x194, x202)
x205 := x204 + x177
x206, x207 := bits.Sub64(x197, 0xf3b9cac2fc632551, 0)
x208, x209 := bits.Sub64(x199, 0xbce6faada7179e84, x207)
x210, x211 := bits.Sub64(x201, 0xffffffffffffffff, x209)
x212, x213 := bits.Sub64(x203, 0xffffffff00000000, x211)
_, x215 := bits.Sub64(x205, 0, x213)
var x216 uint64
p256OrdCmovznzU64(&x216, x215, x206, x197)
var x217 uint64
p256OrdCmovznzU64(&x217, x215, x208, x199)
var x218 uint64
p256OrdCmovznzU64(&x218, x215, x210, x201)
var x219 uint64
p256OrdCmovznzU64(&x219, x215, x212, x203)
out1[0] = x216
out1[1] = x217
out1[2] = x218
out1[3] = x219
// Each iteration reads the current value from x4, x1, x2, x3, squares it, and
// stores the reduced limbs back into the same four locals, so out1 is only
// written once after the loop.
for range n {
x6, x5 := bits.Mul64(x4, x3)
x8, x7 := bits.Mul64(x4, x2)
x10, x9 := bits.Mul64(x4, x1)
x12, x11 := bits.Mul64(x4, x4)
x13, x14 := bits.Add64(x12, x9, 0)
x15, x16 := bits.Add64(x10, x7, x14)
x17, x18 := bits.Add64(x8, x5, x16)
x19 := x18 + x6
_, x20 := bits.Mul64(x11, 0xccd1c8aaee00bc4f)
x23, x22 := bits.Mul64(x20, 0xffffffff00000000)
x25, x24 := bits.Mul64(x20, 0xffffffffffffffff)
x27, x26 := bits.Mul64(x20, 0xbce6faada7179e84)
x29, x28 := bits.Mul64(x20, 0xf3b9cac2fc632551)
x30, x31 := bits.Add64(x29, x26, 0)
x32, x33 := bits.Add64(x27, x24, x31)
x34, x35 := bits.Add64(x25, x22, x33)
x36 := x35 + x23
_, x38 := bits.Add64(x11, x28, 0)
x39, x40 := bits.Add64(x13, x30, x38)
x41, x42 := bits.Add64(x15, x32, x40)
x43, x44 := bits.Add64(x17, x34, x42)
x45, x46 := bits.Add64(x19, x36, x44)
x48, x47 := bits.Mul64(x1, x3)
x50, x49 := bits.Mul64(x1, x2)
x52, x51 := bits.Mul64(x1, x1)
x54, x53 := bits.Mul64(x1, x4)
x55, x56 := bits.Add64(x54, x51, 0)
x57, x58 := bits.Add64(x52, x49, x56)
x59, x60 := bits.Add64(x50, x47, x58)
x61 := x60 + x48
x62, x63 := bits.Add64(x39, x53, 0)
x64, x65 := bits.Add64(x41, x55, x63)
x66, x67 := bits.Add64(x43, x57, x65)
x68, x69 := bits.Add64(x45, x59, x67)
x70, x71 := bits.Add64(x46, x61, x69)
_, x72 := bits.Mul64(x62, 0xccd1c8aaee00bc4f)
x75, x74 := bits.Mul64(x72, 0xffffffff00000000)
x77, x76 := bits.Mul64(x72, 0xffffffffffffffff)
x79, x78 := bits.Mul64(x72, 0xbce6faada7179e84)
x81, x80 := bits.Mul64(x72, 0xf3b9cac2fc632551)
x82, x83 := bits.Add64(x81, x78, 0)
x84, x85 := bits.Add64(x79, x76, x83)
x86, x87 := bits.Add64(x77, x74, x85)
x88 := x87 + x75
_, x90 := bits.Add64(x62, x80, 0)
x91, x92 := bits.Add64(x64, x82, x90)
x93, x94 := bits.Add64(x66, x84, x92)
x95, x96 := bits.Add64(x68, x86, x94)
x97, x98 := bits.Add64(x70, x88, x96)
x99 := x98 + x71
x101, x100 := bits.Mul64(x2, x3)
x103, x102 := bits.Mul64(x2, x2)
x105, x104 := bits.Mul64(x2, x1)
x107, x106 := bits.Mul64(x2, x4)
x108, x109 := bits.Add64(x107, x104, 0)
x110, x111 := bits.Add64(x105, x102, x109)
x112, x113 := bits.Add64(x103, x100, x111)
x114 := x113 + x101
x115, x116 := bits.Add64(x91, x106, 0)
x117, x118 := bits.Add64(x93, x108, x116)
x119, x120 := bits.Add64(x95, x110, x118)
x121, x122 := bits.Add64(x97, x112, x120)
x123, x124 := bits.Add64(x99, x114, x122)
_, x125 := bits.Mul64(x115, 0xccd1c8aaee00bc4f)
x128, x127 := bits.Mul64(x125, 0xffffffff00000000)
x130, x129 := bits.Mul64(x125, 0xffffffffffffffff)
x132, x131 := bits.Mul64(x125, 0xbce6faada7179e84)
x134, x133 := bits.Mul64(x125, 0xf3b9cac2fc632551)
x135, x136 := bits.Add64(x134, x131, 0)
x137, x138 := bits.Add64(x132, x129, x136)
x139, x140 := bits.Add64(x130, x127, x138)
x141 := x140 + x128
_, x143 := bits.Add64(x115, x133, 0)
x144, x145 := bits.Add64(x117, x135, x143)
x146, x147 := bits.Add64(x119, x137, x145)
x148, x149 := bits.Add64(x121, x139, x147)
x150, x151 := bits.Add64(x123, x141, x149)
x152 := x151 + x124
x154, x153 := bits.Mul64(x3, x3)
x156, x155 := bits.Mul64(x3, x2)
x158, x157 := bits.Mul64(x3, x1)
x160, x159 := bits.Mul64(x3, x4)
x161, x162 := bits.Add64(x160, x157, 0)
x163, x164 := bits.Add64(x158, x155, x162)
x165, x166 := bits.Add64(x156, x153, x164)
x167 := x166 + x154
x168, x169 := bits.Add64(x144, x159, 0)
x170, x171 := bits.Add64(x146, x161, x169)
x172, x173 := bits.Add64(x148, x163, x171)
x174, x175 := bits.Add64(x150, x165, x173)
x176, x177 := bits.Add64(x152, x167, x175)
_, x178 := bits.Mul64(x168, 0xccd1c8aaee00bc4f)
x181, x180 := bits.Mul64(x178, 0xffffffff00000000)
x183, x182 := bits.Mul64(x178, 0xffffffffffffffff)
x185, x184 := bits.Mul64(x178, 0xbce6faada7179e84)
x187, x186 := bits.Mul64(x178, 0xf3b9cac2fc632551)
x188, x189 := bits.Add64(x187, x184, 0)
x190, x191 := bits.Add64(x185, x182, x189)
x192, x193 := bits.Add64(x183, x180, x191)
x194 := x193 + x181
_, x196 := bits.Add64(x168, x186, 0)
x197, x198 := bits.Add64(x170, x188, x196)
x199, x200 := bits.Add64(x172, x190, x198)
x201, x202 := bits.Add64(x174, x192, x200)
x203, x204 := bits.Add64(x176, x194, x202)
x205 := x204 + x177
// Trial-subtract the modulus, least significant limb first. x215 is the
// borrow out of the top word.
x206, x207 := bits.Sub64(x197, 0xf3b9cac2fc632551, 0)
x208, x209 := bits.Sub64(x199, 0xbce6faada7179e84, x207)
x210, x211 := bits.Sub64(x201, 0xffffffffffffffff, x209)
x212, x213 := bits.Sub64(x203, 0xffffffff00000000, x211)
_, x215 := bits.Sub64(x205, 0, x213)
// mask is 0 - x215: all ones when the trial subtraction borrowed, zero
// otherwise. The selects below therefore keep the unsubtracted limbs on
// borrow and the subtracted limbs otherwise, without branching.
mask, _ := bits.Sub64(0, 0, x215)
x4 = x206&^mask | x197&mask
x1 = x208&^mask | x199&mask
x2 = x210&^mask | x201&mask
x3 = x212&^mask | x203&mask
}
// Store the result limbs: x4 is limb 0, x1..x3 are limbs 1..3.
out1[0] = x4
out1[1] = x1
out1[2] = x2
out1[3] = x3
}
// The function p256OrdFromMontgomery translates a field element out of the Montgomery domain.
//
// Preconditions:
//
// 0 ≤ eval arg1 < m
//
// Postconditions:
//
// eval out1 mod m = (eval arg1 * ((2^64)⁻¹ mod m)^4) mod m
// 0 ≤ eval out1 < m
// p256OrdFromMontgomery translates a field element out of the Montgomery domain.
func p256OrdFromMontgomery(out1 *p256OrdElement, arg1 *p256OrdMontElement) {
x1 := arg1[0]
_, x2 := bits.Mul64(x1, 0xccd1c8aaee00bc4f)
@ -510,31 +430,14 @@ func p256OrdFromMontgomery(out1 *p256OrdElement, arg1 *p256OrdMontElement) {
x119, x120 := bits.Sub64(x112, 0xbce6faada7179e84, x118)
x121, x122 := bits.Sub64(x114, 0xffffffffffffffff, x120)
x123, x124 := bits.Sub64(x116, 0xffffffff00000000, x122)
_, x126 := bits.Sub64(0, 0, x124)
var x127 uint64
p256OrdCmovznzU64(&x127, x126, x117, x110)
var x128 uint64
p256OrdCmovznzU64(&x128, x126, x119, x112)
var x129 uint64
p256OrdCmovznzU64(&x129, x126, x121, x114)
var x130 uint64
p256OrdCmovznzU64(&x130, x126, x123, x116)
out1[0] = x127
out1[1] = x128
out1[2] = x129
out1[3] = x130
mask, _ := bits.Sub64(0, 0, x124)
out1[0] = x117&^mask | x110&mask
out1[1] = x119&^mask | x112&mask
out1[2] = x121&^mask | x114&mask
out1[3] = x123&^mask | x116&mask
}
// The function p256OrdToMontgomery translates a field element into the Montgomery domain.
//
// Preconditions:
//
// 0 ≤ eval arg1 < m
//
// Postconditions:
//
// eval (from_montgomery out1) mod m = eval arg1 mod m
// 0 ≤ eval out1 < m
// p256OrdToMontgomery translates a field element into the Montgomery domain.
func p256OrdToMontgomery(out1 *p256OrdMontElement, arg1 *p256OrdElement) {
x1 := arg1[1]
x2 := arg1[2]
@ -637,37 +540,9 @@ func p256OrdToMontgomery(out1 *p256OrdMontElement, arg1 *p256OrdElement) {
x193, x194 := bits.Sub64(x185, 0xffffffffffffffff, x192)
x195, x196 := bits.Sub64(x187, 0xffffffff00000000, x194)
_, x198 := bits.Sub64(x188, 0, x196)
var x199 uint64
p256OrdCmovznzU64(&x199, x198, x189, x181)
var x200 uint64
p256OrdCmovznzU64(&x200, x198, x191, x183)
var x201 uint64
p256OrdCmovznzU64(&x201, x198, x193, x185)
var x202 uint64
p256OrdCmovznzU64(&x202, x198, x195, x187)
out1[0] = x199
out1[1] = x200
out1[2] = x201
out1[3] = x202
}
// The function p256OrdCmovznzU64 is a single-word conditional move.
//
// Postconditions:
//
// out1 = (if arg1 = 0 then arg2 else arg3)
//
// Input Bounds:
//
// arg1: [0x0 ~> 0x1]
// arg2: [0x0 ~> 0xffffffffffffffff]
// arg3: [0x0 ~> 0xffffffffffffffff]
//
// Output Bounds:
//
// out1: [0x0 ~> 0xffffffffffffffff]
func p256OrdCmovznzU64(out1 *uint64, arg1 uint64, arg2 uint64, arg3 uint64) {
x1 := arg1 * 0xffffffffffffffff
x2 := x1&arg3 | ^x1&arg2
*out1 = x2
mask, _ := bits.Sub64(0, 0, x198)
out1[0] = x189&^mask | x181&mask
out1[1] = x191&^mask | x183&mask
out1[2] = x193&^mask | x185&mask
out1[3] = x195&^mask | x187&mask
}