mirror of
https://github.com/golang/go.git
synced 2026-06-27 19:30:52 +00:00
crypto/internal/fips140/aes/gcm: constant-time GHASH
Replace our variable time GHASH implementation with a constant-time one. This implementation is slower on platforms which lack native AES instructions (the only places we use the generic implementation), but that is a reasonable trade-off to remove one of the remaining variable-time pieces of the generic AES implementation. The technique used in this implementation is based on the technique described in the BearSSL constant-time notes[0] with Karatsuba multiplication to decompose the 128-bit multiplication into smaller pieces. [0] https://www.bearssl.org/constanttime.html#ghash-for-gcm goos: darwin goarch: arm64 pkg: crypto/internal/fips140/aes/gcm cpu: Apple M1 Pro │ ghash-vt.bench │ ghash-ct.bench │ │ sec/op │ sec/op vs base │ GHASH/16-10 72.41n ± 2% 129.85n ± 1% +79.31% (p=0.000 n=10) GHASH/32-10 130.5n ± 3% 243.3n ± 0% +86.44% (p=0.000 n=10) GHASH/64-10 248.7n ± 1% 474.2n ± 1% +90.69% (p=0.000 n=10) GHASH/128-10 487.0n ± 1% 953.3n ± 2% +95.75% (p=0.000 n=10) GHASH/256-10 952.7n ± 2% 1893.0n ± 3% +98.71% (p=0.000 n=10) GHASH/512-10 1.893µ ± 1% 3.775µ ± 2% +99.47% (p=0.000 n=10) GHASH/1024-10 3.777µ ± 2% 7.472µ ± 2% +97.84% (p=0.000 n=10) geomean 499.8n 962.0n +92.47% │ aes-vt-ghash.bench │ aes-ct-ghash.bench │ │ sec/op │ sec/op vs base │ AESGCM/Open-128-64-10 688.6n ± 2% 1007.5n ± 4% +46.31% (p=0.000 n=10) AESGCM/Seal-128-64-10 672.8n ± 2% 998.0n ± 1% +48.34% (p=0.000 n=10) AESGCM/Open-256-64-10 788.2n ± 0% 1110.0n ± 0% +40.83% (p=0.000 n=10) AESGCM/Seal-256-64-10 789.2n ± 4% 1104.0n ± 0% +39.89% (p=0.000 n=10) AESGCM/Open-128-1350-10 9.535µ ± 2% 14.091µ ± 0% +47.78% (p=0.000 n=10) AESGCM/Seal-128-1350-10 9.512µ ± 1% 14.065µ ± 0% +47.87% (p=0.000 n=10) AESGCM/Open-256-1350-10 11.08µ ± 0% 15.67µ ± 3% +41.45% (p=0.000 n=10) AESGCM/Seal-256-1350-10 11.20µ ± 2% 15.96µ ± 1% +42.46% (p=0.000 n=10) AESGCM/Open-128-8192-10 55.66µ ± 23% 84.01µ ± 1% +50.94% (p=0.002 n=10) AESGCM/Seal-128-8192-10 55.44µ ± 1% 83.85µ ± 5% +51.23% (p=0.000 n=10) AESGCM/Open-256-8192-10 64.88µ 
± 2% 93.23µ ± 0% +43.70% (p=0.000 n=10) AESGCM/Seal-256-8192-10 64.47µ ± 1% 93.15µ ± 0% +44.48% (p=0.000 n=10) geomean 7.676µ 11.16µ +45.39% Updates #69025 Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64-longtest,x_crypto-gotip-linux-amd64-longtest Change-Id: I29f916ce30bfdb5c83885369e1cb6aff5ea5d4fe Reviewed-on: https://go-review.googlesource.com/c/go/+/746120 LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Daniel McCarney <daniel@binaryparadox.net> Reviewed-by: Neal Patel <nealpatel@google.com> Reviewed-by: Filippo Valsorda <filippo@golang.org> Auto-Submit: Roland Shoemaker <roland@golang.org>
This commit is contained in:
parent
c1f0b9bdba
commit
71c7ea1c6c
2 changed files with 147 additions and 135 deletions
|
|
@ -23,10 +23,6 @@ const (
|
|||
|
||||
// NewGCM returns the given 128-bit block cipher wrapped in Galois Counter Mode
|
||||
// with the standard nonce length.
|
||||
//
|
||||
// In general, the GHASH operation performed by this implementation of GCM is not constant-time.
|
||||
// An exception is when the underlying [Block] was created by aes.NewCipher
|
||||
// on systems with hardware support for AES. See the [crypto/aes] package documentation for details.
|
||||
func NewGCM(cipher Block) (AEAD, error) {
|
||||
if fips140only.Enforced() {
|
||||
return nil, errors.New("crypto/cipher: use of GCM with arbitrary IVs is not allowed in FIPS 140-only mode, use NewGCMWithRandomNonce")
|
||||
|
|
|
|||
|
|
@ -9,18 +9,6 @@ import (
|
|||
"crypto/internal/fips140deps/byteorder"
|
||||
)
|
||||
|
||||
// gcmFieldElement represents a value in GF(2¹²⁸). In order to reflect the GCM
|
||||
// standard and make binary.BigEndian suitable for marshaling these values, the
|
||||
// bits are stored in big endian order. For example:
|
||||
//
|
||||
// the coefficient of x⁰ can be obtained by v.low >> 63.
|
||||
// the coefficient of x⁶³ can be obtained by v.low & 1.
|
||||
// the coefficient of x⁶⁴ can be obtained by v.high >> 63.
|
||||
// the coefficient of x¹²⁷ can be obtained by v.high & 1.
|
||||
type gcmFieldElement struct {
|
||||
low, high uint64
|
||||
}
|
||||
|
||||
// GHASH is exposed to allow crypto/cipher to implement non-AES GCM modes.
|
||||
// It is not allowed as a stand-alone operation in FIPS mode because it
|
||||
// is not ACVP tested.
|
||||
|
|
@ -31,133 +19,161 @@ func GHASH(key *[16]byte, inputs ...[]byte) []byte {
|
|||
return out[:]
|
||||
}
|
||||
|
||||
// ghash is a variable-time generic implementation of GHASH, which shouldn't
|
||||
// be used on any architecture with hardware support for AES-GCM.
|
||||
//
|
||||
// Each input is zero-padded to 128-bit before being absorbed.
|
||||
// ghashMul does constant-time carry-less multiplication of two 32-bit integers,
|
||||
// returning the 64-bit product.
|
||||
func ghashMul(x, y uint32) uint64 {
|
||||
// This function implements carryless multiplication using a technique first
|
||||
// described by Thomas Pornin in the BearSSL documentation [0]. This
|
||||
// technique uses generic integer multiplication, but ignores the carries by
|
||||
// masking all but 8 bits of each input, leaving three zeroed bits (holes) between
|
||||
// each unmasked bit. If the multiplications of any of the unmasked bits
|
||||
// then cause a carry, the resulting carry bit spills into one of the three
|
||||
// bit holes.
|
||||
//
|
||||
// Each 32-bit input is split into four 32-bit masked values, each
|
||||
// containing 8 unmasked bits. The mask is shifted by one bit for each of
|
||||
// the four values, such that the four values cover the full 32 bits of the
|
||||
// input.
|
||||
//
|
||||
// In order to compute the bits at position z_k, z_k+4, z_k+8, ..., z_k+60
|
||||
// for k = 0, 1, 2, 3, we compute the sum of the products x_i*y_j for all i,
|
||||
// j such that i+j = k mod 4.
|
||||
//
|
||||
// We then mask the sum of each of the four products with the same mask used
|
||||
// for the input values, which zeros out any spilled carry bits, and OR the
|
||||
// masked values to get the final product.
|
||||
//
|
||||
// [0] https://www.bearssl.org/constanttime.html#ghash-for-gcm
|
||||
|
||||
var xm, ym [4]uint32
|
||||
var z [4]uint64
|
||||
|
||||
for i := range 4 {
|
||||
// Mask off the three bit holes in each input, creating four masked
|
||||
// values for each input.
|
||||
xm[i] = x & (0x11111111 << i)
|
||||
ym[i] = y & (0x11111111 << i)
|
||||
}
|
||||
|
||||
for i := range 4 {
|
||||
// Compute the multiplication of x by the circulant matrix of y, using
|
||||
// XOR to get carryless addition of the products:
|
||||
//
|
||||
// | z[0] | | ym[0] ym[3] ym[2] ym[1] | | xm[0] |
|
||||
// | z[1] | = | ym[1] ym[0] ym[3] ym[2] | x | xm[1] |
|
||||
// | z[2] | | ym[2] ym[1] ym[0] ym[3] | | xm[2] |
|
||||
// | z[3] | | ym[3] ym[2] ym[1] ym[0] | | xm[3] |
|
||||
z[i] = (uint64(xm[0]) * uint64(ym[i])) ^ (uint64(xm[1]) * uint64(ym[(i+3)%4])) ^ (uint64(xm[2]) * uint64(ym[(i+2)%4])) ^ (uint64(xm[3]) * uint64(ym[(i+1)%4]))
|
||||
z[i] &= 0x1111111111111111 << i
|
||||
}
|
||||
|
||||
return z[0] | z[1] | z[2] | z[3]
|
||||
}
|
||||
|
||||
func ghash(out, H *[gcmBlockSize]byte, inputs ...[]byte) {
|
||||
// productTable contains the first sixteen powers of the key, H.
|
||||
// However, they are in bit reversed order.
|
||||
var productTable [16]gcmFieldElement
|
||||
// The GHASH algorithm computes the sum of the products of two 128-bit
|
||||
// integers Y and H (the input block and the key, respectively) in the field
|
||||
// GF(2^128), modulo the field polynomial.
|
||||
//
|
||||
// We use the Karatsuba algorithm to decompose the 128-bit multiplication
|
||||
// into three 64-bit multiplications, which we further decompose into 9
|
||||
// 32-bit multiplications with 64-bit products.
|
||||
|
||||
// We precompute 16 multiples of H. However, when we do lookups
|
||||
// into this table we'll be using bits from a field element and
|
||||
// therefore the bits will be in the reverse order. So normally one
|
||||
// would expect, say, 4*H to be in index 4 of the table but due to
|
||||
// this bit ordering it will actually be in index 0010 (base 2) = 2.
|
||||
x := gcmFieldElement{
|
||||
byteorder.BEUint64(H[:8]),
|
||||
byteorder.BEUint64(H[8:]),
|
||||
}
|
||||
productTable[reverseBits(1)] = x
|
||||
// Make sure out is zeroed before we use it.
|
||||
clear(out[:])
|
||||
|
||||
for i := 2; i < 16; i += 2 {
|
||||
productTable[reverseBits(i)] = ghashDouble(&productTable[reverseBits(i/2)])
|
||||
productTable[reverseBits(i+1)] = ghashAdd(&productTable[reverseBits(i)], &x)
|
||||
var y, h [4]uint32
|
||||
for i := range 4 {
|
||||
h[3-i] = byteorder.BEUint32(H[i*4 : (i*4)+4])
|
||||
}
|
||||
|
||||
var y gcmFieldElement
|
||||
for _, input := range inputs {
|
||||
ghashUpdate(&productTable, &y, input)
|
||||
}
|
||||
|
||||
byteorder.BEPutUint64(out[:], y.low)
|
||||
byteorder.BEPutUint64(out[8:], y.high)
|
||||
}
|
||||
|
||||
// reverseBits reverses the order of the bits of the 4-bit number i.
|
||||
func reverseBits(i int) int {
|
||||
i = ((i << 2) & 0xc) | ((i >> 2) & 0x3)
|
||||
i = ((i << 1) & 0xa) | ((i >> 1) & 0x5)
|
||||
return i
|
||||
}
|
||||
|
||||
// ghashAdd adds two elements of GF(2¹²⁸) and returns the sum.
|
||||
func ghashAdd(x, y *gcmFieldElement) gcmFieldElement {
|
||||
// Addition in a characteristic 2 field is just XOR.
|
||||
return gcmFieldElement{x.low ^ y.low, x.high ^ y.high}
|
||||
}
|
||||
|
||||
// ghashDouble returns the result of doubling an element of GF(2¹²⁸).
|
||||
func ghashDouble(x *gcmFieldElement) (double gcmFieldElement) {
|
||||
msbSet := x.high&1 == 1
|
||||
|
||||
// Because of the bit-ordering, doubling is actually a right shift.
|
||||
double.high = x.high >> 1
|
||||
double.high |= x.low << 63
|
||||
double.low = x.low >> 1
|
||||
|
||||
// If the most-significant bit was set before shifting then it,
|
||||
// conceptually, becomes a term of x^128. This is greater than the
|
||||
// irreducible polynomial so the result has to be reduced. The
|
||||
// irreducible polynomial is 1+x+x^2+x^7+x^128. We can subtract that to
|
||||
// eliminate the term at x^128 which also means subtracting the other
|
||||
// four terms. In characteristic 2 fields, subtraction == addition ==
|
||||
// XOR.
|
||||
if msbSet {
|
||||
double.low ^= 0xe100000000000000
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
var ghashReductionTable = []uint16{
|
||||
0x0000, 0x1c20, 0x3840, 0x2460, 0x7080, 0x6ca0, 0x48c0, 0x54e0,
|
||||
0xe100, 0xfd20, 0xd940, 0xc560, 0x9180, 0x8da0, 0xa9c0, 0xb5e0,
|
||||
}
|
||||
|
||||
// ghashMul sets y to y*H, where H is the GCM key, fixed during New.
|
||||
func ghashMul(productTable *[16]gcmFieldElement, y *gcmFieldElement) {
|
||||
var z gcmFieldElement
|
||||
|
||||
for i := 0; i < 2; i++ {
|
||||
word := y.high
|
||||
if i == 1 {
|
||||
word = y.low
|
||||
}
|
||||
|
||||
// Multiplication works by multiplying z by 16 and adding in
|
||||
// one of the precomputed multiples of H.
|
||||
for j := 0; j < 64; j += 4 {
|
||||
msw := z.high & 0xf
|
||||
z.high >>= 4
|
||||
z.high |= z.low << 60
|
||||
z.low >>= 4
|
||||
z.low ^= uint64(ghashReductionTable[msw]) << 48
|
||||
|
||||
// the values in |table| are ordered for little-endian bit
|
||||
// positions. See the comment in New.
|
||||
t := productTable[word&0xf]
|
||||
|
||||
z.low ^= t.low
|
||||
z.high ^= t.high
|
||||
word >>= 4
|
||||
blockIterator := func(yield func([]byte) bool) {
|
||||
for _, input := range inputs {
|
||||
for len(input) >= 16 {
|
||||
if !yield(input[:16]) {
|
||||
return
|
||||
}
|
||||
input = input[16:]
|
||||
}
|
||||
if len(input) > 0 {
|
||||
var partialBlock [gcmBlockSize]byte
|
||||
copy(partialBlock[:], input)
|
||||
if !yield(partialBlock[:]) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*y = z
|
||||
}
|
||||
// Compute the GHASH of the inputs by iterating over 16-byte blocks of the
|
||||
// inputs, XORing each block into the current state, and multiplying the
|
||||
// result by the key.
|
||||
for block := range blockIterator {
|
||||
for i := range 4 {
|
||||
y[3-i] ^= byteorder.BEUint32(block[i*4 : (i*4)+4])
|
||||
}
|
||||
|
||||
// updateBlocks extends y with more polynomial terms from blocks, based on
|
||||
// Horner's rule. There must be a multiple of gcmBlockSize bytes in blocks.
|
||||
func updateBlocks(productTable *[16]gcmFieldElement, y *gcmFieldElement, blocks []byte) {
|
||||
for len(blocks) > 0 {
|
||||
y.low ^= byteorder.BEUint64(blocks)
|
||||
y.high ^= byteorder.BEUint64(blocks[8:])
|
||||
ghashMul(productTable, y)
|
||||
blocks = blocks[gcmBlockSize:]
|
||||
}
|
||||
}
|
||||
|
||||
// ghashUpdate extends y with more polynomial terms from data. If data is not a
|
||||
// multiple of gcmBlockSize bytes long then the remainder is zero padded.
|
||||
func ghashUpdate(productTable *[16]gcmFieldElement, y *gcmFieldElement, data []byte) {
|
||||
fullBlocks := (len(data) >> 4) << 4
|
||||
updateBlocks(productTable, y, data[:fullBlocks])
|
||||
|
||||
if len(data) != fullBlocks {
|
||||
var partialBlock [gcmBlockSize]byte
|
||||
copy(partialBlock[:], data[fullBlocks:])
|
||||
updateBlocks(productTable, y, partialBlock[:])
|
||||
// Split y*h into nine products:
|
||||
//
|
||||
// zLo = y0*h0, y2*h2, (y0^y2) * (h0^h2)
|
||||
// zHi = y1*h1, y3*h3, (y1^y3) * (h1^h3)
|
||||
// zSum = (y0^y1) * (h0^h1), (y2^y3) * (h2^h3), ((y0^y2) ^ (y1^y3)) * ((h0^h2) ^ (h1^h3))
|
||||
var zLo, zHi, zSum [3]uint64
|
||||
|
||||
zLo[0] = ghashMul(y[0], h[0])
|
||||
zHi[0] = ghashMul(y[1], h[1])
|
||||
zSum[0] = ghashMul(y[0]^y[1], h[0]^h[1])
|
||||
|
||||
zLo[1] = ghashMul(y[2], h[2])
|
||||
zHi[1] = ghashMul(y[3], h[3])
|
||||
zSum[1] = ghashMul(y[2]^y[3], h[2]^h[3])
|
||||
|
||||
zLo[2] = ghashMul(y[0]^y[2], h[0]^h[2])
|
||||
zHi[2] = ghashMul(y[1]^y[3], h[1]^h[3])
|
||||
zSum[2] = ghashMul((y[0]^y[2])^(y[1]^y[3]), (h[0]^h[2])^(h[1]^h[3]))
|
||||
|
||||
// Reconstruct the three 128-bit Karatsuba terms (result[i]) from their constituent 64-bit products zLo[i], zHi[i], and zSum[i].
|
||||
var result [3][2]uint64
|
||||
for i := range 3 {
|
||||
mid := zSum[i] ^ zLo[i] ^ zHi[i]
|
||||
// Add the lower 32 bits of the middle term to the low term
|
||||
result[i][0] = zLo[i] ^ (mid << 32)
|
||||
// Add the upper 32 bits of the middle term to the high term
|
||||
result[i][1] = zHi[i] ^ (mid >> 32)
|
||||
}
|
||||
|
||||
// Compute the middle term by adding the high and low terms to the sum term
|
||||
result[2][0] ^= result[0][0] ^ result[1][0]
|
||||
result[2][1] ^= result[0][1] ^ result[1][1]
|
||||
|
||||
// Add the lower bits of the middle term to the higher bits of the low term
|
||||
result[0][1] ^= result[2][0]
|
||||
// Add the higher bits of the middle term to the lower bits of the high term
|
||||
result[1][0] ^= result[2][1]
|
||||
|
||||
// Reconstruct the 256-bit product from the low and high terms, shifted
|
||||
// by one bit to satisfy the GHASH construction.
|
||||
var z [4]uint64
|
||||
z[0] = result[0][0] << 1
|
||||
z[1] = (result[0][1] << 1) | (result[0][0] >> 63)
|
||||
z[2] = (result[1][0] << 1) | (result[0][1] >> 63)
|
||||
z[3] = (result[1][1] << 1) | (result[1][0] >> 63)
|
||||
|
||||
// Reduce the 256-bit product modulo the field polynomial. z0 and z1 contain
|
||||
// the high-degree terms (255 to 128), and z2 and z3 contain the low-degree terms (127 to 0).
|
||||
for i := range 2 {
|
||||
lw := z[i]
|
||||
// Add the remainders of the high-degree terms to the low-degree terms
|
||||
z[i+2] ^= lw ^ (lw >> 1) ^ (lw >> 2) ^ (lw >> 7)
|
||||
// Add the carries from the reduction
|
||||
z[i+1] ^= (lw << 63) ^ (lw << 62) ^ (lw << 57)
|
||||
}
|
||||
|
||||
// Write the reduced 128-bit product back into y
|
||||
y[0], y[1], y[2], y[3] = uint32(z[2]), uint32(z[2]>>32), uint32(z[3]), uint32(z[3]>>32)
|
||||
}
|
||||
|
||||
for i := range 4 {
|
||||
byteorder.BEPutUint32(out[i*4:(i*4)+4], y[3-i])
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue