crypto/internal/fips140/aes/gcm: constant-time GHASH

Replace our variable-time GHASH implementation with a constant-time one.
This implementation is slower on platforms which lack native AES
instructions (the only places we use the generic implementation), but
that is a reasonable trade-off to remove one of the remaining
variable-time pieces of the generic AES implementation.

This implementation is based on the technique described in the BearSSL
constant-time notes[0], using Karatsuba multiplication to decompose the
128-bit multiplication into smaller pieces.

[0] https://www.bearssl.org/constanttime.html#ghash-for-gcm

goos: darwin
goarch: arm64
pkg: crypto/internal/fips140/aes/gcm
cpu: Apple M1 Pro
              │ ghash-vt.bench │            ghash-ct.bench            │
              │     sec/op     │    sec/op     vs base                │
GHASH/16-10        72.41n ± 2%   129.85n ± 1%  +79.31% (p=0.000 n=10)
GHASH/32-10        130.5n ± 3%    243.3n ± 0%  +86.44% (p=0.000 n=10)
GHASH/64-10        248.7n ± 1%    474.2n ± 1%  +90.69% (p=0.000 n=10)
GHASH/128-10       487.0n ± 1%    953.3n ± 2%  +95.75% (p=0.000 n=10)
GHASH/256-10       952.7n ± 2%   1893.0n ± 3%  +98.71% (p=0.000 n=10)
GHASH/512-10       1.893µ ± 1%    3.775µ ± 2%  +99.47% (p=0.000 n=10)
GHASH/1024-10      3.777µ ± 2%    7.472µ ± 2%  +97.84% (p=0.000 n=10)
geomean            499.8n         962.0n       +92.47%

                        │ aes-vt-ghash.bench │          aes-ct-ghash.bench          │
                        │       sec/op       │    sec/op     vs base                │
AESGCM/Open-128-64-10           688.6n ±  2%   1007.5n ± 4%  +46.31% (p=0.000 n=10)
AESGCM/Seal-128-64-10           672.8n ±  2%    998.0n ± 1%  +48.34% (p=0.000 n=10)
AESGCM/Open-256-64-10           788.2n ±  0%   1110.0n ± 0%  +40.83% (p=0.000 n=10)
AESGCM/Seal-256-64-10           789.2n ±  4%   1104.0n ± 0%  +39.89% (p=0.000 n=10)
AESGCM/Open-128-1350-10         9.535µ ±  2%   14.091µ ± 0%  +47.78% (p=0.000 n=10)
AESGCM/Seal-128-1350-10         9.512µ ±  1%   14.065µ ± 0%  +47.87% (p=0.000 n=10)
AESGCM/Open-256-1350-10         11.08µ ±  0%    15.67µ ± 3%  +41.45% (p=0.000 n=10)
AESGCM/Seal-256-1350-10         11.20µ ±  2%    15.96µ ± 1%  +42.46% (p=0.000 n=10)
AESGCM/Open-128-8192-10         55.66µ ± 23%    84.01µ ± 1%  +50.94% (p=0.002 n=10)
AESGCM/Seal-128-8192-10         55.44µ ±  1%    83.85µ ± 5%  +51.23% (p=0.000 n=10)
AESGCM/Open-256-8192-10         64.88µ ±  2%    93.23µ ± 0%  +43.70% (p=0.000 n=10)
AESGCM/Seal-256-8192-10         64.47µ ±  1%    93.15µ ± 0%  +44.48% (p=0.000 n=10)
geomean                         7.676µ          11.16µ       +45.39%

Updates #69025

Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64-longtest,x_crypto-gotip-linux-amd64-longtest
Change-Id: I29f916ce30bfdb5c83885369e1cb6aff5ea5d4fe
Reviewed-on: https://go-review.googlesource.com/c/go/+/746120
LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Daniel McCarney <daniel@binaryparadox.net>
Reviewed-by: Neal Patel <nealpatel@google.com>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Auto-Submit: Roland Shoemaker <roland@golang.org>
This commit is contained in:
Roland Shoemaker 2026-02-16 21:00:30 -08:00 committed by Gopher Robot
parent c1f0b9bdba
commit 71c7ea1c6c
2 changed files with 147 additions and 135 deletions

View file

@ -23,10 +23,6 @@ const (
// NewGCM returns the given 128-bit, block cipher wrapped in Galois Counter Mode
// with the standard nonce length.
//
// In general, the GHASH operation performed by this implementation of GCM is not constant-time.
// An exception is when the underlying [Block] was created by aes.NewCipher
// on systems with hardware support for AES. See the [crypto/aes] package documentation for details.
func NewGCM(cipher Block) (AEAD, error) {
if fips140only.Enforced() {
return nil, errors.New("crypto/cipher: use of GCM with arbitrary IVs is not allowed in FIPS 140-only mode, use NewGCMWithRandomNonce")

View file

@ -9,18 +9,6 @@ import (
"crypto/internal/fips140deps/byteorder"
)
// gcmFieldElement is an element of GF(2¹²⁸) held as two 64-bit words. The bits
// are kept in big endian order so that the layout mirrors the GCM standard and
// binary.BigEndian can be used to marshal the value. Concretely:
//
//	v.low >> 63  is the coefficient of x⁰
//	v.low & 1    is the coefficient of x⁶³
//	v.high >> 63 is the coefficient of x⁶⁴
//	v.high & 1   is the coefficient of x¹²⁷
type gcmFieldElement struct {
	low  uint64 // coefficients of x⁰ (top bit) through x⁶³ (bottom bit)
	high uint64 // coefficients of x⁶⁴ (top bit) through x¹²⁷ (bottom bit)
}
// GHASH is exposed to allow crypto/cipher to implement non-AES GCM modes.
// It is not allowed as a stand-alone operation in FIPS mode because it
// is not ACVP tested.
@ -31,133 +19,161 @@ func GHASH(key *[16]byte, inputs ...[]byte) []byte {
return out[:]
}
// ghash is a variable-time generic implementation of GHASH, which shouldn't
// be used on any architecture with hardware support for AES-GCM.
//
// Each input is zero-padded to 128-bit before being absorbed.
// ghashMul returns the 64-bit carry-less product of the 32-bit values x and y.
// It is built only from integer multiplications, masks and XORs, with no
// data-dependent branches or table lookups.
func ghashMul(x, y uint32) uint64 {
	// The approach was first described by Thomas Pornin in the BearSSL
	// documentation [0]. Ordinary integer multiplication is used, but the
	// carries are neutralised: each operand is masked down to 8 bits spaced
	// four positions apart, which leaves three "hole" bits after every live
	// bit. Any carry produced while summing partial products lands in a hole
	// and never reaches the next live bit.
	//
	// Each 32-bit operand is therefore split into four lanes. Lane k keeps
	// the bits at positions k, k+4, k+8, ..., k+28, so the four lanes
	// together cover the whole operand.
	//
	// Output bits at positions k, k+4, ..., k+60 receive contributions from
	// every pair of lanes (i, j) with i+j = k mod 4. XORing those four
	// products and masking with the 64-bit lane-k mask discards the spilled
	// carries; ORing the four masked lanes yields the full product.
	//
	// [0] https://www.bearssl.org/constanttime.html#ghash-for-gcm
	const (
		lane32 uint32 = 0x11111111
		lane64 uint64 = 0x1111111111111111
	)

	// Split both operands into their four lanes, widened for 64-bit products.
	x0 := uint64(x & lane32)
	x1 := uint64(x & (lane32 << 1))
	x2 := uint64(x & (lane32 << 2))
	x3 := uint64(x & (lane32 << 3))

	y0 := uint64(y & lane32)
	y1 := uint64(y & (lane32 << 1))
	y2 := uint64(y & (lane32 << 2))
	y3 := uint64(y & (lane32 << 3))

	// Multiply the x lanes by the circulant matrix of the y lanes, using XOR
	// as the carry-less addition, then keep only the live bits of each lane:
	//
	//	| z0 |   | y0 y3 y2 y1 |   | x0 |
	//	| z1 | = | y1 y0 y3 y2 | x | x1 |
	//	| z2 |   | y2 y1 y0 y3 |   | x2 |
	//	| z3 |   | y3 y2 y1 y0 |   | x3 |
	z0 := (x0*y0 ^ x1*y3 ^ x2*y2 ^ x3*y1) & lane64
	z1 := (x0*y1 ^ x1*y0 ^ x2*y3 ^ x3*y2) & (lane64 << 1)
	z2 := (x0*y2 ^ x1*y1 ^ x2*y0 ^ x3*y3) & (lane64 << 2)
	z3 := (x0*y3 ^ x1*y2 ^ x2*y1 ^ x3*y0) & (lane64 << 3)

	return z0 | z1 | z2 | z3
}
func ghash(out, H *[gcmBlockSize]byte, inputs ...[]byte) {
// productTable contains the first sixteen powers of the key, H.
// However, they are in bit reversed order.
var productTable [16]gcmFieldElement
// The GHASH algorithm computes the sum of the products of two 128-bit
// integers Y and H (the input block and the key, respectively) in the field
// GF(2^128), modulo the field polynomial.
//
// We use the Karatsuba algorithm to decompose the 128-bit multiplication
// into three 64-bit multiplications, which we further decompose into 9
// 32-bit multiplications with 64-bit products.
// We precompute 16 multiples of H. However, when we do lookups
// into this table we'll be using bits from a field element and
// therefore the bits will be in the reverse order. So normally one
// would expect, say, 4*H to be in index 4 of the table but due to
// this bit ordering it will actually be in index 0010 (base 2) = 2.
x := gcmFieldElement{
byteorder.BEUint64(H[:8]),
byteorder.BEUint64(H[8:]),
}
productTable[reverseBits(1)] = x
// Make sure out is zeroed before we use it.
clear(out[:])
for i := 2; i < 16; i += 2 {
productTable[reverseBits(i)] = ghashDouble(&productTable[reverseBits(i/2)])
productTable[reverseBits(i+1)] = ghashAdd(&productTable[reverseBits(i)], &x)
var y, h [4]uint32
for i := range 4 {
h[3-i] = byteorder.BEUint32(H[i*4 : (i*4)+4])
}
var y gcmFieldElement
for _, input := range inputs {
ghashUpdate(&productTable, &y, input)
}
byteorder.BEPutUint64(out[:], y.low)
byteorder.BEPutUint64(out[8:], y.high)
}
// reverseBits returns the low four bits of i in reversed order. Bits above
// the low four are ignored.
func reverseBits(i int) int {
	// Exchange the two 2-bit halves: b3 b2 b1 b0 -> b1 b0 b3 b2.
	halves := (i&0x3)<<2 | (i>>2)&0x3
	// Exchange neighbours inside each half: b1 b0 b3 b2 -> b0 b1 b2 b3.
	return (halves&0x5)<<1 | (halves>>1)&0x5
}
// ghashAdd returns x + y, the sum of two elements of GF(2¹²⁸). Neither
// argument is modified.
func ghashAdd(x, y *gcmFieldElement) gcmFieldElement {
	// In a field of characteristic 2, addition is a word-wise XOR.
	var sum gcmFieldElement
	sum.low = x.low ^ y.low
	sum.high = x.high ^ y.high
	return sum
}
// ghashDouble returns 2*x for an element x of GF(2¹²⁸). x itself is left
// untouched.
func ghashDouble(x *gcmFieldElement) (double gcmFieldElement) {
	// The lowest bit of high is the coefficient of x¹²⁷, i.e. the
	// most-significant term; remember it before it is shifted out.
	overflow := x.high & 1

	// With this bit ordering, doubling is a one-bit right shift across the
	// 128-bit value, with the bottom bit of low moving into the top of high.
	double.low = x.low >> 1
	double.high = x.high>>1 | x.low<<63

	// A set most-significant bit conceptually becomes an x^128 term, which
	// must be reduced. The irreducible polynomial is 1+x+x^2+x^7+x^128, so
	// cancelling x^128 means subtracting the remaining four terms, and in a
	// characteristic 2 field subtraction, like addition, is XOR.
	if overflow != 0 {
		double.low ^= 0xe100000000000000
	}
	return double
}
// ghashReductionTable is a 16-entry lookup table used by the table-driven
// ghashMul. It is indexed by the four bits that are about to be shifted out
// of z.high (z.high & 0xf), and the selected entry is XORed into the top 16
// bits of z.low (the entry is shifted left by 48).
//
// NOTE(review): presumably entry i is the reduction of those four shifted-out
// bits modulo the GHASH field polynomial — confirm before relying on it.
var ghashReductionTable = []uint16{
0x0000, 0x1c20, 0x3840, 0x2460, 0x7080, 0x6ca0, 0x48c0, 0x54e0,
0xe100, 0xfd20, 0xd940, 0xc560, 0x9180, 0x8da0, 0xa9c0, 0xb5e0,
}
// ghashMul sets y to y*H, where H is the GCM key, fixed during New.
func ghashMul(productTable *[16]gcmFieldElement, y *gcmFieldElement) {
var z gcmFieldElement
for i := 0; i < 2; i++ {
word := y.high
if i == 1 {
word = y.low
}
// Multiplication works by multiplying z by 16 and adding in
// one of the precomputed multiples of H.
for j := 0; j < 64; j += 4 {
msw := z.high & 0xf
z.high >>= 4
z.high |= z.low << 60
z.low >>= 4
z.low ^= uint64(ghashReductionTable[msw]) << 48
// the values in |table| are ordered for little-endian bit
// positions. See the comment in New.
t := productTable[word&0xf]
z.low ^= t.low
z.high ^= t.high
word >>= 4
blockIterator := func(yield func([]byte) bool) {
for _, input := range inputs {
for len(input) >= 16 {
if !yield(input[:16]) {
return
}
input = input[16:]
}
if len(input) > 0 {
var partialBlock [gcmBlockSize]byte
copy(partialBlock[:], input)
if !yield(partialBlock[:]) {
return
}
}
}
}
*y = z
}
// Compute the GHASH of the inputs by iterating over 16-byte blocks of the
// inputs, XORing each block into the current state, and multiplying the
// result by the key.
for block := range blockIterator {
for i := range 4 {
y[3-i] ^= byteorder.BEUint32(block[i*4 : (i*4)+4])
}
// updateBlocks absorbs blocks into y following Horner's rule: each
// gcmBlockSize-byte block is XORed into y and y is then passed through
// ghashMul with productTable. len(blocks) must be a multiple of gcmBlockSize.
func updateBlocks(productTable *[16]gcmFieldElement, y *gcmFieldElement, blocks []byte) {
	for ; len(blocks) > 0; blocks = blocks[gcmBlockSize:] {
		// The first eight bytes go into low and the next eight into high,
		// both read as big endian.
		y.low ^= byteorder.BEUint64(blocks)
		y.high ^= byteorder.BEUint64(blocks[8:])
		ghashMul(productTable, y)
	}
}
// ghashUpdate extends y with more polynomial terms from data. If data is not a
// multiple of gcmBlockSize bytes long then the remainder is zero padded.
func ghashUpdate(productTable *[16]gcmFieldElement, y *gcmFieldElement, data []byte) {
fullBlocks := (len(data) >> 4) << 4
updateBlocks(productTable, y, data[:fullBlocks])
if len(data) != fullBlocks {
var partialBlock [gcmBlockSize]byte
copy(partialBlock[:], data[fullBlocks:])
updateBlocks(productTable, y, partialBlock[:])
// Split y*h into nine products:
//
// zLo = y0*h0, y2*h2, (y0^y2) * (h0^h2)
// zHi = y1*h1, y3*h3, (y1^y3) * (h1^h3)
// zSum = (y0^y1) * (h0^h1), (y2^y3) * (h2^h3), ((y0^y2) ^ (y1^y3)) * ((h0^h2) ^ (h1^h3))
var zLo, zHi, zSum [3]uint64
zLo[0] = ghashMul(y[0], h[0])
zHi[0] = ghashMul(y[1], h[1])
zSum[0] = ghashMul(y[0]^y[1], h[0]^h[1])
zLo[1] = ghashMul(y[2], h[2])
zHi[1] = ghashMul(y[3], h[3])
zSum[1] = ghashMul(y[2]^y[3], h[2]^h[3])
zLo[2] = ghashMul(y[0]^y[2], h[0]^h[2])
zHi[2] = ghashMul(y[1]^y[3], h[1]^h[3])
zSum[2] = ghashMul((y[0]^y[2])^(y[1]^y[3]), (h[0]^h[2])^(h[1]^h[3]))
// Reconstruct the 128-bit terms zLo, zHi, and zSum from their constituent 64-bit products
var result [3][2]uint64
for i := range 3 {
mid := zSum[i] ^ zLo[i] ^ zHi[i]
// Add the lower 32 bits of the middle term to the low term
result[i][0] = zLo[i] ^ (mid << 32)
// Add the upper 32 bits of the middle term to the high term
result[i][1] = zHi[i] ^ (mid >> 32)
}
// Compute the middle term by adding the high and low terms to the sum term
result[2][0] ^= result[0][0] ^ result[1][0]
result[2][1] ^= result[0][1] ^ result[1][1]
// Add the lower bits of the middle term to the higher bits of the low term
result[0][1] ^= result[2][0]
// Add the higher bits of the middle term to the lower bits of the high term
result[1][0] ^= result[2][1]
// Reconstruct the 256-bit product from the low and high terms, shifted
// by one bit to satisfy the GHASH construction.
var z [4]uint64
z[0] = result[0][0] << 1
z[1] = (result[0][1] << 1) | (result[0][0] >> 63)
z[2] = (result[1][0] << 1) | (result[0][1] >> 63)
z[3] = (result[1][1] << 1) | (result[1][0] >> 63)
// Reduce the 256-bit product modulo the field polynomial. z0 and z1 contain
// the high-degree terms (255 to 128), and z2 and z3 contain the low-degree terms (127 to 0).
for i := range 2 {
lw := z[i]
// Add the remainders of the high-degree terms to the low-degree terms
z[i+2] ^= lw ^ (lw >> 1) ^ (lw >> 2) ^ (lw >> 7)
// Add the carries from the reduction
z[i+1] ^= (lw << 63) ^ (lw << 62) ^ (lw << 57)
}
// Write the reduced 128-bit product back into y
y[0], y[1], y[2], y[3] = uint32(z[2]), uint32(z[2]>>32), uint32(z[3]), uint32(z[3]>>32)
}
for i := range 4 {
byteorder.BEPutUint32(out[i*4:(i*4)+4], y[3-i])
}
}