crypto/internal/fips140/aes/gcm: constant-time GHASH

Replace our variable-time GHASH implementation with a constant-time one.
This implementation is slower on platforms which lack native AES
instructions (the only places we use the generic implementation), but
that is a reasonable trade-off to remove one of the remaining
variable-time pieces of the generic AES implementation.

The technique used in this implementation is based on the technique
described in the BearSSL constant-time notes[0] with Karatsuba
multiplication to decompose the 128-bit multiplication into smaller
pieces.

[0] https://www.bearssl.org/constanttime.html#ghash-for-gcm

goos: darwin
goarch: arm64
pkg: crypto/internal/fips140/aes/gcm
cpu: Apple M1 Pro
               │ ghash-vt.bench │ ghash-ct.bench                        │
               │ sec/op         │ sec/op         vs base                │
GHASH/16-10      72.41n ± 2%      129.85n ± 1%   +79.31% (p=0.000 n=10)
GHASH/32-10      130.5n ± 3%      243.3n ± 0%    +86.44% (p=0.000 n=10)
GHASH/64-10      248.7n ± 1%      474.2n ± 1%    +90.69% (p=0.000 n=10)
GHASH/128-10     487.0n ± 1%      953.3n ± 2%    +95.75% (p=0.000 n=10)
GHASH/256-10     952.7n ± 2%      1893.0n ± 3%   +98.71% (p=0.000 n=10)
GHASH/512-10     1.893µ ± 1%      3.775µ ± 2%    +99.47% (p=0.000 n=10)
GHASH/1024-10    3.777µ ± 2%      7.472µ ± 2%    +97.84% (p=0.000 n=10)
geomean          499.8n           962.0n         +92.47%

                          │ aes-vt-ghash.bench │ aes-ct-ghash.bench                    │
                          │ sec/op             │ sec/op         vs base                │
AESGCM/Open-128-64-10       688.6n ± 2%          1007.5n ± 4%   +46.31% (p=0.000 n=10)
AESGCM/Seal-128-64-10       672.8n ± 2%          998.0n ± 1%    +48.34% (p=0.000 n=10)
AESGCM/Open-256-64-10       788.2n ± 0%          1110.0n ± 0%   +40.83% (p=0.000 n=10)
AESGCM/Seal-256-64-10       789.2n ± 4%          1104.0n ± 0%   +39.89% (p=0.000 n=10)
AESGCM/Open-128-1350-10     9.535µ ± 2%          14.091µ ± 0%   +47.78% (p=0.000 n=10)
AESGCM/Seal-128-1350-10     9.512µ ± 1%          14.065µ ± 0%   +47.87% (p=0.000 n=10)
AESGCM/Open-256-1350-10     11.08µ ± 0%          15.67µ ± 3%    +41.45% (p=0.000 n=10)
AESGCM/Seal-256-1350-10     11.20µ ± 2%          15.96µ ± 1%    +42.46% (p=0.000 n=10)
AESGCM/Open-128-8192-10     55.66µ ± 23%         84.01µ ± 1%    +50.94% (p=0.002 n=10)
AESGCM/Seal-128-8192-10     55.44µ ± 1%          83.85µ ± 5%    +51.23% (p=0.000 n=10)
AESGCM/Open-256-8192-10     64.88µ ± 2%          93.23µ ± 0%    +43.70% (p=0.000 n=10)
AESGCM/Seal-256-8192-10     64.47µ ± 1%          93.15µ ± 0%    +44.48% (p=0.000 n=10)
geomean                     7.676µ               11.16µ         +45.39%

Updates #69025

Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64-longtest,x_crypto-gotip-linux-amd64-longtest
Change-Id: I29f916ce30bfdb5c83885369e1cb6aff5ea5d4fe
Reviewed-on: https://go-review.googlesource.com/c/go/+/746120
LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Daniel McCarney <daniel@binaryparadox.net>
Reviewed-by: Neal Patel <nealpatel@google.com>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Auto-Submit: Roland Shoemaker <roland@golang.org>
2026-06-27 19:30:52 +00:00 · 2026-02-16 21:00:30 -08:00 · 2026-02-16 21:00:30 -08:00 · 71c7ea1c6c
commit 71c7ea1c6c
parent c1f0b9bdba
2 changed files with 147 additions and 135 deletions
--- a/src/crypto/cipher/gcm.go
+++ b/src/crypto/cipher/gcm.go
@ -23,10 +23,6 @@ const (

 // NewGCM returns the given 128-bit, block cipher wrapped in Galois Counter Mode
 // with the standard nonce length.
-//
-// In general, the GHASH operation performed by this implementation of GCM is not constant-time.
-// An exception is when the underlying [Block] was created by aes.NewCipher
-// on systems with hardware support for AES. See the [crypto/aes] package documentation for details.
 func NewGCM(cipher Block) (AEAD, error) {
 	if fips140only.Enforced() {
 		return nil, errors.New("crypto/cipher: use of GCM with arbitrary IVs is not allowed in FIPS 140-only mode, use NewGCMWithRandomNonce")
--- a/src/crypto/internal/fips140/aes/gcm/ghash.go
+++ b/src/crypto/internal/fips140/aes/gcm/ghash.go
@ -9,18 +9,6 @@ import (
 	"crypto/internal/fips140deps/byteorder"
 )

-// gcmFieldElement represents a value in GF(2¹²⁸). In order to reflect the GCM
-// standard and make binary.BigEndian suitable for marshaling these values, the
-// bits are stored in big endian order. For example:
-//
-//	the coefficient of x⁰ can be obtained by v.low >> 63.
-//	the coefficient of x⁶³ can be obtained by v.low & 1.
-//	the coefficient of x⁶⁴ can be obtained by v.high >> 63.
-//	the coefficient of x¹²⁷ can be obtained by v.high & 1.
-type gcmFieldElement struct {
-	low, high uint64
-}
-
 // GHASH is exposed to allow crypto/cipher to implement non-AES GCM modes.
 // It is not allowed as a stand-alone operation in FIPS mode because it
 // is not ACVP tested.
@ -31,133 +19,161 @@ func GHASH(key *[16]byte, inputs ...[]byte) []byte {
 	return out[:]
 }

-// ghash is a variable-time generic implementation of GHASH, which shouldn't
-// be used on any architecture with hardware support for AES-GCM.
-//
-// Each input is zero-padded to 128-bit before being absorbed.
+// ghashMul does constant-time carry-less multiplication of two 32-bit integers,
+// returning the 64-bit product.
+func ghashMul(x, y uint32) uint64 {
+	// This function implements carryless multiplication using a technique first
+	// described by Thomas Pornin in the BearSSL documentation [0]. This
+	// technique uses generic integer multiplication, but ignores the carries by
+	// masking all but 8 bits of the inputs, creating three bit holes between
+	// each unmasked bit. If the multiplications of any of the unmasked bits
+	// then cause a carry, the resulting carry bit spills into one of the three
+	// bit holes.
+	//
+	// Each 32-bit input is split into four 32-bit masked values, each
+	// containing 8 unmasked bits. The mask is shifted by one bit for each of
+	// the four values, such that the four values cover the full 32 bits of the
+	// input.
+	//
+	// In order to compute the bits at position z_k, z_k+4, z_k+8, ..., z_k+60
+	// for k = 0, 1, 2, 3, we compute the sum of the products x_i*y_j for all i,
+	// j such that i+j = k mod 4.
+	//
+	// We then mask the sum of each of the four products with the same mask used
+	// for the input values, which zeros out any spilled carry bits, and OR the
+	// masked values to get the final product.
+	//
+	// [0] https://www.bearssl.org/constanttime.html#ghash-for-gcm
+
+	var xm, ym [4]uint32
+	var z [4]uint64
+
+	for i := range 4 {
+		// Mask off the three bit holes in each input, creating four masked
+		// values for each input.
+		xm[i] = x & (0x11111111 << i)
+		ym[i] = y & (0x11111111 << i)
+	}
+
+	for i := range 4 {
+		// Compute the multiplication of x by the circulant matrix of y, using
+		// XOR to get carryless addition of the products:
+		//
+		//  | z[0] |   | ym[0] ym[3] ym[2] ym[1] |   | xm[0] |
+		//  | z[1] | = | ym[1] ym[0] ym[3] ym[2] | x | xm[1] |
+		//  | z[2] |   | ym[2] ym[1] ym[0] ym[3] |   | xm[2] |
+		//  | z[3] |   | ym[3] ym[2] ym[1] ym[0] |   | xm[3] |
+		z[i] = (uint64(xm[0]) * uint64(ym[i])) ^ (uint64(xm[1]) * uint64(ym[(i+3)%4])) ^ (uint64(xm[2]) * uint64(ym[(i+2)%4])) ^ (uint64(xm[3]) * uint64(ym[(i+1)%4]))
+		z[i] &= 0x1111111111111111 << i
+	}
+
+	return z[0] | z[1] | z[2] | z[3]
+}
+
 func ghash(out, H *[gcmBlockSize]byte, inputs ...[]byte) {
-	// productTable contains the first sixteen powers of the key, H.
-	// However, they are in bit reversed order.
-	var productTable [16]gcmFieldElement
+	// The GHASH algorithm computes the sum of the products of two 128-bit
+	// integers Y and H (the input block and the key, respectively) in the field
+	// GF(2^128), modulo the field polynomial.
+	//
+	// We use the Karatsuba algorithm to decompose the 128-bit multiplication
+	// into three 64-bit multiplications, which we further decompose into 9
+	// 32-bit multiplications with 64-bit products.

-	// We precompute 16 multiples of H. However, when we do lookups
-	// into this table we'll be using bits from a field element and
-	// therefore the bits will be in the reverse order. So normally one
-	// would expect, say, 4*H to be in index 4 of the table but due to
-	// this bit ordering it will actually be in index 0010 (base 2) = 2.
-	x := gcmFieldElement{
-		byteorder.BEUint64(H[:8]),
-		byteorder.BEUint64(H[8:]),
-	}
-	productTable[reverseBits(1)] = x
+	// Make sure out is zeroed before we use it.
+	clear(out[:])

-	for i := 2; i < 16; i += 2 {
-		productTable[reverseBits(i)] = ghashDouble(&productTable[reverseBits(i/2)])
-		productTable[reverseBits(i+1)] = ghashAdd(&productTable[reverseBits(i)], &x)
+	var y, h [4]uint32
+	for i := range 4 {
+		h[3-i] = byteorder.BEUint32(H[i*4 : (i*4)+4])
 	}

-	var y gcmFieldElement
-	for _, input := range inputs {
-		ghashUpdate(&productTable, &y, input)
-	}
-
-	byteorder.BEPutUint64(out[:], y.low)
-	byteorder.BEPutUint64(out[8:], y.high)
-}
-
-// reverseBits reverses the order of the bits of 4-bit number in i.
-func reverseBits(i int) int {
-	i = ((i << 2) & 0xc) | ((i >> 2) & 0x3)
-	i = ((i << 1) & 0xa) | ((i >> 1) & 0x5)
-	return i
-}
-
-// ghashAdd adds two elements of GF(2¹²⁸) and returns the sum.
-func ghashAdd(x, y *gcmFieldElement) gcmFieldElement {
-	// Addition in a characteristic 2 field is just XOR.
-	return gcmFieldElement{x.low ^ y.low, x.high ^ y.high}
-}
-
-// ghashDouble returns the result of doubling an element of GF(2¹²⁸).
-func ghashDouble(x *gcmFieldElement) (double gcmFieldElement) {
-	msbSet := x.high&1 == 1
-
-	// Because of the bit-ordering, doubling is actually a right shift.
-	double.high = x.high >> 1
-	double.high |= x.low << 63
-	double.low = x.low >> 1
-
-	// If the most-significant bit was set before shifting then it,
-	// conceptually, becomes a term of x^128. This is greater than the
-	// irreducible polynomial so the result has to be reduced. The
-	// irreducible polynomial is 1+x+x^2+x^7+x^128. We can subtract that to
-	// eliminate the term at x^128 which also means subtracting the other
-	// four terms. In characteristic 2 fields, subtraction == addition ==
-	// XOR.
-	if msbSet {
-		double.low ^= 0xe100000000000000
-	}
-
-	return
-}
-
-var ghashReductionTable = []uint16{
-	0x0000, 0x1c20, 0x3840, 0x2460, 0x7080, 0x6ca0, 0x48c0, 0x54e0,
-	0xe100, 0xfd20, 0xd940, 0xc560, 0x9180, 0x8da0, 0xa9c0, 0xb5e0,
-}
-
-// ghashMul sets y to y*H, where H is the GCM key, fixed during New.
-func ghashMul(productTable *[16]gcmFieldElement, y *gcmFieldElement) {
-	var z gcmFieldElement
-
-	for i := 0; i < 2; i++ {
-		word := y.high
-		if i == 1 {
-			word = y.low
-		}
-
-		// Multiplication works by multiplying z by 16 and adding in
-		// one of the precomputed multiples of H.
-		for j := 0; j < 64; j += 4 {
-			msw := z.high & 0xf
-			z.high >>= 4
-			z.high |= z.low << 60
-			z.low >>= 4
-			z.low ^= uint64(ghashReductionTable[msw]) << 48
-
-			// the values in |table| are ordered for little-endian bit
-			// positions. See the comment in New.
-			t := productTable[word&0xf]
-
-			z.low ^= t.low
-			z.high ^= t.high
-			word >>= 4
+	blockIterator := func(yield func([]byte) bool) {
+		for _, input := range inputs {
+			for len(input) >= 16 {
+				if !yield(input[:16]) {
+					return
+				}
+				input = input[16:]
+			}
+			if len(input) > 0 {
+				var partialBlock [gcmBlockSize]byte
+				copy(partialBlock[:], input)
+				if !yield(partialBlock[:]) {
+					return
+				}
+			}
 		}
 	}

-	*y = z
-}
+	// Compute the GHASH of the inputs by iterating over 16-byte blocks of the
+	// inputs, XORing each block into the current state, and multiplying the
+	// result by the key.
+	for block := range blockIterator {
+		for i := range 4 {
+			y[3-i] ^= byteorder.BEUint32(block[i*4 : (i*4)+4])
+		}

-// updateBlocks extends y with more polynomial terms from blocks, based on
-// Horner's rule. There must be a multiple of gcmBlockSize bytes in blocks.
-func updateBlocks(productTable *[16]gcmFieldElement, y *gcmFieldElement, blocks []byte) {
-	for len(blocks) > 0 {
-		y.low ^= byteorder.BEUint64(blocks)
-		y.high ^= byteorder.BEUint64(blocks[8:])
-		ghashMul(productTable, y)
-		blocks = blocks[gcmBlockSize:]
-	}
-}
-
-// ghashUpdate extends y with more polynomial terms from data. If data is not a
-// multiple of gcmBlockSize bytes long then the remainder is zero padded.
-func ghashUpdate(productTable *[16]gcmFieldElement, y *gcmFieldElement, data []byte) {
-	fullBlocks := (len(data) >> 4) << 4
-	updateBlocks(productTable, y, data[:fullBlocks])
-
-	if len(data) != fullBlocks {
-		var partialBlock [gcmBlockSize]byte
-		copy(partialBlock[:], data[fullBlocks:])
-		updateBlocks(productTable, y, partialBlock[:])
+		// Split y*h into nine products:
+		//
+		//  zLo = y0*h0, y2*h2, (y0^y2) * (h0^h2)
+		//  zHi = y1*h1, y3*h3, (y1^y3) * (h1^h3)
+		//  zSum = (y0^y1) * (h0^h1), (y2^y3) * (h2^h3), ((y0^y2) ^ (y1^y3)) * ((h0^h2) ^ (h1^h3))
+		var zLo, zHi, zSum [3]uint64
+
+		zLo[0] = ghashMul(y[0], h[0])
+		zHi[0] = ghashMul(y[1], h[1])
+		zSum[0] = ghashMul(y[0]^y[1], h[0]^h[1])
+
+		zLo[1] = ghashMul(y[2], h[2])
+		zHi[1] = ghashMul(y[3], h[3])
+		zSum[1] = ghashMul(y[2]^y[3], h[2]^h[3])
+
+		zLo[2] = ghashMul(y[0]^y[2], h[0]^h[2])
+		zHi[2] = ghashMul(y[1]^y[3], h[1]^h[3])
+		zSum[2] = ghashMul((y[0]^y[2])^(y[1]^y[3]), (h[0]^h[2])^(h[1]^h[3]))
+
+		// Reconstruct three 128-bit terms from their constituent 64-bit products: result[i] combines zLo[i], zHi[i], and zSum[i]
+		var result [3][2]uint64
+		for i := range 3 {
+			mid := zSum[i] ^ zLo[i] ^ zHi[i]
+			// Add the lower 32 bits of the middle term to the low term
+			result[i][0] = zLo[i] ^ (mid << 32)
+			// Add the upper 32 bits of the middle term to the high term
+			result[i][1] = zHi[i] ^ (mid >> 32)
+		}
+
+		// Compute the middle term by adding the high and low terms to the sum term
+		result[2][0] ^= result[0][0] ^ result[1][0]
+		result[2][1] ^= result[0][1] ^ result[1][1]
+
+		// Add the lower bits of the middle term to the higher bits of the low term
+		result[0][1] ^= result[2][0]
+		// Add the higher bits of the middle term to the lower bits of the high term
+		result[1][0] ^= result[2][1]
+
+		// Reconstruct the 256-bit product from the low and high terms, shifted
+		// by one bit to satisfy the GHASH construction.
+		var z [4]uint64
+		z[0] = result[0][0] << 1
+		z[1] = (result[0][1] << 1) | (result[0][0] >> 63)
+		z[2] = (result[1][0] << 1) | (result[0][1] >> 63)
+		z[3] = (result[1][1] << 1) | (result[1][0] >> 63)
+
+		// Reduce the 256-bit product modulo the field polynomial. z0 and z1 contain
+		// the high-degree terms (255 to 128), and z2 and z3 contain the low-degree terms (127 to 0).
+		for i := range 2 {
+			lw := z[i]
+			// Add the remainders of the high-degree terms to the low-degree terms
+			z[i+2] ^= lw ^ (lw >> 1) ^ (lw >> 2) ^ (lw >> 7)
+			// Add the carries from the reduction
+			z[i+1] ^= (lw << 63) ^ (lw << 62) ^ (lw << 57)
+		}
+
+		// Write the reduced 128-bit product back into y
+		y[0], y[1], y[2], y[3] = uint32(z[2]), uint32(z[2]>>32), uint32(z[3]), uint32(z[3]>>32)
+	}
+
+	for i := range 4 {
+		byteorder.BEPutUint32(out[i*4:(i*4)+4], y[3-i])
 	}
 }