crypto/internal/fips140/aes: optimize ctrBlocks8Asm on amd64

Implement overflow-aware optimization in ctrBlocks8Asm: make a fast branch in case when there is no overflow. One branch per 8 blocks is faster than 7 increments in general purpose registers and transfers from them to XMM. Added AES-192 and AES-256 modes to the AES-CTR benchmark. Added a correctness test in ctr_test.go for the overflow optimization. This improves performance, especially in AES-128 mode. goos: windows goarch: amd64 pkg: crypto/cipher cpu: AMD Ryzen 7 5800H with Radeon Graphics │ B/s │ B/s vs base AESCTR/128/50-16 1.377Gi ± 0% 1.384Gi ± 0% +0.51% (p=0.028 n=20) AESCTR/128/1K-16 6.164Gi ± 0% 6.892Gi ± 1% +11.81% (p=0.000 n=20) AESCTR/128/8K-16 7.372Gi ± 0% 8.768Gi ± 1% +18.95% (p=0.000 n=20) AESCTR/192/50-16 1.289Gi ± 0% 1.279Gi ± 0% -0.75% (p=0.001 n=20) AESCTR/192/1K-16 5.734Gi ± 0% 6.011Gi ± 0% +4.83% (p=0.000 n=20) AESCTR/192/8K-16 6.889Gi ± 1% 7.437Gi ± 0% +7.96% (p=0.000 n=20) AESCTR/256/50-16 1.170Gi ± 0% 1.163Gi ± 0% -0.54% (p=0.005 n=20) AESCTR/256/1K-16 5.235Gi ± 0% 5.391Gi ± 0% +2.98% (p=0.000 n=20) AESCTR/256/8K-16 6.361Gi ± 0% 6.676Gi ± 0% +4.94% (p=0.000 n=20) geomean 3.681Gi 3.882Gi +5.46% The slight slowdown on 50-byte workloads is unrelated to this change, because such workloads never use ctrBlocks8Asm. Updates #76061 Change-Id: Idfd628ac8bb282d9c73c6adf048eb12274a41379 GitHub-Last-Rev: 5aadd39351 GitHub-Pull-Request: golang/go#76059 Reviewed-on: https://go-review.googlesource.com/c/go/+/714361 Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: AHMAD ابو وليد <mizommz@gmail.com> Reviewed-by: Filippo Valsorda <filippo@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Roland Shoemaker <roland@golang.org> Auto-Submit: Filippo Valsorda <filippo@golang.org>
2025-12-08 06:10:04 +00:00 · 2025-11-26 08:26:49 +00:00 · 2025-11-26 08:26:49 +00:00 · 71f8f031b2
commit 71f8f031b2
parent 03fcb33c0e
4 changed files with 185 additions and 29 deletions
--- a/src/crypto/cipher/benchmark_test.go
+++ b/src/crypto/cipher/benchmark_test.go
@ -65,12 +65,12 @@ func BenchmarkAESGCM(b *testing.B) {
 	}
 }
-func benchmarkAESStream(b *testing.B, mode func(cipher.Block, []byte) cipher.Stream, buf []byte) {
+func benchmarkAESStream(b *testing.B, mode func(cipher.Block, []byte) cipher.Stream, buf []byte, keySize int) {
 	b.SetBytes(int64(len(buf)))
-	var key [16]byte
+	key := make([]byte, keySize)
 	var iv [16]byte
-	aes, _ := aes.NewCipher(key[:])
+	aes, _ := aes.NewCipher(key)
 	stream := mode(aes, iv[:])
 	b.ResetTimer()
@ -87,15 +87,20 @@ const almost1K = 1024 - 5
 const almost8K = 8*1024 - 5
 func BenchmarkAESCTR(b *testing.B) {
 	for _, keyBits := range []int{128, 192, 256} {
 		keySize := keyBits / 8
 		b.Run(strconv.Itoa(keyBits), func(b *testing.B) {
 			b.Run("50", func(b *testing.B) {
-		benchmarkAESStream(b, cipher.NewCTR, make([]byte, 50))
+				benchmarkAESStream(b, cipher.NewCTR, make([]byte, 50), keySize)
 			})
 			b.Run("1K", func(b *testing.B) {
-		benchmarkAESStream(b, cipher.NewCTR, make([]byte, almost1K))
+				benchmarkAESStream(b, cipher.NewCTR, make([]byte, almost1K), keySize)
 			})
 			b.Run("8K", func(b *testing.B) {
-		benchmarkAESStream(b, cipher.NewCTR, make([]byte, almost8K))
+				benchmarkAESStream(b, cipher.NewCTR, make([]byte, almost8K), keySize)
 			})
 		})
 	}
 }
 func BenchmarkAESCBCEncrypt1K(b *testing.B) {
--- a/src/crypto/cipher/ctr_aes_test.go
+++ b/src/crypto/cipher/ctr_aes_test.go
@ -17,6 +17,7 @@ import (
 	"crypto/internal/boring"
 	"crypto/internal/cryptotest"
 	fipsaes "crypto/internal/fips140/aes"
 	"encoding/binary"
 	"encoding/hex"
 	"fmt"
 	"math/rand"
@ -117,6 +118,60 @@ func makeTestingCiphers(aesBlock cipher.Block, iv []byte) (genericCtr, multibloc
 	return cipher.NewCTR(wrap(aesBlock), iv), cipher.NewCTR(aesBlock, iv)
 }
 // TestCTR_AES_blocks8FastPathMatchesGeneric ensures the overlow aware branch
 // produces identical keystreams to the generic counter walker across
 // representative IVs, including near-overflow cases.
 func TestCTR_AES_blocks8FastPathMatchesGeneric(t *testing.T) {
 	key := make([]byte, aes.BlockSize)
 	block, err := aes.NewCipher(key)
 	if err != nil {
 		t.Fatal(err)
 	}
 	if _, ok := block.(*fipsaes.Block); !ok {
 		t.Skip("requires crypto/internal/fips140/aes")
 	}
 	keystream := make([]byte, 8*aes.BlockSize)
 	testCases := []struct {
 		name string
 		hi   uint64
 		lo   uint64
 	}{
 		{"Zero", 0, 0},
 		{"NearOverflowMinus7", 1, ^uint64(0) - 7},
 		{"NearOverflowMinus6", 2, ^uint64(0) - 6},
 		{"Overflow", 0, ^uint64(0)},
 	}
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			var iv [aes.BlockSize]byte
 			binary.BigEndian.PutUint64(iv[0:8], tc.hi)
 			binary.BigEndian.PutUint64(iv[8:], tc.lo)
 			generic, multiblock := makeTestingCiphers(block, iv[:])
 			genericOut := make([]byte, len(keystream))
 			multiblockOut := make([]byte, len(keystream))
 			generic.XORKeyStream(genericOut, keystream)
 			multiblock.XORKeyStream(multiblockOut, keystream)
 			if !bytes.Equal(multiblockOut, genericOut) {
 				t.Fatalf("mismatch for iv %#x:%#x\n"+
 					"asm keystream: %x\n"+
 					"gen keystream: %x\n"+
 					"asm counters: %x\n"+
 					"gen counters: %x",
 					tc.hi, tc.lo, multiblockOut, genericOut,
 					extractCounters(block, multiblockOut),
 					extractCounters(block, genericOut))
 			}
 		})
 	}
 }
 func randBytes(t *testing.T, r *rand.Rand, count int) []byte {
 	t.Helper()
 	buf := make([]byte, count)
@ -297,3 +352,12 @@ func TestCTR_AES_multiblock_XORKeyStreamAt(t *testing.T) {
 		})
 	}
 }
 func extractCounters(block cipher.Block, keystream []byte) []byte {
 	blockSize := block.BlockSize()
 	res := make([]byte, len(keystream))
 	for i := 0; i < len(keystream); i += blockSize {
 		block.Decrypt(res[i:i+blockSize], keystream[i:i+blockSize])
 	}
 	return res
 }
--- a/src/crypto/internal/fips140/aes/_asm/ctr/ctr_amd64_asm.go
+++ b/src/crypto/internal/fips140/aes/_asm/ctr/ctr_amd64_asm.go
@ -40,12 +40,71 @@ func ctrBlocks(numBlocks int) {
 	bswap := XMM()
 	MOVOU(bswapMask(), bswap)
-	blocks := make([]VecVirtual, 0, numBlocks)
+	blocks := make([]VecVirtual, numBlocks)
 	// For the 8-block case we optimize counter generation. We build the first
 	// counter as usual, then check whether the remaining seven increments will
 	// overflow. When they do not (the common case) we keep the work entirely in
 	// XMM registers to avoid expensive general-purpose -> XMM moves. Otherwise
 	// we fall back to the traditional scalar path.
 	if numBlocks == 8 {
 		for i := range blocks {
 			blocks[i] = XMM()
 		}
 		base := XMM()
 		tmp := GP64()
 		addVec := XMM()
 		MOVQ(ivlo, blocks[0])
 		PINSRQ(Imm(1), ivhi, blocks[0])
 		MOVAPS(blocks[0], base)
 		PSHUFB(bswap, blocks[0])
 		// Check whether any of these eight counters will overflow.
 		MOVQ(ivlo, tmp)
 		ADDQ(Imm(uint64(numBlocks-1)), tmp)
 		slowLabel := fmt.Sprintf("ctr%d_slow", numBlocks)
 		doneLabel := fmt.Sprintf("ctr%d_done", numBlocks)
 		JC(LabelRef(slowLabel))
 		// Fast branch: create an XMM increment vector containing the value 1.
 		// Adding it to the base counter yields each subsequent counter.
 		XORQ(tmp, tmp)
 		INCQ(tmp)
 		PXOR(addVec, addVec)
 		PINSRQ(Imm(0), tmp, addVec)
 		for i := 1; i < numBlocks; i++ {
 			PADDQ(addVec, base)
 			MOVAPS(base, blocks[i])
 		}
 		JMP(LabelRef(doneLabel))
 		Label(slowLabel)
 		ADDQ(Imm(1), ivlo)
 		ADCQ(Imm(0), ivhi)
 		for i := 1; i < numBlocks; i++ {
 			MOVQ(ivlo, blocks[i])
 			PINSRQ(Imm(1), ivhi, blocks[i])
 			if i < numBlocks-1 {
 				ADDQ(Imm(1), ivlo)
 				ADCQ(Imm(0), ivhi)
 			}
 		}
 		Label(doneLabel)
 		// Convert little-endian counters to big-endian after the branch since
 		// both paths share the same shuffle sequence.
 		for i := 1; i < numBlocks; i++ {
 			PSHUFB(bswap, blocks[i])
 		}
 	} else {
 		// Lay out counter block plaintext.
 		for i := 0; i < numBlocks; i++ {
 			x := XMM()
-		blocks = append(blocks, x)
+			blocks[i] = x
 			MOVQ(ivlo, x)
 			PINSRQ(Imm(1), ivhi, x)
@ -55,6 +114,7 @@ func ctrBlocks(numBlocks int) {
 				ADCQ(Imm(0), ivhi)
 			}
 		}
 	}
 	// Initial key add.
 	aesRoundStart(blocks, Mem{Base: xk})
--- a/src/crypto/internal/fips140/aes/ctr_amd64.s
+++ b/src/crypto/internal/fips140/aes/ctr_amd64.s
@ -286,41 +286,68 @@ TEXT ·ctrBlocks8Asm(SB), $0-48
 	MOVOU  bswapMask<>+0(SB), X0
 	MOVQ   SI, X1
 	PINSRQ $0x01, DI, X1
 	MOVAPS X1, X8
 	PSHUFB X0, X1
 	MOVQ   SI, R8
 	ADDQ   $0x07, R8
 	JC     ctr8_slow
 	XORQ   R8, R8
 	INCQ   R8
 	PXOR   X9, X9
 	PINSRQ $0x00, R8, X9
 	PADDQ  X9, X8
 	MOVAPS X8, X2
 	PADDQ  X9, X8
 	MOVAPS X8, X3
 	PADDQ  X9, X8
 	MOVAPS X8, X4
 	PADDQ  X9, X8
 	MOVAPS X8, X5
 	PADDQ  X9, X8
 	MOVAPS X8, X6
 	PADDQ  X9, X8
 	MOVAPS X8, X7
 	PADDQ  X9, X8
 	MOVAPS X8, X8
 	JMP    ctr8_done
 ctr8_slow:
 	ADDQ   $0x01, SI
 	ADCQ   $0x00, DI
 	MOVQ   SI, X2
 	PINSRQ $0x01, DI, X2
 	PSHUFB X0, X2
 	ADDQ   $0x01, SI
 	ADCQ   $0x00, DI
 	MOVQ   SI, X3
 	PINSRQ $0x01, DI, X3
 	PSHUFB X0, X3
 	ADDQ   $0x01, SI
 	ADCQ   $0x00, DI
 	MOVQ   SI, X4
 	PINSRQ $0x01, DI, X4
 	PSHUFB X0, X4
 	ADDQ   $0x01, SI
 	ADCQ   $0x00, DI
 	MOVQ   SI, X5
 	PINSRQ $0x01, DI, X5
 	PSHUFB X0, X5
 	ADDQ   $0x01, SI
 	ADCQ   $0x00, DI
 	MOVQ   SI, X6
 	PINSRQ $0x01, DI, X6
 	PSHUFB X0, X6
 	ADDQ   $0x01, SI
 	ADCQ   $0x00, DI
 	MOVQ   SI, X7
 	PINSRQ $0x01, DI, X7
 	PSHUFB X0, X7
 	ADDQ   $0x01, SI
 	ADCQ   $0x00, DI
 	MOVQ   SI, X8
 	PINSRQ $0x01, DI, X8
 ctr8_done:
 	PSHUFB X0, X2
 	PSHUFB X0, X3
 	PSHUFB X0, X4
 	PSHUFB X0, X5
 	PSHUFB X0, X6
 	PSHUFB X0, X7
 	PSHUFB X0, X8
 	MOVUPS (CX), X0
 	PXOR   X0, X1