crypto/elliptic: reduce allocations on amd64

This is inspired by https://blog.cloudflare.com/go-dont-collect-my-garbage/ This CL adds allocation tracking and parallelizes p256-related benchmarks. Amount of allocations can be significantly reduced by marking amd64 asm functions as noescape. This exposes a bug in p256MovCond: PANDN with memory argument will fault if memory is not aligned, so they are replaced with MOVDQU (which is ok with unaligned memory) and register version of PANDN. Results on 88-thread machine (2x 22 cores) below: crypto/elliptic: name old time/op new time/op delta BaseMultP256-88 1.50µs ±11% 1.19µs ± 5% -20.20% (p=0.000 n=10+10) ScalarMultP256-88 5.47µs ± 5% 3.63µs ±10% -33.66% (p=0.000 n=9+10) name old alloc/op new alloc/op delta BaseMultP256-88 800B ± 0% 288B ± 0% -64.00% (p=0.000 n=10+10) ScalarMultP256-88 2.59kB ± 0% 0.26kB ± 0% -90.12% (p=0.000 n=10+10) name old allocs/op new allocs/op delta BaseMultP256-88 13.0 ± 0% 6.0 ± 0% -53.85% (p=0.000 n=10+10) ScalarMultP256-88 16.0 ± 0% 5.0 ± 0% -68.75% (p=0.000 n=10+10) crypto/ecdsa: name old time/op new time/op delta SignP256-88 8.63µs ±37% 7.55µs ±38% ~ (p=0.393 n=10+10) VerifyP256-88 13.9µs ± 8% 7.0µs ± 7% -49.29% (p=0.000 n=10+9) KeyGeneration-88 2.77µs ±11% 2.34µs ±11% -15.57% (p=0.000 n=10+10) name old alloc/op new alloc/op delta SignP256-88 4.14kB ± 1% 2.98kB ± 2% -27.94% (p=0.000 n=10+10) VerifyP256-88 4.47kB ± 0% 0.99kB ± 0% -77.84% (p=0.000 n=9+10) KeyGeneration-88 1.21kB ± 0% 0.69kB ± 0% -42.78% (p=0.000 n=10+10) name old allocs/op new allocs/op delta SignP256-88 47.0 ± 0% 34.0 ± 0% -27.66% (p=0.000 n=10+10) VerifyP256-88 38.0 ± 0% 17.0 ± 0% -55.26% (p=0.000 n=10+10) KeyGeneration-88 20.0 ± 0% 13.0 ± 0% -35.00% (p=0.000 n=10+10) On machine with only 4 cores, results are much less impressive: around 2% performance gain. Change-Id: I8a2f8168f83d27ad9ace1b4b1a1e11cb83edf717 Reviewed-on: https://go-review.googlesource.com/80757 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Ian Lance Taylor <iant@golang.org>
2025-12-08 06:10:04 +00:00 · 2017-11-29 13:20:08 -06:00 · 2017-11-29 13:20:08 -06:00 · 73f284e2f2
commit 73f284e2f2
parent 1992893307
4 changed files with 70 additions and 27 deletions
--- a/src/crypto/ecdsa/ecdsa_test.go
+++ b/src/crypto/ecdsa/ecdsa_test.go
@ -48,10 +48,13 @@ func BenchmarkSignP256(b *testing.B) {
 	hashed := []byte("testing")
 	priv, _ := GenerateKey(p256, rand.Reader)

+	b.ReportAllocs()
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		_, _, _ = Sign(rand.Reader, priv, hashed)
-	}
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			_, _, _ = Sign(rand.Reader, priv, hashed)
+		}
+	})
 }

 func BenchmarkSignP384(b *testing.B) {
@ -60,10 +63,13 @@ func BenchmarkSignP384(b *testing.B) {
 	hashed := []byte("testing")
 	priv, _ := GenerateKey(p384, rand.Reader)

+	b.ReportAllocs()
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		_, _, _ = Sign(rand.Reader, priv, hashed)
-	}
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			_, _, _ = Sign(rand.Reader, priv, hashed)
+		}
+	})
 }

 func BenchmarkVerifyP256(b *testing.B) {
@ -73,20 +79,26 @@ func BenchmarkVerifyP256(b *testing.B) {
 	priv, _ := GenerateKey(p256, rand.Reader)
 	r, s, _ := Sign(rand.Reader, priv, hashed)

+	b.ReportAllocs()
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		Verify(&priv.PublicKey, hashed, r, s)
-	}
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			Verify(&priv.PublicKey, hashed, r, s)
+		}
+	})
 }

 func BenchmarkKeyGeneration(b *testing.B) {
 	b.ResetTimer()
 	p256 := elliptic.P256()

+	b.ReportAllocs()
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		GenerateKey(p256, rand.Reader)
-	}
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			GenerateKey(p256, rand.Reader)
+		}
+	})
 }

 func testSignAndVerify(t *testing.T, c elliptic.Curve, tag string) {