crypto/md5: improve ppc64x performance

This is mostly cleanup and simplification. It removes many
unneeded register moves, loads, and bit-twiddling operations
that were holdovers from porting the amd64 version.

The updated code loads each block once per loop iteration,
instead of reloading message words round by round, and the
logical operations now match the original MD5 specification.
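
For reference, the rewritten ROUND1-ROUND4 macros compute the four
round functions directly as RFC 1321 defines them (a minimal Go
sketch for illustration, not part of the diff; the AND/ANDN/OR,
AND/ANDN/OR, XOR/XOR, and ORN/XOR sequences in the new macros
correspond to F, G, H, and I):

	// MD5 round functions as defined in RFC 1321.
	func F(b, c, d uint32) uint32 { return (b & c) | (^b & d) } // ROUND1: AND, ANDN, OR
	func G(b, c, d uint32) uint32 { return (b & d) | (c &^ d) } // ROUND2: AND, ANDN, OR
	func H(b, c, d uint32) uint32 { return b ^ c ^ d }          // ROUND3: XOR, XOR
	func I(b, c, d uint32) uint32 { return c ^ (b | ^d) }       // ROUND4: ORN, XOR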

Also, add extra sizes to the benchmarks to give more data
points on how the implementation scales with input size.
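
The tables below are benchstat output. Something like the following
reproduces them (the commands are illustrative, not part of this
change):

	go test crypto/md5 -run NONE -bench Hash -count 10 > old.txt
	# apply this change, rebuild, then:
	go test crypto/md5 -run NONE -bench Hash -count 10 > new.txt
	benchstat old.txt new.txt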

All in all, this is roughly a 20% improvement on ppc64le
code running on POWER9 (POWER8 is similar, but around
16%):

name                 old time/op    new time/op    delta
Hash8Bytes              297ns ± 0%     255ns ± 0%  -14.14%
Hash64                  527ns ± 0%     444ns ± 0%  -15.76%
Hash128                 771ns ± 0%     645ns ± 0%  -16.35%
Hash256                1.26µs ± 0%    1.05µs ± 0%  -16.68%
Hash512                2.23µs ± 0%    1.85µs ± 0%  -16.82%
Hash1K                 4.16µs ± 0%    3.46µs ± 0%  -16.83%
Hash8K                 31.2µs ± 0%    26.0µs ± 0%  -16.74%
Hash1M                 3.58ms ± 0%    2.98ms ± 0%  -16.74%
Hash8M                 26.1ms ± 0%    21.7ms ± 0%  -16.81%
Hash8BytesUnaligned     297ns ± 0%     255ns ± 0%  -14.08%
Hash1KUnaligned        4.16µs ± 0%    3.46µs ± 0%  -16.79%
Hash8KUnaligned        31.2µs ± 0%    26.0µs ± 0%  -16.78%

name                 old speed      new speed      delta
Hash8Bytes           26.9MB/s ± 0%  31.4MB/s ± 0%  +16.45%
Hash64                122MB/s ± 0%   144MB/s ± 0%  +18.69%
Hash128               166MB/s ± 0%   199MB/s ± 0%  +19.54%
Hash256               203MB/s ± 0%   244MB/s ± 0%  +20.01%
Hash512               230MB/s ± 0%   276MB/s ± 0%  +20.18%
Hash1K                246MB/s ± 0%   296MB/s ± 0%  +20.26%
Hash8K                263MB/s ± 0%   315MB/s ± 0%  +20.11%
Hash1M                293MB/s ± 0%   352MB/s ± 0%  +20.10%
Hash8M                321MB/s ± 0%   386MB/s ± 0%  +20.21%
Hash8BytesUnaligned  26.9MB/s ± 0%  31.4MB/s ± 0%  +16.41%
Hash1KUnaligned       246MB/s ± 0%   296MB/s ± 0%  +20.19%
Hash8KUnaligned       263MB/s ± 0%   315MB/s ± 0%  +20.15%

Change-Id: I269bfa6878966bb4f6a64dc349100f5dc453ab7c
Reviewed-on: https://go-review.googlesource.com/c/go/+/300613
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Emmanuel Odeke <emmanuel@orijtech.com>
Paul E. Murphy, 2021-03-10 17:06:54 -06:00 (committed by Lynn Boger)
parent 6ccb5c49cc
commit 4350e4961a
2 changed files with 171 additions and 136 deletions

src/crypto/md5/md5_test.go

@@ -212,7 +212,7 @@ func TestLargeHashes(t *testing.T) {
 }
 
 var bench = New()
-var buf = make([]byte, 8192+1)
+var buf = make([]byte, 1024*1024*8+1)
 var sum = make([]byte, bench.Size())
 
 func benchmarkSize(b *testing.B, size int, unaligned bool) {
@@ -235,6 +235,22 @@ func BenchmarkHash8Bytes(b *testing.B) {
 	benchmarkSize(b, 8, false)
 }
 
+func BenchmarkHash64(b *testing.B) {
+	benchmarkSize(b, 64, false)
+}
+
+func BenchmarkHash128(b *testing.B) {
+	benchmarkSize(b, 128, false)
+}
+
+func BenchmarkHash256(b *testing.B) {
+	benchmarkSize(b, 256, false)
+}
+
+func BenchmarkHash512(b *testing.B) {
+	benchmarkSize(b, 512, false)
+}
+
 func BenchmarkHash1K(b *testing.B) {
 	benchmarkSize(b, 1024, false)
 }
@@ -243,6 +259,14 @@ func BenchmarkHash8K(b *testing.B) {
 	benchmarkSize(b, 8192, false)
 }
 
+func BenchmarkHash1M(b *testing.B) {
+	benchmarkSize(b, 1024*1024, false)
+}
+
+func BenchmarkHash8M(b *testing.B) {
+	benchmarkSize(b, 8*1024*1024, false)
+}
+
 func BenchmarkHash8BytesUnaligned(b *testing.B) {
 	benchmarkSize(b, 8, true)
 }

src/crypto/md5/md5block_ppc64x.s

@@ -28,169 +28,179 @@
 	MOVWBR	(idx)(ptr), dst
 #endif
 
+#define M00 R18
+#define M01 R19
+#define M02 R20
+#define M03 R24
+#define M04 R25
+#define M05 R26
+#define M06 R27
+#define M07 R28
+#define M08 R29
+#define M09 R21
+#define M10 R11
+#define M11 R8
+#define M12 R7
+#define M13 R12
+#define M14 R23
+#define M15 R10
+
+#define ROUND1(a, b, c, d, index, const, shift) \
+	ADD	$const, index, R9; \
+	ADD	R9, a; \
+	AND	b, c, R9; \
+	ANDN	b, d, R31; \
+	OR	R9, R31, R9; \
+	ADD	R9, a; \
+	ROTLW	$shift, a; \
+	ADD	b, a;
+
+#define ROUND2(a, b, c, d, index, const, shift) \
+	ADD	$const, index, R9; \
+	ADD	R9, a; \
+	AND	b, d, R31; \
+	ANDN	d, c, R9; \
+	OR	R9, R31; \
+	ADD	R31, a; \
+	ROTLW	$shift, a; \
+	ADD	b, a;
+
+#define ROUND3(a, b, c, d, index, const, shift) \
+	ADD	$const, index, R9; \
+	ADD	R9, a; \
+	XOR	d, c, R31; \
+	XOR	b, R31; \
+	ADD	R31, a; \
+	ROTLW	$shift, a; \
+	ADD	b, a;
+
+#define ROUND4(a, b, c, d, index, const, shift) \
+	ADD	$const, index, R9; \
+	ADD	R9, a; \
+	ORN	d, b, R31; \
+	XOR	c, R31; \
+	ADD	R31, a; \
+	ROTLW	$shift, a; \
+	ADD	b, a;
+
 TEXT ·block(SB),NOSPLIT,$0-32
 	MOVD	dig+0(FP), R10
 	MOVD	p+8(FP), R6
 	MOVD	p_len+16(FP), R5
-	SLD	$6, R5
+
+	// We assume p_len >= 64
 	SRD	$6, R5
-	ADD	R6, R5, R7
+	MOVD	R5, CTR
 
 	MOVWZ	0(R10), R22
 	MOVWZ	4(R10), R3
 	MOVWZ	8(R10), R4
 	MOVWZ	12(R10), R5
 
-	CMP	R6, R7
-	BEQ	end
 loop:
-	MOVWZ	R22, R14
-	MOVWZ	R3, R15
-	MOVWZ	R4, R16
-	MOVWZ	R5, R17
+	MOVD	R22, R14
+	MOVD	R3, R15
+	MOVD	R4, R16
+	MOVD	R5, R17
 
-	ENDIAN_MOVE(0,R6,R8,R21)
-	MOVWZ	R5, R9
-
-#define ROUND1(a, b, c, d, index, const, shift) \
-	XOR	c, R9; \
-	ADD	$const, a; \
-	ADD	R8, a; \
-	AND	b, R9; \
-	XOR	d, R9; \
-	ENDIAN_MOVE(index*4,R6,R8,R21); \
-	ADD	R9, a; \
-	RLWMI	$shift, a, $0xffffffff, a; \
-	MOVWZ	c, R9; \
-	ADD	b, a; \
-	MOVWZ	a, a
-
-	ROUND1(R22,R3,R4,R5, 1,0xd76aa478, 7);
-	ROUND1(R5,R22,R3,R4, 2,0xe8c7b756,12);
-	ROUND1(R4,R5,R22,R3, 3,0x242070db,17);
-	ROUND1(R3,R4,R5,R22, 4,0xc1bdceee,22);
-	ROUND1(R22,R3,R4,R5, 5,0xf57c0faf, 7);
-	ROUND1(R5,R22,R3,R4, 6,0x4787c62a,12);
-	ROUND1(R4,R5,R22,R3, 7,0xa8304613,17);
-	ROUND1(R3,R4,R5,R22, 8,0xfd469501,22);
-	ROUND1(R22,R3,R4,R5, 9,0x698098d8, 7);
-	ROUND1(R5,R22,R3,R4,10,0x8b44f7af,12);
-	ROUND1(R4,R5,R22,R3,11,0xffff5bb1,17);
-	ROUND1(R3,R4,R5,R22,12,0x895cd7be,22);
-	ROUND1(R22,R3,R4,R5,13,0x6b901122, 7);
-	ROUND1(R5,R22,R3,R4,14,0xfd987193,12);
-	ROUND1(R4,R5,R22,R3,15,0xa679438e,17);
-	ROUND1(R3,R4,R5,R22, 0,0x49b40821,22);
-
-	ENDIAN_MOVE(1*4,R6,R8,R21)
-	MOVWZ	R5, R9
-	MOVWZ	R5, R10
-
-#define ROUND2(a, b, c, d, index, const, shift) \
-	XOR	$0xffffffff, R9; \ // NOTW R9
-	ADD	$const, a; \
-	ADD	R8, a; \
-	AND	b, R10; \
-	AND	c, R9; \
-	ENDIAN_MOVE(index*4,R6,R8,R21); \
-	OR	R9, R10; \
-	MOVWZ	c, R9; \
-	ADD	R10, a; \
-	MOVWZ	c, R10; \
-	RLWMI	$shift, a, $0xffffffff, a; \
-	ADD	b, a; \
-	MOVWZ	a, a
-
-	ROUND2(R22,R3,R4,R5, 6,0xf61e2562, 5);
-	ROUND2(R5,R22,R3,R4,11,0xc040b340, 9);
-	ROUND2(R4,R5,R22,R3, 0,0x265e5a51,14);
-	ROUND2(R3,R4,R5,R22, 5,0xe9b6c7aa,20);
-	ROUND2(R22,R3,R4,R5,10,0xd62f105d, 5);
-	ROUND2(R5,R22,R3,R4,15, 0x2441453, 9);
-	ROUND2(R4,R5,R22,R3, 4,0xd8a1e681,14);
-	ROUND2(R3,R4,R5,R22, 9,0xe7d3fbc8,20);
-	ROUND2(R22,R3,R4,R5,14,0x21e1cde6, 5);
-	ROUND2(R5,R22,R3,R4, 3,0xc33707d6, 9);
-	ROUND2(R4,R5,R22,R3, 8,0xf4d50d87,14);
-	ROUND2(R3,R4,R5,R22,13,0x455a14ed,20);
-	ROUND2(R22,R3,R4,R5, 2,0xa9e3e905, 5);
-	ROUND2(R5,R22,R3,R4, 7,0xfcefa3f8, 9);
-	ROUND2(R4,R5,R22,R3,12,0x676f02d9,14);
-	ROUND2(R3,R4,R5,R22, 0,0x8d2a4c8a,20);
-
-	ENDIAN_MOVE(5*4,R6,R8,R21)
-	MOVWZ	R4, R9
-
-#define ROUND3(a, b, c, d, index, const, shift) \
-	ADD	$const, a; \
-	ADD	R8, a; \
-	ENDIAN_MOVE(index*4,R6,R8,R21); \
-	XOR	d, R9; \
-	XOR	b, R9; \
-	ADD	R9, a; \
-	RLWMI	$shift, a, $0xffffffff, a; \
-	MOVWZ	b, R9; \
-	ADD	b, a; \
-	MOVWZ	a, a
-
-	ROUND3(R22,R3,R4,R5, 8,0xfffa3942, 4);
-	ROUND3(R5,R22,R3,R4,11,0x8771f681,11);
-	ROUND3(R4,R5,R22,R3,14,0x6d9d6122,16);
-	ROUND3(R3,R4,R5,R22, 1,0xfde5380c,23);
-	ROUND3(R22,R3,R4,R5, 4,0xa4beea44, 4);
-	ROUND3(R5,R22,R3,R4, 7,0x4bdecfa9,11);
-	ROUND3(R4,R5,R22,R3,10,0xf6bb4b60,16);
-	ROUND3(R3,R4,R5,R22,13,0xbebfbc70,23);
-	ROUND3(R22,R3,R4,R5, 0,0x289b7ec6, 4);
-	ROUND3(R5,R22,R3,R4, 3,0xeaa127fa,11);
-	ROUND3(R4,R5,R22,R3, 6,0xd4ef3085,16);
-	ROUND3(R3,R4,R5,R22, 9, 0x4881d05,23);
-	ROUND3(R22,R3,R4,R5,12,0xd9d4d039, 4);
-	ROUND3(R5,R22,R3,R4,15,0xe6db99e5,11);
-	ROUND3(R4,R5,R22,R3, 2,0x1fa27cf8,16);
-	ROUND3(R3,R4,R5,R22, 0,0xc4ac5665,23);
-
-	ENDIAN_MOVE(0,R6,R8,R21)
-	MOVWZ	$0xffffffff, R9
-	XOR	R5, R9
-
-#define ROUND4(a, b, c, d, index, const, shift) \
-	ADD	$const, a; \
-	ADD	R8, a; \
-	OR	b, R9; \
-	XOR	c, R9; \
-	ADD	R9, a; \
-	ENDIAN_MOVE(index*4,R6,R8,R21); \
-	MOVWZ	$0xffffffff, R9; \
-	RLWMI	$shift, a, $0xffffffff, a; \
-	XOR	c, R9; \
-	ADD	b, a; \
-	MOVWZ	a, a
-
-	ROUND4(R22,R3,R4,R5, 7,0xf4292244, 6);
-	ROUND4(R5,R22,R3,R4,14,0x432aff97,10);
-	ROUND4(R4,R5,R22,R3, 5,0xab9423a7,15);
-	ROUND4(R3,R4,R5,R22,12,0xfc93a039,21);
-	ROUND4(R22,R3,R4,R5, 3,0x655b59c3, 6);
-	ROUND4(R5,R22,R3,R4,10,0x8f0ccc92,10);
-	ROUND4(R4,R5,R22,R3, 1,0xffeff47d,15);
-	ROUND4(R3,R4,R5,R22, 8,0x85845dd1,21);
-	ROUND4(R22,R3,R4,R5,15,0x6fa87e4f, 6);
-	ROUND4(R5,R22,R3,R4, 6,0xfe2ce6e0,10);
-	ROUND4(R4,R5,R22,R3,13,0xa3014314,15);
-	ROUND4(R3,R4,R5,R22, 4,0x4e0811a1,21);
-	ROUND4(R22,R3,R4,R5,11,0xf7537e82, 6);
-	ROUND4(R5,R22,R3,R4, 2,0xbd3af235,10);
-	ROUND4(R4,R5,R22,R3, 9,0x2ad7d2bb,15);
-	ROUND4(R3,R4,R5,R22, 0,0xeb86d391,21);
+	ENDIAN_MOVE( 0,R6,M00,M15)
+	ENDIAN_MOVE( 4,R6,M01,M15)
+	ENDIAN_MOVE( 8,R6,M02,M15)
+	ENDIAN_MOVE(12,R6,M03,M15)
+
+	ROUND1(R22,R3,R4,R5,M00,0xd76aa478, 7);
+	ROUND1(R5,R22,R3,R4,M01,0xe8c7b756,12);
+	ROUND1(R4,R5,R22,R3,M02,0x242070db,17);
+	ROUND1(R3,R4,R5,R22,M03,0xc1bdceee,22);
+
+	ENDIAN_MOVE(16,R6,M04,M15)
+	ENDIAN_MOVE(20,R6,M05,M15)
+	ENDIAN_MOVE(24,R6,M06,M15)
+	ENDIAN_MOVE(28,R6,M07,M15)
+
+	ROUND1(R22,R3,R4,R5,M04,0xf57c0faf, 7);
+	ROUND1(R5,R22,R3,R4,M05,0x4787c62a,12);
+	ROUND1(R4,R5,R22,R3,M06,0xa8304613,17);
+	ROUND1(R3,R4,R5,R22,M07,0xfd469501,22);
+
+	ENDIAN_MOVE(32,R6,M08,M15)
+	ENDIAN_MOVE(36,R6,M09,M15)
+	ENDIAN_MOVE(40,R6,M10,M15)
+	ENDIAN_MOVE(44,R6,M11,M15)
+
+	ROUND1(R22,R3,R4,R5,M08,0x698098d8, 7);
+	ROUND1(R5,R22,R3,R4,M09,0x8b44f7af,12);
+	ROUND1(R4,R5,R22,R3,M10,0xffff5bb1,17);
+	ROUND1(R3,R4,R5,R22,M11,0x895cd7be,22);
+
+	ENDIAN_MOVE(48,R6,M12,M15)
+	ENDIAN_MOVE(52,R6,M13,M15)
+	ENDIAN_MOVE(56,R6,M14,M15)
+	ENDIAN_MOVE(60,R6,M15,M15)
+
+	ROUND1(R22,R3,R4,R5,M12,0x6b901122, 7);
+	ROUND1(R5,R22,R3,R4,M13,0xfd987193,12);
+	ROUND1(R4,R5,R22,R3,M14,0xa679438e,17);
+	ROUND1(R3,R4,R5,R22,M15,0x49b40821,22);
+
+	ROUND2(R22,R3,R4,R5,M01,0xf61e2562, 5);
+	ROUND2(R5,R22,R3,R4,M06,0xc040b340, 9);
+	ROUND2(R4,R5,R22,R3,M11,0x265e5a51,14);
+	ROUND2(R3,R4,R5,R22,M00,0xe9b6c7aa,20);
+	ROUND2(R22,R3,R4,R5,M05,0xd62f105d, 5);
+	ROUND2(R5,R22,R3,R4,M10, 0x2441453, 9);
+	ROUND2(R4,R5,R22,R3,M15,0xd8a1e681,14);
+	ROUND2(R3,R4,R5,R22,M04,0xe7d3fbc8,20);
+	ROUND2(R22,R3,R4,R5,M09,0x21e1cde6, 5);
+	ROUND2(R5,R22,R3,R4,M14,0xc33707d6, 9);
+	ROUND2(R4,R5,R22,R3,M03,0xf4d50d87,14);
+	ROUND2(R3,R4,R5,R22,M08,0x455a14ed,20);
+	ROUND2(R22,R3,R4,R5,M13,0xa9e3e905, 5);
+	ROUND2(R5,R22,R3,R4,M02,0xfcefa3f8, 9);
+	ROUND2(R4,R5,R22,R3,M07,0x676f02d9,14);
+	ROUND2(R3,R4,R5,R22,M12,0x8d2a4c8a,20);
+
+	ROUND3(R22,R3,R4,R5,M05,0xfffa3942, 4);
+	ROUND3(R5,R22,R3,R4,M08,0x8771f681,11);
+	ROUND3(R4,R5,R22,R3,M11,0x6d9d6122,16);
+	ROUND3(R3,R4,R5,R22,M14,0xfde5380c,23);
+	ROUND3(R22,R3,R4,R5,M01,0xa4beea44, 4);
+	ROUND3(R5,R22,R3,R4,M04,0x4bdecfa9,11);
+	ROUND3(R4,R5,R22,R3,M07,0xf6bb4b60,16);
+	ROUND3(R3,R4,R5,R22,M10,0xbebfbc70,23);
+	ROUND3(R22,R3,R4,R5,M13,0x289b7ec6, 4);
+	ROUND3(R5,R22,R3,R4,M00,0xeaa127fa,11);
+	ROUND3(R4,R5,R22,R3,M03,0xd4ef3085,16);
+	ROUND3(R3,R4,R5,R22,M06, 0x4881d05,23);
+	ROUND3(R22,R3,R4,R5,M09,0xd9d4d039, 4);
+	ROUND3(R5,R22,R3,R4,M12,0xe6db99e5,11);
+	ROUND3(R4,R5,R22,R3,M15,0x1fa27cf8,16);
+	ROUND3(R3,R4,R5,R22,M02,0xc4ac5665,23);
+
+	ROUND4(R22,R3,R4,R5,M00,0xf4292244, 6);
+	ROUND4(R5,R22,R3,R4,M07,0x432aff97,10);
+	ROUND4(R4,R5,R22,R3,M14,0xab9423a7,15);
+	ROUND4(R3,R4,R5,R22,M05,0xfc93a039,21);
+	ROUND4(R22,R3,R4,R5,M12,0x655b59c3, 6);
+	ROUND4(R5,R22,R3,R4,M03,0x8f0ccc92,10);
+	ROUND4(R4,R5,R22,R3,M10,0xffeff47d,15);
+	ROUND4(R3,R4,R5,R22,M01,0x85845dd1,21);
+	ROUND4(R22,R3,R4,R5,M08,0x6fa87e4f, 6);
+	ROUND4(R5,R22,R3,R4,M15,0xfe2ce6e0,10);
+	ROUND4(R4,R5,R22,R3,M06,0xa3014314,15);
+	ROUND4(R3,R4,R5,R22,M13,0x4e0811a1,21);
+	ROUND4(R22,R3,R4,R5,M04,0xf7537e82, 6);
+	ROUND4(R5,R22,R3,R4,M11,0xbd3af235,10);
+	ROUND4(R4,R5,R22,R3,M02,0x2ad7d2bb,15);
+	ROUND4(R3,R4,R5,R22,M09,0xeb86d391,21);
 
 	ADD	R14, R22
 	ADD	R15, R3
 	ADD	R16, R4
 	ADD	R17, R5
 
 	ADD	$64, R6
-	CMP	R6, R7
-	BLT	loop
+	BC	16, 0, loop // bdnz
+
 end:
 	MOVD	dig+0(FP), R10
@@ -198,4 +208,5 @@ end:
 	MOVWZ	R3, 4(R10)
 	MOVWZ	R4, 8(R10)
 	MOVWZ	R5, 12(R10)
+
 	RET
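
For readers less familiar with the assembly, the structure of the new
loop matches the generic Go implementation of MD5: save the state,
load all 16 message words once per 64-byte block, run the 64 rounds,
then add the saved state back in. A simplified Go sketch follows
(illustrative only; round1, block, and the elided rounds 5-64 are
hypothetical names, not code from this change):

	import (
		"encoding/binary"
		"math/bits"
	)

	// round1 performs one round-1 step: a += F(b,c,d) + m + k, rotate
	// left by s, then add b -- the same add/rotate/add sequence as the
	// new ROUND1 macro.
	func round1(a, b, c, d, m, k uint32, s int) uint32 {
		a += ((b & c) | (^b & d)) + m + k
		return b + bits.RotateLeft32(a, s)
	}

	func block(dig *[4]uint32, p []byte) {
		a, b, c, d := dig[0], dig[1], dig[2], dig[3]
		for len(p) >= 64 {
			aa, bb, cc, dd := a, b, c, d

			// Load each message word once per block, as the
			// ENDIAN_MOVE sequences above now do.
			var m [16]uint32
			for i := range m {
				m[i] = binary.LittleEndian.Uint32(p[i*4:])
			}

			// First four of the 64 rounds; the other 60 follow
			// the constant/shift tables visible in the diff.
			a = round1(a, b, c, d, m[0], 0xd76aa478, 7)
			d = round1(d, a, b, c, m[1], 0xe8c7b756, 12)
			c = round1(c, d, a, b, m[2], 0x242070db, 17)
			b = round1(b, c, d, a, m[3], 0xc1bdceee, 22)
			// ... rounds 5-64 elided ...

			a, b, c, d = a+aa, b+bb, c+cc, d+dd
			p = p[64:]
		}
		dig[0], dig[1], dig[2], dig[3] = a, b, c, d
	}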