crypto/sha1: use const table for key loading on loong64

On loong64, load the four round constants once from a static read-only table into dedicated registers before the main loop, rather than encoding them as immediate operands in every round.

Benchmark for Loongson-3A5000:
goos: linux
goarch: loong64
pkg: crypto/sha1
cpu: Loongson-3A5000-HV @ 2500.00MHz
                   │     old      │                new                │
                   │    sec/op    │   sec/op     vs base              │
Hash8Bytes/New-4      235.9n ± 0%   229.1n ± 0%  -2.88% (p=0.000 n=8)
Hash8Bytes/Sum-4      1.892µ ± 0%   1.882µ ± 0%  -0.50% (p=0.000 n=8)
Hash320Bytes/New-4   1022.0n ± 0%   963.8n ± 0%  -5.70% (p=0.000 n=8)
Hash320Bytes/Sum-4   1037.0n ± 0%   981.1n ± 0%  -5.39% (p=0.000 n=8)
Hash1K/New-4          2.760µ ± 0%   2.594µ ± 0%  -6.01% (p=0.000 n=8)
Hash1K/Sum-4          2.775µ ± 0%   2.610µ ± 0%  -5.95% (p=0.000 n=8)
Hash8K/New-4          20.46µ ± 0%   19.20µ ± 0%  -6.17% (p=0.000 n=8)
Hash8K/Sum-4          20.49µ ± 0%   19.22µ ± 0%  -6.17% (p=0.000 n=8)
geomean               2.498µ        2.377µ       -4.87%

                   │     old      │                new                 │
                   │     B/s      │     B/s       vs base              │
Hash8Bytes/New-4     32.34Mi ± 0%   33.30Mi ± 0%  +2.98% (p=0.000 n=8)
Hash8Bytes/Sum-4     4.034Mi ± 0%   4.053Mi ± 0%  +0.47% (p=0.000 n=8)
Hash320Bytes/New-4   298.7Mi ± 0%   316.7Mi ± 0%  +6.02% (p=0.000 n=8)
Hash320Bytes/Sum-4   294.3Mi ± 0%   311.0Mi ± 0%  +5.69% (p=0.000 n=8)
Hash1K/New-4         353.8Mi ± 0%   376.5Mi ± 0%  +6.41% (p=0.000 n=8)
Hash1K/Sum-4         351.9Mi ± 0%   374.1Mi ± 0%  +6.31% (p=0.000 n=8)
Hash8K/New-4         381.8Mi ± 0%   406.9Mi ± 0%  +6.57% (p=0.000 n=8)
Hash8K/Sum-4         381.4Mi ± 0%   406.4Mi ± 0%  +6.58% (p=0.000 n=8)
geomean              146.1Mi        153.6Mi       +5.11%

Change-Id: I7305caefa1434ab2bb4ce94a1c789d4ee5b7ccf3
Reviewed-on: https://go-review.googlesource.com/c/go/+/732580
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
This commit is contained in:
Julian Zhu 2025-12-24 21:16:56 +08:00
parent acd65ebb13
commit d99be5c444

View file

@ -26,6 +26,10 @@
// Scratch registers shared by the round macros. As used by MIX:
// REGTMP1 carries f, REGTMP2 receives a rotated left by 5, and
// REGTMP3 carries the current message word w[i].
#define REGTMP1 R17
#define REGTMP2 R18
#define REGTMP3 R19
// Registers holding the four round constants. They are loaded once
// from the ·_K table before the main loop and are only read by MIX.
#define KEYREG1 R25
#define KEYREG2 R26
#define KEYREG3 R27
#define KEYREG4 R28
#define LOAD1(index) \
MOVW (index*4)(R5), REGTMP3; \
@ -63,38 +67,38 @@
#define FUNC4 FUNC2
// MIX completes one round. It rotates b in place (left by 30, per the
// inline comment) and accumulates into e: f (expected in REGTMP1),
// the message word w[i] (expected in REGTMP3), the round constant k,
// and a rotated left by 5 (computed into REGTMP2).
// Parameters c and d are accepted for symmetry with the ROUND macros
// but are not referenced by the body.
#define MIX(a, b, c, d, e, const) \
#define MIX(a, b, c, d, e, key) \
ROTR $2, b; \ // b << 30
ADD REGTMP1, e; \ // e = e + f
ROTR $27, a, REGTMP2; \ // a << 5
ADD REGTMP3, e; \ // e = e + w[i]
ADDV $const, e; \ // e = e + k
ADDV key, e; \ // e = e + k
ADD REGTMP2, e // e = e + a<<5
// ROUND1 performs one round for message word `index`: LOAD1 fetches
// the word, FUNC1 evaluates the round function, and MIX folds the
// results into e with the first round constant (0x5A827999, KEYREG1).
#define ROUND1(a, b, c, d, e, index) \
LOAD1(index); \
FUNC1(a, b, c, d, e); \
MIX(a, b, c, d, e, 0x5A827999)
MIX(a, b, c, d, e, KEYREG1)
// ROUND1x is ROUND1 with the message word obtained through LOAD
// instead of LOAD1 (LOAD is defined outside this excerpt). It uses
// FUNC1 and the first round constant (0x5A827999, KEYREG1).
#define ROUND1x(a, b, c, d, e, index) \
LOAD(index); \
FUNC1(a, b, c, d, e); \
MIX(a, b, c, d, e, 0x5A827999)
MIX(a, b, c, d, e, KEYREG1)
// ROUND2 performs one round using LOAD for the message word, FUNC2 as
// the round function, and the second round constant
// (0x6ED9EBA1, KEYREG2).
#define ROUND2(a, b, c, d, e, index) \
LOAD(index); \
FUNC2(a, b, c, d, e); \
MIX(a, b, c, d, e, 0x6ED9EBA1)
MIX(a, b, c, d, e, KEYREG2)
// ROUND3 performs one round using LOAD for the message word, FUNC3 as
// the round function, and the third round constant
// (0x8F1BBCDC, KEYREG3).
#define ROUND3(a, b, c, d, e, index) \
LOAD(index); \
FUNC3(a, b, c, d, e); \
MIX(a, b, c, d, e, 0x8F1BBCDC)
MIX(a, b, c, d, e, KEYREG3)
// ROUND4 performs one round using LOAD for the message word, FUNC4
// (an alias of FUNC2) as the round function, and the fourth round
// constant (0xCA62C1D6, KEYREG4).
#define ROUND4(a, b, c, d, e, index) \
LOAD(index); \
FUNC4(a, b, c, d, e); \
MIX(a, b, c, d, e, 0xCA62C1D6)
MIX(a, b, c, d, e, KEYREG4)
// A stack frame size of 64 bytes is required here, because
// the frame size used for data expansion is 64 bytes.
@ -108,13 +112,19 @@ TEXT ·block(SB),NOSPLIT,$64-32
BEQ R6, zero
// p_len >= 64
ADDV R5, R6, R24
ADDV R5, R6, R24
MOVW (0*4)(R4), R7
MOVW (1*4)(R4), R8
MOVW (2*4)(R4), R9
MOVW (3*4)(R4), R10
MOVW (4*4)(R4), R11
MOVV $·_K(SB), R21
MOVW (0*4)(R21), KEYREG1
MOVW (1*4)(R21), KEYREG2
MOVW (2*4)(R21), KEYREG3
MOVW (3*4)(R21), KEYREG4
loop:
MOVW R7, R12
MOVW R8, R13
@ -224,3 +234,9 @@ end:
MOVW R11, (4*4)(R4)
zero:
RET
// ·_K is a 16-byte read-only table holding the four 32-bit round
// constants, in the order used by ROUND1..ROUND4. ·block reads each
// entry once into KEYREG1..KEYREG4 before entering its main loop.
GLOBL ·_K(SB),RODATA,$16
DATA ·_K+0(SB)/4, $0x5A827999
DATA ·_K+4(SB)/4, $0x6ED9EBA1
DATA ·_K+8(SB)/4, $0x8F1BBCDC
DATA ·_K+12(SB)/4, $0xCA62C1D6