crypto/internal/fips140/bigmod: vector implementation of addMulVVWx on s390x

The addMulVVWx assembly routine is used to multiply a bignum multiplicand by a 64-bit multiplier.
The new implementation for s390x architecture uses an algorithm based on vector instructions,
with a significant performance improvement.

Note: z13 is the minimum architecture for Go, which already has VX support.

The performance improvement is as below:

goos: linux
goarch: s390x
pkg: crypto/internal/fips140/bigmod
                  Orig.txt       Vector_Patch.txt
                   sec/op             sec/op          vs base
ModAdd          164.1n ± 0%   159.7n ± 0%     -2.7% (p=0.000 n=10)
ModSub          152.3n ± 1%   147.3n ± 0%    -3.25% (p=0.000 n=10)
MontgomeryRepr  4.806µ ± 3%   1.829µ ± 0%   -61.94% (p=0.000 n=10)
MontgomeryMul   4.812µ ± 5%   1.834µ ± 0%   -61.90% (p=0.000 n=10)
ModMul          9.646µ ± 3%   3.698µ ± 0%   -61.67% (p=0.000 n=10)
ExpBig          11.28m ± 0%   11.28m ± 0%    +0.04% (p=0.035 n=10)
Exp             12.284m ± 5%  5.004m ± 1%   -59.26% (p=0.000 n=10)
geomean         18.61µ        10.74µ        -42.2%

Change-Id: I679944c9dac9f43f1626b018f72efa6da0d2442d
Cq-Include-Trybots: luci.golang.try:gotip-linux-s390x
Reviewed-on: https://go-review.googlesource.com/c/go/+/716480
Auto-Submit: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Vishwanatha HD <vishwanatha.hd@ibm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Roland Shoemaker <roland@golang.org>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Srinivas Pokala <Pokala.Srinivas@ibm.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
kmvijay 2025-10-30 14:50:14 +00:00 committed by Gopher Robot
parent 657b331ff5
commit 6e4a0d8e44

View file

@ -4,82 +4,157 @@
//go:build !purego //go:build !purego
// Register usage (z13 convention):
// R2 = rp (result pointer)
// R3 = ap (source pointer)
// R4 = an / idx (loop counter)
// R5 = b0 (multiplier limb)
// R6 = cy (carry)
#include "textflag.h" #include "textflag.h"
// func addMulVVW1024(z, x *uint, y uint) (c uint) // func addMulVVW1024(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW1024(SB), $0-32 TEXT ·addMulVVW1024(SB), $0-32
MOVD $16, R5 MOVD $16, R4
JMP addMulVVWx(SB) JMP addMulVVWx(SB)
// func addMulVVW1536(z, x *uint, y uint) (c uint) // func addMulVVW1536(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW1536(SB), $0-32 TEXT ·addMulVVW1536(SB), $0-32
MOVD $24, R5 MOVD $24, R4
JMP addMulVVWx(SB) JMP addMulVVWx(SB)
// func addMulVVW2048(z, x *uint, y uint) (c uint) // func addMulVVW2048(z, x *uint, y uint) (c uint)
TEXT ·addMulVVW2048(SB), $0-32 TEXT ·addMulVVW2048(SB), $0-32
MOVD $32, R5 MOVD $32, R4
JMP addMulVVWx(SB) JMP addMulVVWx(SB)
TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0 TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0
MOVD z+0(FP), R2 MOVD z+0(FP), R2
MOVD x+8(FP), R8 MOVD x+8(FP), R3
MOVD y+16(FP), R9 MOVD y+16(FP), R5
MOVD $0, R1 // i*8 = 0 MOVD $0, R6
MOVD $0, R7 // i = 0
MOVD $0, R0 // make sure it's zero
MOVD $0, R4 // c = 0
MOVD R5, R12 L_ent:
AND $-2, R12 VZERO V0
CMPBGE R5, $2, A6 VZERO V2
BR E6 SRD $2, R4, R10
TMLL R4, $1
BRC $8, L_bx0
A6: L_bx1:
MOVD (R8)(R1*1), R6 VLEG $1, 0(R2), V2
MULHDU R9, R6 VZERO V4
MOVD (R2)(R1*1), R10 TMLL R4, $2
ADDC R10, R11 // add to low order bits BRC $7, L_b11
L_b01:
MOVD $-24, R4
MOVD R6, R0
MOVD 0(R3), R7
MLGR R5, R6
ADDC R0, R7
MOVD $0, R0
ADDE R0, R6 ADDE R0, R6
ADDC R4, R11 VLVGG $1, R7, V4
ADDE R0, R6 VAQ V2, V4, V2
MOVD R6, R4 VSTEG $1, V2, 0(R2)
MOVD R11, (R2)(R1*1) VMRHG V2, V2, V2
CMPBEQ R10, $0, L_1
BR L_cj0
MOVD (8)(R8)(R1*1), R6 L_b11:
MULHDU R9, R6 MOVD $-8, R4
MOVD (8)(R2)(R1*1), R10 MOVD 0(R3), R9
ADDC R10, R11 // add to low order bits MLGR R5, R8
ADDE R0, R6 ADDC R6, R9
ADDC R4, R11 MOVD $0, R6
ADDE R0, R6 ADDE R6, R8
MOVD R6, R4 VLVGG $1, R9, V4
MOVD R11, (8)(R2)(R1*1) VAQ V2, V4, V2
VSTEG $1, V2, 0(R2)
VMRHG V2, V2, V2
BR L_cj1
ADD $16, R1 // i*8 + 8 L_bx0:
ADD $2, R7 // i++ TMLL R4, $2
BRC $7, L_b10
CMPBLT R7, R12, A6 L_b00:
BR E6 MOVD $-32, R4
L6: L_cj0:
// TODO: drop unused single-step loop. MOVD 32(R3)(R4), R1
MOVD (R8)(R1*1), R6 MOVD 40(R3)(R4), R9
MULHDU R9, R6 MLGR R5, R0
MOVD (R2)(R1*1), R10 MLGR R5, R8
ADDC R10, R11 // add to low order bits VL 32(R4)(R2), V1
ADDE R0, R6 VPDI $4, V1, V1, V1
ADDC R4, R11 VLVGP R0, R1, V6
ADDE R0, R6 VLVGP R9, R6, V7
MOVD R6, R4 BR L_mid
MOVD R11, (R2)(R1*1)
ADD $8, R1 // i*8 + 8 L_b10:
ADD $1, R7 // i++ MOVD $-16, R4
MOVD R6, R8
E6: L_cj1:
CMPBLT R7, R5, L6 // i < n MOVD 16(R4)(R3), R1
MOVD 24(R4)(R3), R7
MLGR R5, R0
MLGR R5, R6
VL 16(R4)(R2), V1
VPDI $4, V1, V1, V1
VLVGP R0, R1, V6
VLVGP R7, R8, V7
CMPBEQ R10, $0, L_end
MOVD R4, c+24(FP) L_top:
MOVD 32(R4)(R3), R1
MOVD 40(R4)(R3), R9
MLGR R5, R0
MLGR R5, R8
VACQ V6, V1, V0, V5
VACCCQ V6, V1, V0, V0
VACQ V5, V7, V2, V3
VACCCQ V5, V7, V2, V2
VPDI $4, V3, V3, V3
VL 32(R4)(R2), V1
VPDI $4, V1, V1, V1
VST V3, 16(R4)(R2)
VLVGP R0, R1, V6
VLVGP R9, R6, V7
L_mid:
MOVD 48(R4)(R3), R1
MOVD 56(R4)(R3), R7
MLGR R5, R0
MLGR R5, R6
VACQ V6, V1, V0, V5
VACCCQ V6, V1, V0, V0
VACQ V5, V7, V2, V3
VACCCQ V5, V7, V2, V2
VPDI $4, V3, V3, V3
VL 48(R4)(R2), V1
VPDI $4, V1, V1, V1
VST V3, 32(R4)(R2)
VLVGP R0, R1, V6
VLVGP R7, R8, V7
MOVD $32(R4), R4
BRCTG R10, L_top
L_end:
VACQ V6, V1, V0, V5
VACCCQ V6, V1, V0, V0
VACQ V5, V7, V2, V3
VACCCQ V5, V7, V2, V2
VPDI $4, V3, V3, V3
VST V3, 16(R2)(R4)
VAG V0, V2, V2
L_1:
VLGVG $1, V2, R2
ADDC R6, R2
MOVD R2, c+24(FP)
RET RET