crypto/internal/fips140/bigmod: vector implementation of addMulVVWx on s390x

The addMulVVWx assembly routine multiplies a multi-limb (bignum) multiplicand by a
64-bit multiplier and accumulates the product into the result. The new implementation
for the s390x architecture uses an algorithm based on vector instructions and delivers
a significant performance improvement.
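
For reference, the operation these routines implement is a limb-wise multiply-accumulate,
z += x*y, returning the final carry. Below is a minimal Go sketch of that behavior
(illustrative only; the function and package names are placeholders, and this is not the
exact code in the bigmod package):

	package sketch

	import "math/bits"

	// addMulVVW adds x*y to z in place, one 64-bit limb at a time, and
	// returns the final carry. The assembly routines compute this for
	// fixed sizes of 16, 24, and 32 limbs (1024, 1536, 2048 bits).
	func addMulVVW(z, x []uint64, y uint64) (carry uint64) {
		for i := range z {
			hi, lo := bits.Mul64(x[i], y)      // 128-bit product x[i]*y
			lo, c1 := bits.Add64(lo, z[i], 0)  // add the existing limb of z
			lo, c2 := bits.Add64(lo, carry, 0) // add the carry from the previous limb
			z[i] = lo
			// z[i] + carry + x[i]*y always fits in 128 bits,
			// so hi+c1+c2 cannot overflow a 64-bit word.
			carry = hi + c1 + c2
		}
		return carry
	}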

Note: z13 is the minimum s390x architecture level for Go, and it already has VX support.

The performance improvement is shown below:

goos: linux
goarch: s390x
pkg: crypto/internal/fips140/bigmod
                  Orig.txt       Vector_Patch.txt
                   sec/op         sec/op        vs base
ModAdd            164.1n ± 0%    159.7n ± 0%     -2.7% (p=0.000 n=10)
ModSub            152.3n ± 1%    147.3n ± 0%    -3.25% (p=0.000 n=10)
MontgomeryRepr    4.806µ ± 3%    1.829µ ± 0%   -61.94% (p=0.000 n=10)
MontgomeryMul     4.812µ ± 5%    1.834µ ± 0%   -61.90% (p=0.000 n=10)
ModMul            9.646µ ± 3%    3.698µ ± 0%   -61.67% (p=0.000 n=10)
ExpBig            11.28m ± 0%    11.28m ± 0%    +0.04% (p=0.035 n=10)
Exp              12.284m ± 5%    5.004m ± 1%   -59.26% (p=0.000 n=10)
geomean           18.61µ         10.74µ         -42.2%

Change-Id: I679944c9dac9f43f1626b018f72efa6da0d2442d
Cq-Include-Trybots: luci.golang.try:gotip-linux-s390x
Reviewed-on: https://go-review.googlesource.com/c/go/+/716480
Auto-Submit: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Vishwanatha HD <vishwanatha.hd@ibm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Roland Shoemaker <roland@golang.org>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Srinivas Pokala <Pokala.Srinivas@ibm.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

Author: kmvijay
Committed: 2025-10-30 14:50:14 +00:00 by Gopher Robot
Commit: 6e4a0d8e44 (parent 657b331ff5)

@@ -4,82 +4,157 @@
 //go:build !purego
 
+// Register usage (z13 convention):
+// R2 = rp (result pointer)
+// R3 = ap (source pointer)
+// R4 = an / idx (loop counter)
+// R5 = b0 (multiplier limb)
+// R6 = cy (carry)
+
 #include "textflag.h"
 
 // func addMulVVW1024(z, x *uint, y uint) (c uint)
 TEXT ·addMulVVW1024(SB), $0-32
-	MOVD	$16, R5
+	MOVD	$16, R4
 	JMP	addMulVVWx(SB)
 
 // func addMulVVW1536(z, x *uint, y uint) (c uint)
 TEXT ·addMulVVW1536(SB), $0-32
-	MOVD	$24, R5
+	MOVD	$24, R4
 	JMP	addMulVVWx(SB)
 
 // func addMulVVW2048(z, x *uint, y uint) (c uint)
 TEXT ·addMulVVW2048(SB), $0-32
-	MOVD	$32, R5
+	MOVD	$32, R4
 	JMP	addMulVVWx(SB)
 
 TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0
 	MOVD	z+0(FP), R2
-	MOVD	x+8(FP), R8
-	MOVD	y+16(FP), R9
-	MOVD	$0, R1 // i*8 = 0
-	MOVD	$0, R7 // i = 0
-	MOVD	$0, R0 // make sure it's zero
-	MOVD	$0, R4 // c = 0
-
-	MOVD	R5, R12
-	AND	$-2, R12
-	CMPBGE	R5, $2, A6
-	BR	E6
-
-A6:
-	MOVD	(R8)(R1*1), R6
-	MULHDU	R9, R6
-	MOVD	(R2)(R1*1), R10
-	ADDC	R10, R11 // add to low order bits
-	ADDE	R0, R6
-	ADDC	R4, R11
-	ADDE	R0, R6
-	MOVD	R6, R4
-	MOVD	R11, (R2)(R1*1)
-
-	MOVD	(8)(R8)(R1*1), R6
-	MULHDU	R9, R6
-	MOVD	(8)(R2)(R1*1), R10
-	ADDC	R10, R11 // add to low order bits
-	ADDE	R0, R6
-	ADDC	R4, R11
-	ADDE	R0, R6
-	MOVD	R6, R4
-	MOVD	R11, (8)(R2)(R1*1)
-
-	ADD	$16, R1 // i*8 + 8
-	ADD	$2, R7 // i++
-
-	CMPBLT	R7, R12, A6
-	BR	E6
-
-L6:
-	MOVD	(R8)(R1*1), R6
-	MULHDU	R9, R6
-	MOVD	(R2)(R1*1), R10
-	ADDC	R10, R11 // add to low order bits
-	ADDE	R0, R6
-	ADDC	R4, R11
-	ADDE	R0, R6
-	MOVD	R6, R4
-	MOVD	R11, (R2)(R1*1)
-
-	ADD	$8, R1 // i*8 + 8
-	ADD	$1, R7 // i++
-
-E6:
-	CMPBLT	R7, R5, L6 // i < n
-
-	MOVD	R4, c+24(FP)
+	MOVD	x+8(FP), R3
+	MOVD	y+16(FP), R5
+	MOVD	$0, R6
+
+L_ent:
+	VZERO	V0
+	VZERO	V2
+	SRD	$2, R4, R10
+	TMLL	R4, $1
+	BRC	$8, L_bx0
+
+L_bx1:
+	VLEG	$1, 0(R2), V2
+	VZERO	V4
+	TMLL	R4, $2
+	BRC	$7, L_b11
+
+L_b01:
+	MOVD	$-24, R4
+	MOVD	R6, R0
+	MOVD	0(R3), R7
+	MLGR	R5, R6
+	ADDC	R0, R7
+	MOVD	$0, R0
+	ADDE	R0, R6
+	VLVGG	$1, R7, V4
+	VAQ	V2, V4, V2
+	VSTEG	$1, V2, 0(R2)
+	VMRHG	V2, V2, V2
+	CMPBEQ	R10, $0, L_1
+	BR	L_cj0
+
+L_b11:
+	MOVD	$-8, R4
+	MOVD	0(R3), R9
+	MLGR	R5, R8
+	ADDC	R6, R9
+	MOVD	$0, R6
+	ADDE	R6, R8
+	VLVGG	$1, R9, V4
+	VAQ	V2, V4, V2
+	VSTEG	$1, V2, 0(R2)
+	VMRHG	V2, V2, V2
+	BR	L_cj1
+
+L_bx0:
+	TMLL	R4, $2
+	BRC	$7, L_b10
+
+L_b00:
+	MOVD	$-32, R4
+
+L_cj0:
+	MOVD	32(R3)(R4), R1
+	MOVD	40(R3)(R4), R9
+	MLGR	R5, R0
+	MLGR	R5, R8
+	VL	32(R4)(R2), V1
+	VPDI	$4, V1, V1, V1
+	VLVGP	R0, R1, V6
+	VLVGP	R9, R6, V7
+	BR	L_mid
+
+L_b10:
+	MOVD	$-16, R4
+	MOVD	R6, R8
+
+L_cj1:
+	MOVD	16(R4)(R3), R1
+	MOVD	24(R4)(R3), R7
+	MLGR	R5, R0
+	MLGR	R5, R6
+	VL	16(R4)(R2), V1
+	VPDI	$4, V1, V1, V1
+	VLVGP	R0, R1, V6
+	VLVGP	R7, R8, V7
+	CMPBEQ	R10, $0, L_end
+
+L_top:
+	MOVD	32(R4)(R3), R1
+	MOVD	40(R4)(R3), R9
+	MLGR	R5, R0
+	MLGR	R5, R8
+	VACQ	V6, V1, V0, V5
+	VACCCQ	V6, V1, V0, V0
+	VACQ	V5, V7, V2, V3
+	VACCCQ	V5, V7, V2, V2
+	VPDI	$4, V3, V3, V3
+	VL	32(R4)(R2), V1
+	VPDI	$4, V1, V1, V1
+	VST	V3, 16(R4)(R2)
+	VLVGP	R0, R1, V6
+	VLVGP	R9, R6, V7
+
+L_mid:
+	MOVD	48(R4)(R3), R1
+	MOVD	56(R4)(R3), R7
+	MLGR	R5, R0
+	MLGR	R5, R6
+	VACQ	V6, V1, V0, V5
+	VACCCQ	V6, V1, V0, V0
+	VACQ	V5, V7, V2, V3
+	VACCCQ	V5, V7, V2, V2
+	VPDI	$4, V3, V3, V3
+	VL	48(R4)(R2), V1
+	VPDI	$4, V1, V1, V1
+	VST	V3, 32(R4)(R2)
+	VLVGP	R0, R1, V6
+	VLVGP	R7, R8, V7
+	MOVD	$32(R4), R4
+	BRCTG	R10, L_top
+
+L_end:
+	VACQ	V6, V1, V0, V5
+	VACCCQ	V6, V1, V0, V0
+	VACQ	V5, V7, V2, V3
+	VACCCQ	V5, V7, V2, V2
+	VPDI	$4, V3, V3, V3
+	VST	V3, 16(R2)(R4)
+	VAG	V0, V2, V2
+
+L_1:
+	VLGVG	$1, V2, R2
+	ADDC	R6, R2
+	MOVD	R2, c+24(FP)
 	RET
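
For readers less familiar with the z/Architecture instructions used above: MLGR produces
the full 128-bit product of two 64-bit unsigned values in an even/odd register pair, while
VACQ performs a 128-bit (quadword) addition with a carry-in and VACCCQ computes the
corresponding carry-out, which is how the loop appears to chain carries across the four
limbs it processes per iteration. A rough Go model of that 128-bit add-with-carry primitive
(illustrative only; the names are placeholders and this is not code from this change):

	package sketch

	import "math/bits"

	// add128 models a quadword add with carry, roughly the combination of
	// VACQ (which produces the 128-bit sum) and VACCCQ (which produces the
	// carry-out). Each 128-bit value is split into 64-bit halves; carryIn
	// and carryOut are 0 or 1.
	func add128(aHi, aLo, bHi, bLo, carryIn uint64) (sumHi, sumLo, carryOut uint64) {
		var c uint64
		sumLo, c = bits.Add64(aLo, bLo, carryIn)
		sumHi, carryOut = bits.Add64(aHi, bHi, c)
		return sumHi, sumLo, carryOut
	}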