mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
crypto/internal/fips140/bigmod: vector implementation of addMulVVWx on s390x
The addMulVVWx assembly routine multiplies a bignum multiplicand by a 64-bit multiplier and adds the product to the result.
The new implementation for the s390x architecture uses an algorithm based on vector instructions,
which yields a significant performance improvement.
Note: z13 is the minimum architecture level that Go supports on s390x, and z13 already provides VX (vector) support.
The performance improvement is shown below:
goos: linux
goarch: s390x
pkg: crypto/internal/fips140/bigmod
Orig.txt Vector_Patch.txt
sec/op sec/op vs base
ModAdd 164.1n ± 0% 159.7n ± 0% -2.7% (p=0.000 n=10)
ModSub 152.3n ± 1% 147.3n ± 0% -3.25% (p=0.000 n=10)
MontgomeryRepr 4.806µ ± 3% 1.829µ ± 0% -61.94% (p=0.000 n=10)
MontgomeryMul 4.812µ ± 5% 1.834µ ± 0% -61.90% (p=0.000 n=10)
ModMul 9.646µ ± 3% 3.698µ ± 0% -61.67% (p=0.000 n=10)
ExpBig 11.28m ± 0% 11.28m ± 0% +0.04% (p=0.035 n=10)
Exp 12.284m ± 5% 5.004m ± 1% -59.26% (p=0.000 n=10)
geomean 18.61µ 10.74µ -42.2%
Change-Id: I679944c9dac9f43f1626b018f72efa6da0d2442d
Cq-Include-Trybots: luci.golang.try:gotip-linux-s390x
Reviewed-on: https://go-review.googlesource.com/c/go/+/716480
Auto-Submit: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Vishwanatha HD <vishwanatha.hd@ibm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Roland Shoemaker <roland@golang.org>
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Srinivas Pokala <Pokala.Srinivas@ibm.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
parent
657b331ff5
commit
6e4a0d8e44
1 changed files with 130 additions and 55 deletions
|
|
@ -4,82 +4,157 @@
|
|||
|
||||
//go:build !purego
|
||||
|
||||
// Register usage (z13 convention):
|
||||
// R2 = rp (result pointer)
|
||||
// R3 = ap (source pointer)
|
||||
// R4 = an / idx (loop counter)
|
||||
// R5 = b0 (multiplier limb)
|
||||
// R6 = cy (carry)
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// func addMulVVW1024(z, x *uint, y uint) (c uint)
|
||||
TEXT ·addMulVVW1024(SB), $0-32
|
||||
MOVD $16, R5
|
||||
JMP addMulVVWx(SB)
|
||||
MOVD $16, R4
|
||||
JMP addMulVVWx(SB)
|
||||
|
||||
// func addMulVVW1536(z, x *uint, y uint) (c uint)
|
||||
TEXT ·addMulVVW1536(SB), $0-32
|
||||
MOVD $24, R5
|
||||
JMP addMulVVWx(SB)
|
||||
MOVD $24, R4
|
||||
JMP addMulVVWx(SB)
|
||||
|
||||
// func addMulVVW2048(z, x *uint, y uint) (c uint)
|
||||
TEXT ·addMulVVW2048(SB), $0-32
|
||||
MOVD $32, R5
|
||||
JMP addMulVVWx(SB)
|
||||
MOVD $32, R4
|
||||
JMP addMulVVWx(SB)
|
||||
|
||||
TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0
|
||||
MOVD z+0(FP), R2
|
||||
MOVD x+8(FP), R8
|
||||
MOVD y+16(FP), R9
|
||||
MOVD x+8(FP), R3
|
||||
MOVD y+16(FP), R5
|
||||
|
||||
MOVD $0, R1 // i*8 = 0
|
||||
MOVD $0, R7 // i = 0
|
||||
MOVD $0, R0 // make sure it's zero
|
||||
MOVD $0, R4 // c = 0
|
||||
MOVD $0, R6
|
||||
|
||||
MOVD R5, R12
|
||||
AND $-2, R12
|
||||
CMPBGE R5, $2, A6
|
||||
BR E6
|
||||
L_ent:
|
||||
VZERO V0
|
||||
VZERO V2
|
||||
SRD $2, R4, R10
|
||||
TMLL R4, $1
|
||||
BRC $8, L_bx0
|
||||
|
||||
A6:
|
||||
MOVD (R8)(R1*1), R6
|
||||
MULHDU R9, R6
|
||||
MOVD (R2)(R1*1), R10
|
||||
ADDC R10, R11 // add to low order bits
|
||||
ADDE R0, R6
|
||||
ADDC R4, R11
|
||||
ADDE R0, R6
|
||||
MOVD R6, R4
|
||||
MOVD R11, (R2)(R1*1)
|
||||
L_bx1:
|
||||
VLEG $1, 0(R2), V2
|
||||
VZERO V4
|
||||
TMLL R4, $2
|
||||
BRC $7, L_b11
|
||||
|
||||
MOVD (8)(R8)(R1*1), R6
|
||||
MULHDU R9, R6
|
||||
MOVD (8)(R2)(R1*1), R10
|
||||
ADDC R10, R11 // add to low order bits
|
||||
ADDE R0, R6
|
||||
ADDC R4, R11
|
||||
ADDE R0, R6
|
||||
MOVD R6, R4
|
||||
MOVD R11, (8)(R2)(R1*1)
|
||||
L_b01:
|
||||
MOVD $-24, R4
|
||||
MOVD R6, R0
|
||||
MOVD 0(R3), R7
|
||||
MLGR R5, R6
|
||||
ADDC R0, R7
|
||||
MOVD $0, R0
|
||||
ADDE R0, R6
|
||||
VLVGG $1, R7, V4
|
||||
VAQ V2, V4, V2
|
||||
VSTEG $1, V2, 0(R2)
|
||||
VMRHG V2, V2, V2
|
||||
CMPBEQ R10, $0, L_1
|
||||
BR L_cj0
|
||||
|
||||
ADD $16, R1 // i*8 + 8
|
||||
ADD $2, R7 // i++
|
||||
L_b11:
|
||||
MOVD $-8, R4
|
||||
MOVD 0(R3), R9
|
||||
MLGR R5, R8
|
||||
ADDC R6, R9
|
||||
MOVD $0, R6
|
||||
ADDE R6, R8
|
||||
VLVGG $1, R9, V4
|
||||
VAQ V2, V4, V2
|
||||
VSTEG $1, V2, 0(R2)
|
||||
VMRHG V2, V2, V2
|
||||
BR L_cj1
|
||||
|
||||
CMPBLT R7, R12, A6
|
||||
BR E6
|
||||
L_bx0:
|
||||
TMLL R4, $2
|
||||
BRC $7, L_b10
|
||||
|
||||
L6:
|
||||
// TODO: drop unused single-step loop.
|
||||
MOVD (R8)(R1*1), R6
|
||||
MULHDU R9, R6
|
||||
MOVD (R2)(R1*1), R10
|
||||
ADDC R10, R11 // add to low order bits
|
||||
ADDE R0, R6
|
||||
ADDC R4, R11
|
||||
ADDE R0, R6
|
||||
MOVD R6, R4
|
||||
MOVD R11, (R2)(R1*1)
|
||||
L_b00:
|
||||
MOVD $-32, R4
|
||||
|
||||
ADD $8, R1 // i*8 + 8
|
||||
ADD $1, R7 // i++
|
||||
L_cj0:
|
||||
MOVD 32(R3)(R4), R1
|
||||
MOVD 40(R3)(R4), R9
|
||||
MLGR R5, R0
|
||||
MLGR R5, R8
|
||||
VL 32(R4)(R2), V1
|
||||
VPDI $4, V1, V1, V1
|
||||
VLVGP R0, R1, V6
|
||||
VLVGP R9, R6, V7
|
||||
BR L_mid
|
||||
|
||||
E6:
|
||||
CMPBLT R7, R5, L6 // i < n
|
||||
L_b10:
|
||||
MOVD $-16, R4
|
||||
MOVD R6, R8
|
||||
|
||||
MOVD R4, c+24(FP)
|
||||
L_cj1:
|
||||
MOVD 16(R4)(R3), R1
|
||||
MOVD 24(R4)(R3), R7
|
||||
MLGR R5, R0
|
||||
MLGR R5, R6
|
||||
VL 16(R4)(R2), V1
|
||||
VPDI $4, V1, V1, V1
|
||||
VLVGP R0, R1, V6
|
||||
VLVGP R7, R8, V7
|
||||
CMPBEQ R10, $0, L_end
|
||||
|
||||
L_top:
|
||||
MOVD 32(R4)(R3), R1
|
||||
MOVD 40(R4)(R3), R9
|
||||
MLGR R5, R0
|
||||
MLGR R5, R8
|
||||
VACQ V6, V1, V0, V5
|
||||
VACCCQ V6, V1, V0, V0
|
||||
VACQ V5, V7, V2, V3
|
||||
VACCCQ V5, V7, V2, V2
|
||||
VPDI $4, V3, V3, V3
|
||||
VL 32(R4)(R2), V1
|
||||
VPDI $4, V1, V1, V1
|
||||
VST V3, 16(R4)(R2)
|
||||
VLVGP R0, R1, V6
|
||||
VLVGP R9, R6, V7
|
||||
|
||||
L_mid:
|
||||
MOVD 48(R4)(R3), R1
|
||||
MOVD 56(R4)(R3), R7
|
||||
MLGR R5, R0
|
||||
MLGR R5, R6
|
||||
VACQ V6, V1, V0, V5
|
||||
VACCCQ V6, V1, V0, V0
|
||||
VACQ V5, V7, V2, V3
|
||||
VACCCQ V5, V7, V2, V2
|
||||
VPDI $4, V3, V3, V3
|
||||
VL 48(R4)(R2), V1
|
||||
VPDI $4, V1, V1, V1
|
||||
VST V3, 32(R4)(R2)
|
||||
VLVGP R0, R1, V6
|
||||
VLVGP R7, R8, V7
|
||||
MOVD $32(R4), R4
|
||||
BRCTG R10, L_top
|
||||
|
||||
L_end:
|
||||
VACQ V6, V1, V0, V5
|
||||
VACCCQ V6, V1, V0, V0
|
||||
VACQ V5, V7, V2, V3
|
||||
VACCCQ V5, V7, V2, V2
|
||||
VPDI $4, V3, V3, V3
|
||||
VST V3, 16(R2)(R4)
|
||||
VAG V0, V2, V2
|
||||
|
||||
L_1:
|
||||
VLGVG $1, V2, R2
|
||||
ADDC R6, R2
|
||||
MOVD R2, c+24(FP)
|
||||
RET
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue