crypto/internal/fips140/subtle: add assembly implementation of xorBytes for mips64x

goos: linux
goarch: mips64le
pkg: crypto/subtle
                                     │  oldsubtle   │             newsubtle              │
                                     │    sec/op    │   sec/op     vs base               │
ConstantTimeByteEq-4                    5.011n ± 0%   5.014n ± 0%        ~ (p=0.110 n=8)
ConstantTimeEq-4                        3.342n ± 0%   3.342n ± 0%        ~ (p=0.993 n=8)
ConstantTimeLessOrEq-4                  4.455n ± 0%   4.458n ± 0%        ~ (p=0.182 n=8)
XORBytes/8Bytes-4                       36.48n ± 0%   26.73n ± 0%  -26.74% (p=0.000 n=8)
XORBytes/128Bytes-4                     70.21n ± 0%   50.14n ± 0%  -28.59% (p=0.000 n=8)
XORBytes/2048Bytes-4                    566.1n ± 0%   257.2n ± 0%  -54.58% (p=0.000 n=8)
XORBytes/8192Bytes-4                   2123.0n ± 0%   966.8n ± 0%  -54.46% (p=0.000 n=8)
XORBytes/32768Bytes-4                  13.740µ ± 0%   5.614µ ± 0%  -59.14% (p=0.000 n=8)
XORBytesAlignment/8Bytes0Offset-4       38.98n ± 0%   26.53n ± 0%  -31.95% (p=0.000 n=8)
XORBytesAlignment/8Bytes1Offset-4       43.27n ± 0%   26.54n ± 0%  -38.68% (p=0.000 n=8)
XORBytesAlignment/8Bytes2Offset-4       43.28n ± 0%   26.54n ± 0%  -38.69% (p=0.000 n=8)
XORBytesAlignment/8Bytes3Offset-4       43.32n ± 0%   26.54n ± 0%  -38.74% (p=0.000 n=8)
XORBytesAlignment/8Bytes4Offset-4       43.49n ± 0%   26.53n ± 0%  -38.99% (p=0.000 n=8)
XORBytesAlignment/8Bytes5Offset-4       43.53n ± 0%   26.54n ± 0%  -39.03% (p=0.000 n=8)
XORBytesAlignment/8Bytes6Offset-4       43.48n ± 0%   26.53n ± 0%  -38.98% (p=0.000 n=8)
XORBytesAlignment/8Bytes7Offset-4       43.46n ± 1%   26.53n ± 0%  -38.96% (p=0.000 n=8)
XORBytesAlignment/128Bytes0Offset-4     71.84n ± 0%   47.70n ± 1%  -33.60% (p=0.000 n=8)
XORBytesAlignment/128Bytes1Offset-4    260.60n ± 0%   59.87n ± 0%  -77.03% (p=0.000 n=8)
XORBytesAlignment/128Bytes2Offset-4    260.60n ± 0%   59.81n ± 0%  -77.05% (p=0.000 n=8)
XORBytesAlignment/128Bytes3Offset-4    260.55n ± 0%   59.89n ± 0%  -77.01% (p=0.000 n=8)
XORBytesAlignment/128Bytes4Offset-4    260.60n ± 0%   59.84n ± 0%  -77.04% (p=0.000 n=8)
XORBytesAlignment/128Bytes5Offset-4    260.70n ± 0%   59.82n ± 0%  -77.05% (p=0.000 n=8)
XORBytesAlignment/128Bytes6Offset-4    260.60n ± 0%   59.89n ± 0%  -77.02% (p=0.000 n=8)
XORBytesAlignment/128Bytes7Offset-4    260.70n ± 0%   59.85n ± 0%  -77.04% (p=0.000 n=8)
XORBytesAlignment/2048Bytes0Offset-4    552.2n ± 1%   250.0n ± 0%  -54.73% (p=0.000 n=8)
XORBytesAlignment/2048Bytes1Offset-4   3603.0n ± 0%   548.6n ± 0%  -84.77% (p=0.000 n=8)
XORBytesAlignment/2048Bytes2Offset-4   3602.0n ± 0%   548.6n ± 0%  -84.77% (p=0.000 n=8)
XORBytesAlignment/2048Bytes3Offset-4   3604.0n ± 0%   548.6n ± 0%  -84.78% (p=0.000 n=8)
XORBytesAlignment/2048Bytes4Offset-4   3603.5n ± 0%   548.9n ± 0%  -84.77% (p=0.000 n=8)
XORBytesAlignment/2048Bytes5Offset-4   3603.0n ± 0%   548.8n ± 0%  -84.77% (p=0.000 n=8)
XORBytesAlignment/2048Bytes6Offset-4   3602.0n ± 0%   548.6n ± 0%  -84.77% (p=0.000 n=8)
XORBytesAlignment/2048Bytes7Offset-4   3601.5n ± 0%   548.5n ± 0%  -84.77% (p=0.000 n=8)
geomean                                 220.0n        81.91n       -62.77%

                                     │  oldsubtle   │               newsubtle               │
                                     │     B/s      │      B/s       vs base                │
XORBytes/8Bytes-4                      209.1Mi ± 0%    285.5Mi ± 0%   +36.52% (p=0.000 n=8)
XORBytes/128Bytes-4                    1.698Gi ± 0%    2.378Gi ± 0%   +40.04% (p=0.000 n=8)
XORBytes/2048Bytes-4                   3.369Gi ± 0%    7.418Gi ± 0%  +120.17% (p=0.000 n=8)
XORBytes/8192Bytes-4                   3.594Gi ± 0%    7.892Gi ± 0%  +119.59% (p=0.000 n=8)
XORBytes/32768Bytes-4                  2.221Gi ± 0%    5.436Gi ± 0%  +144.76% (p=0.000 n=8)
XORBytesAlignment/8Bytes0Offset-4      195.7Mi ± 0%    287.6Mi ± 0%   +46.96% (p=0.000 n=8)
XORBytesAlignment/8Bytes1Offset-4      176.3Mi ± 0%    287.5Mi ± 0%   +63.06% (p=0.000 n=8)
XORBytesAlignment/8Bytes2Offset-4      176.3Mi ± 0%    287.4Mi ± 0%   +63.07% (p=0.000 n=8)
XORBytesAlignment/8Bytes3Offset-4      176.1Mi ± 0%    287.5Mi ± 0%   +63.25% (p=0.000 n=8)
XORBytesAlignment/8Bytes4Offset-4      175.5Mi ± 0%    287.6Mi ± 0%   +63.90% (p=0.000 n=8)
XORBytesAlignment/8Bytes5Offset-4      175.3Mi ± 0%    287.5Mi ± 0%   +64.02% (p=0.000 n=8)
XORBytesAlignment/8Bytes6Offset-4      175.5Mi ± 0%    287.6Mi ± 0%   +63.86% (p=0.000 n=8)
XORBytesAlignment/8Bytes7Offset-4      175.5Mi ± 0%    287.6Mi ± 0%   +63.85% (p=0.000 n=8)
XORBytesAlignment/128Bytes0Offset-4    1.659Gi ± 0%    2.499Gi ± 1%   +50.61% (p=0.000 n=8)
XORBytesAlignment/128Bytes1Offset-4    468.4Mi ± 0%   2039.0Mi ± 0%  +335.30% (p=0.000 n=8)
XORBytesAlignment/128Bytes2Offset-4    468.4Mi ± 0%   2040.9Mi ± 0%  +335.73% (p=0.000 n=8)
XORBytesAlignment/128Bytes3Offset-4    468.5Mi ± 0%   2038.1Mi ± 0%  +335.02% (p=0.000 n=8)
XORBytesAlignment/128Bytes4Offset-4    468.4Mi ± 0%   2040.0Mi ± 0%  +335.52% (p=0.000 n=8)
XORBytesAlignment/128Bytes5Offset-4    468.2Mi ± 0%   2040.5Mi ± 0%  +335.82% (p=0.000 n=8)
XORBytesAlignment/128Bytes6Offset-4    468.4Mi ± 0%   2038.2Mi ± 0%  +335.13% (p=0.000 n=8)
XORBytesAlignment/128Bytes7Offset-4    468.2Mi ± 0%   2039.4Mi ± 0%  +335.58% (p=0.000 n=8)
XORBytesAlignment/2048Bytes0Offset-4   3.454Gi ± 1%    7.629Gi ± 0%  +120.90% (p=0.000 n=8)
XORBytesAlignment/2048Bytes1Offset-4   542.1Mi ± 0%   3560.1Mi ± 0%  +556.68% (p=0.000 n=8)
XORBytesAlignment/2048Bytes2Offset-4   542.3Mi ± 0%   3560.1Mi ± 0%  +556.48% (p=0.000 n=8)
XORBytesAlignment/2048Bytes3Offset-4   541.9Mi ± 0%   3560.0Mi ± 0%  +556.93% (p=0.000 n=8)
XORBytesAlignment/2048Bytes4Offset-4   542.0Mi ± 0%   3558.8Mi ± 0%  +556.67% (p=0.000 n=8)
XORBytesAlignment/2048Bytes5Offset-4   542.1Mi ± 3%   3558.8Mi ± 0%  +556.53% (p=0.000 n=8)
XORBytesAlignment/2048Bytes6Offset-4   542.2Mi ± 0%   3560.2Mi ± 0%  +556.57% (p=0.000 n=8)
XORBytesAlignment/2048Bytes7Offset-4   542.3Mi ± 0%   3560.5Mi ± 0%  +556.56% (p=0.000 n=8)
geomean                                514.9Mi         1.496Gi       +197.56%

Change-Id: I649fa6bfca31296d65cccdf5fceb3dcfa0c588a1
Reviewed-on: https://go-review.googlesource.com/c/go/+/666255
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Julian Zhu 2025-04-17 16:04:08 +08:00 committed by Gopher Robot
parent 7bc714d601
commit 49d6777d87
3 changed files with 155 additions and 2 deletions

View file

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
//go:build (amd64 || arm64 || loong64 || ppc64 || ppc64le || riscv64) && !purego //go:build (amd64 || arm64 || loong64 || mips64 || mips64le || ppc64 || ppc64le || riscv64) && !purego
package subtle package subtle

View file

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
//go:build (!amd64 && !arm64 && !loong64 && !ppc64 && !ppc64le && !riscv64) || purego //go:build (!amd64 && !arm64 && !loong64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !riscv64) || purego
package subtle package subtle

View file

@ -0,0 +1,153 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (mips64 || mips64le) && !purego
#include "textflag.h"
// func xorBytes(dst, a, b *byte, n int)
TEXT ·xorBytes(SB), NOSPLIT|NOFRAME, $0
MOVV dst+0(FP), R1
MOVV a+8(FP), R2
MOVV b+16(FP), R3
MOVV n+24(FP), R4
xor_64_check:
SGTU $64, R4, R5 // R5 = 1 if (64 > R4)
BNE R5, xor_32_check
xor_64:
MOVV (R2), R6
MOVV 8(R2), R7
MOVV 16(R2), R8
MOVV 24(R2), R9
MOVV (R3), R10
MOVV 8(R3), R11
MOVV 16(R3), R12
MOVV 24(R3), R13
XOR R6, R10
XOR R7, R11
XOR R8, R12
XOR R9, R13
MOVV R10, (R1)
MOVV R11, 8(R1)
MOVV R12, 16(R1)
MOVV R13, 24(R1)
MOVV 32(R2), R6
MOVV 40(R2), R7
MOVV 48(R2), R8
MOVV 56(R2), R9
MOVV 32(R3), R10
MOVV 40(R3), R11
MOVV 48(R3), R12
MOVV 56(R3), R13
XOR R6, R10
XOR R7, R11
XOR R8, R12
XOR R9, R13
MOVV R10, 32(R1)
MOVV R11, 40(R1)
MOVV R12, 48(R1)
MOVV R13, 56(R1)
ADDV $64, R2
ADDV $64, R3
ADDV $64, R1
SUBV $64, R4
SGTU $64, R4, R5
BEQ R0, R5, xor_64
BEQ R0, R4, end
xor_32_check:
SGTU $32, R4, R5
BNE R5, xor_16_check
xor_32:
MOVV (R2), R6
MOVV 8(R2), R7
MOVV 16(R2), R8
MOVV 24(R2), R9
MOVV (R3), R10
MOVV 8(R3), R11
MOVV 16(R3), R12
MOVV 24(R3), R13
XOR R6, R10
XOR R7, R11
XOR R8, R12
XOR R9, R13
MOVV R10, (R1)
MOVV R11, 8(R1)
MOVV R12, 16(R1)
MOVV R13, 24(R1)
ADDV $32, R2
ADDV $32, R3
ADDV $32, R1
SUBV $32, R4
BEQ R0, R4, end
xor_16_check:
SGTU $16, R4, R5
BNE R5, xor_8_check
xor_16:
MOVV (R2), R6
MOVV 8(R2), R7
MOVV (R3), R8
MOVV 8(R3), R9
XOR R6, R8
XOR R7, R9
MOVV R8, (R1)
MOVV R9, 8(R1)
ADDV $16, R2
ADDV $16, R3
ADDV $16, R1
SUBV $16, R4
BEQ R0, R4, end
xor_8_check:
SGTU $8, R4, R5
BNE R5, xor_4_check
xor_8:
MOVV (R2), R6
MOVV (R3), R7
XOR R6, R7
MOVV R7, (R1)
ADDV $8, R1
ADDV $8, R2
ADDV $8, R3
SUBV $8, R4
BEQ R0, R4, end
xor_4_check:
SGTU $4, R4, R5
BNE R5, xor_2_check
xor_4:
MOVW (R2), R6
MOVW (R3), R7
XOR R6, R7
MOVW R7, (R1)
ADDV $4, R2
ADDV $4, R3
ADDV $4, R1
SUBV $4, R4
BEQ R0, R4, end
xor_2_check:
SGTU $2, R4, R5
BNE R5, xor_1
xor_2:
MOVH (R2), R6
MOVH (R3), R7
XOR R6, R7
MOVH R7, (R1)
ADDV $2, R2
ADDV $2, R3
ADDV $2, R1
SUBV $2, R4
BEQ R0, R4, end
xor_1:
MOVB (R2), R6
MOVB (R3), R7
XOR R6, R7
MOVB R7, (R1)
end:
RET