mirror of
https://github.com/golang/go.git
synced 2025-10-19 19:13:18 +00:00
internal/bytealg: vector implementation of equal for riscv64
Provide a vector implementation of equal for riscv64, which is used when compiled with the rva23u64 profile, or when vector is detected to be available. Inputs that are 8 byte aligned will still be handled via the non-vector code if the length is less than or equal to 64 bytes. On a Banana Pi F3, with GORISCV64=rva23u64: │ equal.1 │ equal.2 │ │ sec/op │ sec/op vs base │ Equal/0-8 1.254n ± 0% 1.254n ± 0% ~ (p=1.000 n=10) Equal/same/1-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.466 n=10) Equal/same/6-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.689 n=10) Equal/same/9-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.861 n=10) Equal/same/15-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.657 n=10) Equal/same/16-8 21.32n ± 0% 21.33n ± 0% ~ (p=0.075 n=10) Equal/same/20-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.249 n=10) Equal/same/32-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.303 n=10) Equal/same/4K-8 21.32n ± 0% 21.32n ± 0% ~ (p=1.000 n=10) Equal/same/4M-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.582 n=10) Equal/same/64M-8 21.32n ± 0% 21.32n ± 0% ~ (p=0.930 n=10) Equal/1-8 39.16n ± 1% 38.71n ± 0% -1.15% (p=0.000 n=10) Equal/6-8 51.49n ± 1% 50.40n ± 1% -2.12% (p=0.000 n=10) Equal/9-8 54.46n ± 1% 53.89n ± 0% -1.04% (p=0.000 n=10) Equal/15-8 71.81n ± 1% 70.59n ± 0% -1.71% (p=0.000 n=10) Equal/16-8 69.14n ± 0% 68.21n ± 0% -1.34% (p=0.000 n=10) Equal/20-8 78.59n ± 0% 77.59n ± 0% -1.26% (p=0.000 n=10) Equal/32-8 41.55n ± 0% 41.16n ± 0% -0.96% (p=0.000 n=10) Equal/4K-8 925.5n ± 0% 561.4n ± 1% -39.34% (p=0.000 n=10) Equal/4M-8 3.110m ± 32% 2.463m ± 16% -20.80% (p=0.000 n=10) Equal/64M-8 47.34m ± 30% 39.89m ± 16% -15.75% (p=0.004 n=10) EqualBothUnaligned/64_0-8 32.17n ± 1% 32.11n ± 1% ~ (p=0.184 n=10) EqualBothUnaligned/64_1-8 79.48n ± 0% 48.24n ± 1% -39.31% (p=0.000 n=10) EqualBothUnaligned/64_4-8 72.71n ± 0% 48.37n ± 1% -33.48% (p=0.000 n=10) EqualBothUnaligned/64_7-8 77.12n ± 0% 48.16n ± 1% -37.56% (p=0.000 n=10) EqualBothUnaligned/4096_0-8 908.4n ± 0% 562.4n ± 2% -38.09% (p=0.000 n=10) EqualBothUnaligned/4096_1-8 956.6n ± 0% 571.4n ± 3% -40.26% (p=0.000 
n=10) EqualBothUnaligned/4096_4-8 949.6n ± 0% 571.6n ± 3% -39.81% (p=0.000 n=10) EqualBothUnaligned/4096_7-8 954.2n ± 0% 571.7n ± 3% -40.09% (p=0.000 n=10) EqualBothUnaligned/4194304_0-8 2.935m ± 29% 2.664m ± 19% ~ (p=0.089 n=10) EqualBothUnaligned/4194304_1-8 3.341m ± 13% 2.896m ± 34% ~ (p=0.075 n=10) EqualBothUnaligned/4194304_4-8 3.204m ± 39% 3.352m ± 33% ~ (p=0.796 n=10) EqualBothUnaligned/4194304_7-8 3.226m ± 30% 2.737m ± 34% -15.16% (p=0.043 n=10) EqualBothUnaligned/67108864_0-8 49.04m ± 17% 39.94m ± 12% -18.57% (p=0.005 n=10) EqualBothUnaligned/67108864_1-8 51.96m ± 15% 42.48m ± 15% -18.23% (p=0.015 n=10) EqualBothUnaligned/67108864_4-8 47.67m ± 17% 37.85m ± 41% -20.61% (p=0.035 n=10) EqualBothUnaligned/67108864_7-8 53.00m ± 22% 38.76m ± 21% -26.87% (p=0.000 n=10) CompareBytesEqual-8 51.71n ± 1% 52.00n ± 0% +0.57% (p=0.002 n=10) geomean 1.469µ 1.265µ -13.93% │ equal.1 │ equal.2 │ │ B/s │ B/s vs base │ Equal/same/1-8 44.73Mi ± 0% 44.72Mi ± 0% ~ (p=0.426 n=10) Equal/same/6-8 268.3Mi ± 0% 268.4Mi ± 0% ~ (p=0.753 n=10) Equal/same/9-8 402.6Mi ± 0% 402.5Mi ± 0% ~ (p=0.209 n=10) Equal/same/15-8 670.9Mi ± 0% 670.9Mi ± 0% ~ (p=0.724 n=10) Equal/same/16-8 715.6Mi ± 0% 715.4Mi ± 0% -0.04% (p=0.022 n=10) Equal/same/20-8 894.6Mi ± 0% 894.5Mi ± 0% ~ (p=0.060 n=10) Equal/same/32-8 1.398Gi ± 0% 1.398Gi ± 0% ~ (p=0.986 n=10) Equal/same/4K-8 178.9Gi ± 0% 178.9Gi ± 0% ~ (p=0.853 n=10) Equal/same/4M-8 178.9Ti ± 0% 178.9Ti ± 0% ~ (p=0.971 n=10) Equal/same/64M-8 2862.8Ti ± 0% 2862.6Ti ± 0% ~ (p=0.971 n=10) Equal/1-8 24.35Mi ± 1% 24.63Mi ± 0% +1.16% (p=0.000 n=10) Equal/6-8 111.1Mi ± 1% 113.5Mi ± 1% +2.17% (p=0.000 n=10) Equal/9-8 157.6Mi ± 1% 159.3Mi ± 0% +1.05% (p=0.000 n=10) Equal/15-8 199.2Mi ± 1% 202.7Mi ± 0% +1.74% (p=0.000 n=10) Equal/16-8 220.7Mi ± 0% 223.7Mi ± 0% +1.36% (p=0.000 n=10) Equal/20-8 242.7Mi ± 0% 245.8Mi ± 0% +1.27% (p=0.000 n=10) Equal/32-8 734.3Mi ± 0% 741.6Mi ± 0% +0.98% (p=0.000 n=10) Equal/4K-8 4.122Gi ± 0% 6.795Gi ± 1% +64.84% (p=0.000 n=10) Equal/4M-8 
1.258Gi ± 24% 1.586Gi ± 14% +26.12% (p=0.000 n=10) Equal/64M-8 1.320Gi ± 23% 1.567Gi ± 14% +18.69% (p=0.004 n=10) EqualBothUnaligned/64_0-8 1.853Gi ± 1% 1.856Gi ± 1% ~ (p=0.190 n=10) EqualBothUnaligned/64_1-8 767.9Mi ± 0% 1265.2Mi ± 1% +64.76% (p=0.000 n=10) EqualBothUnaligned/64_4-8 839.4Mi ± 0% 1261.9Mi ± 1% +50.33% (p=0.000 n=10) EqualBothUnaligned/64_7-8 791.4Mi ± 0% 1267.5Mi ± 1% +60.16% (p=0.000 n=10) EqualBothUnaligned/4096_0-8 4.199Gi ± 0% 6.784Gi ± 2% +61.54% (p=0.000 n=10) EqualBothUnaligned/4096_1-8 3.988Gi ± 0% 6.676Gi ± 3% +67.40% (p=0.000 n=10) EqualBothUnaligned/4096_4-8 4.017Gi ± 0% 6.674Gi ± 3% +66.14% (p=0.000 n=10) EqualBothUnaligned/4096_7-8 3.998Gi ± 0% 6.673Gi ± 3% +66.92% (p=0.000 n=10) EqualBothUnaligned/4194304_0-8 1.332Gi ± 22% 1.468Gi ± 16% ~ (p=0.089 n=10) EqualBothUnaligned/4194304_1-8 1.169Gi ± 12% 1.350Gi ± 25% ~ (p=0.075 n=10) EqualBothUnaligned/4194304_4-8 1.222Gi ± 28% 1.165Gi ± 48% ~ (p=0.796 n=10) EqualBothUnaligned/4194304_7-8 1.211Gi ± 23% 1.427Gi ± 26% +17.88% (p=0.043 n=10) EqualBothUnaligned/67108864_0-8 1.274Gi ± 14% 1.567Gi ± 14% +22.97% (p=0.005 n=10) EqualBothUnaligned/67108864_1-8 1.204Gi ± 14% 1.471Gi ± 13% +22.18% (p=0.015 n=10) EqualBothUnaligned/67108864_4-8 1.311Gi ± 14% 1.651Gi ± 29% +25.92% (p=0.035 n=10) EqualBothUnaligned/67108864_7-8 1.179Gi ± 18% 1.612Gi ± 17% +36.73% (p=0.000 n=10) geomean 1.870Gi 2.190Gi +17.16% Change-Id: I9c5270bcc6997d020a96d1e97c7e7cfc7ca7fd34 Reviewed-on: https://go-review.googlesource.com/c/go/+/646736 Reviewed-by: Mark Ryan <markdryan@rivosinc.com> Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Mark Freeman <markfreeman@google.com>
This commit is contained in:
parent
17a8be7117
commit
75ea2d05c0
2 changed files with 36 additions and 4 deletions
|
@ -11,16 +11,18 @@ import (
|
|||
|
||||
// Offsets into internal/cpu records for use in assembly.
|
||||
const (
|
||||
offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42)
|
||||
offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
|
||||
offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
|
||||
offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
|
||||
|
||||
offsetRISCV64HasV = unsafe.Offsetof(cpu.RISCV64.HasV)
|
||||
|
||||
offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX)
|
||||
offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX)
|
||||
|
||||
offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)
|
||||
|
||||
offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
|
||||
offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42)
|
||||
offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
|
||||
offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
|
||||
)
|
||||
|
||||
// MaxLen is the maximum length of the string to be searched for (argument b) in Index.
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "asm_riscv64.h"
|
||||
#include "go_asm.h"
|
||||
#include "textflag.h"
|
||||
|
||||
|
@ -28,6 +29,35 @@ length_check:
|
|||
MOV $32, X23
|
||||
BLT X12, X23, loop4_check
|
||||
|
||||
#ifndef hasV
|
||||
MOVB internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X5
|
||||
BEQZ X5, equal_scalar
|
||||
#endif
|
||||
|
||||
// Use vector if not 8 byte aligned.
|
||||
OR X10, X11, X5
|
||||
AND $7, X5
|
||||
BNEZ X5, vector_loop
|
||||
|
||||
// Use scalar if 8 byte aligned and <= 64 bytes.
|
||||
SUB $64, X12, X6
|
||||
BLEZ X6, loop32_check
|
||||
|
||||
PCALIGN $16
|
||||
vector_loop:
|
||||
VSETVLI X12, E8, M8, TA, MA, X5
|
||||
VLE8V (X10), V8
|
||||
VLE8V (X11), V16
|
||||
VMSNEVV V8, V16, V0
|
||||
VFIRSTM V0, X6
|
||||
BGEZ X6, done
|
||||
ADD X5, X10
|
||||
ADD X5, X11
|
||||
SUB X5, X12
|
||||
BNEZ X12, vector_loop
|
||||
JMP done
|
||||
|
||||
equal_scalar:
|
||||
// Check alignment - if alignment differs we have to do one byte at a time.
|
||||
AND $7, X10, X9
|
||||
AND $7, X11, X19
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue