internal/bytealg: vector implementation of equal for riscv64

Provide a vector implementation of equal for riscv64, which is used
when compiled with the rva23u64 profile, or when vector is detected
to be available. Inputs that are 8 byte aligned will still be handled
via a the non-vector code if the length is less than or equal to 64
bytes.

On a Banana Pi F3, with GORISCV64=rva23u64:

                                │   equal.1    │               equal.2                │
                                │    sec/op    │    sec/op     vs base                │
Equal/0-8                         1.254n ±  0%   1.254n ±  0%        ~ (p=1.000 n=10)
Equal/same/1-8                    21.32n ±  0%   21.32n ±  0%        ~ (p=0.466 n=10)
Equal/same/6-8                    21.32n ±  0%   21.32n ±  0%        ~ (p=0.689 n=10)
Equal/same/9-8                    21.32n ±  0%   21.32n ±  0%        ~ (p=0.861 n=10)
Equal/same/15-8                   21.32n ±  0%   21.32n ±  0%        ~ (p=0.657 n=10)
Equal/same/16-8                   21.32n ±  0%   21.33n ±  0%        ~ (p=0.075 n=10)
Equal/same/20-8                   21.32n ±  0%   21.32n ±  0%        ~ (p=0.249 n=10)
Equal/same/32-8                   21.32n ±  0%   21.32n ±  0%        ~ (p=0.303 n=10)
Equal/same/4K-8                   21.32n ±  0%   21.32n ±  0%        ~ (p=1.000 n=10)
Equal/same/4M-8                   21.32n ±  0%   21.32n ±  0%        ~ (p=0.582 n=10)
Equal/same/64M-8                  21.32n ±  0%   21.32n ±  0%        ~ (p=0.930 n=10)
Equal/1-8                         39.16n ±  1%   38.71n ±  0%   -1.15% (p=0.000 n=10)
Equal/6-8                         51.49n ±  1%   50.40n ±  1%   -2.12% (p=0.000 n=10)
Equal/9-8                         54.46n ±  1%   53.89n ±  0%   -1.04% (p=0.000 n=10)
Equal/15-8                        71.81n ±  1%   70.59n ±  0%   -1.71% (p=0.000 n=10)
Equal/16-8                        69.14n ±  0%   68.21n ±  0%   -1.34% (p=0.000 n=10)
Equal/20-8                        78.59n ±  0%   77.59n ±  0%   -1.26% (p=0.000 n=10)
Equal/32-8                        41.55n ±  0%   41.16n ±  0%   -0.96% (p=0.000 n=10)
Equal/4K-8                        925.5n ±  0%   561.4n ±  1%  -39.34% (p=0.000 n=10)
Equal/4M-8                        3.110m ± 32%   2.463m ± 16%  -20.80% (p=0.000 n=10)
Equal/64M-8                       47.34m ± 30%   39.89m ± 16%  -15.75% (p=0.004 n=10)
EqualBothUnaligned/64_0-8         32.17n ±  1%   32.11n ±  1%        ~ (p=0.184 n=10)
EqualBothUnaligned/64_1-8         79.48n ±  0%   48.24n ±  1%  -39.31% (p=0.000 n=10)
EqualBothUnaligned/64_4-8         72.71n ±  0%   48.37n ±  1%  -33.48% (p=0.000 n=10)
EqualBothUnaligned/64_7-8         77.12n ±  0%   48.16n ±  1%  -37.56% (p=0.000 n=10)
EqualBothUnaligned/4096_0-8       908.4n ±  0%   562.4n ±  2%  -38.09% (p=0.000 n=10)
EqualBothUnaligned/4096_1-8       956.6n ±  0%   571.4n ±  3%  -40.26% (p=0.000 n=10)
EqualBothUnaligned/4096_4-8       949.6n ±  0%   571.6n ±  3%  -39.81% (p=0.000 n=10)
EqualBothUnaligned/4096_7-8       954.2n ±  0%   571.7n ±  3%  -40.09% (p=0.000 n=10)
EqualBothUnaligned/4194304_0-8    2.935m ± 29%   2.664m ± 19%        ~ (p=0.089 n=10)
EqualBothUnaligned/4194304_1-8    3.341m ± 13%   2.896m ± 34%        ~ (p=0.075 n=10)
EqualBothUnaligned/4194304_4-8    3.204m ± 39%   3.352m ± 33%        ~ (p=0.796 n=10)
EqualBothUnaligned/4194304_7-8    3.226m ± 30%   2.737m ± 34%  -15.16% (p=0.043 n=10)
EqualBothUnaligned/67108864_0-8   49.04m ± 17%   39.94m ± 12%  -18.57% (p=0.005 n=10)
EqualBothUnaligned/67108864_1-8   51.96m ± 15%   42.48m ± 15%  -18.23% (p=0.015 n=10)
EqualBothUnaligned/67108864_4-8   47.67m ± 17%   37.85m ± 41%  -20.61% (p=0.035 n=10)
EqualBothUnaligned/67108864_7-8   53.00m ± 22%   38.76m ± 21%  -26.87% (p=0.000 n=10)
CompareBytesEqual-8               51.71n ±  1%   52.00n ±  0%   +0.57% (p=0.002 n=10)
geomean                           1.469µ         1.265µ        -13.93%

                                │    equal.1     │                equal.2                 │
                                │      B/s       │      B/s        vs base                │
Equal/same/1-8                     44.73Mi ±  0%    44.72Mi ±  0%        ~ (p=0.426 n=10)
Equal/same/6-8                     268.3Mi ±  0%    268.4Mi ±  0%        ~ (p=0.753 n=10)
Equal/same/9-8                     402.6Mi ±  0%    402.5Mi ±  0%        ~ (p=0.209 n=10)
Equal/same/15-8                    670.9Mi ±  0%    670.9Mi ±  0%        ~ (p=0.724 n=10)
Equal/same/16-8                    715.6Mi ±  0%    715.4Mi ±  0%   -0.04% (p=0.022 n=10)
Equal/same/20-8                    894.6Mi ±  0%    894.5Mi ±  0%        ~ (p=0.060 n=10)
Equal/same/32-8                    1.398Gi ±  0%    1.398Gi ±  0%        ~ (p=0.986 n=10)
Equal/same/4K-8                    178.9Gi ±  0%    178.9Gi ±  0%        ~ (p=0.853 n=10)
Equal/same/4M-8                    178.9Ti ±  0%    178.9Ti ±  0%        ~ (p=0.971 n=10)
Equal/same/64M-8                  2862.8Ti ±  0%   2862.6Ti ±  0%        ~ (p=0.971 n=10)
Equal/1-8                          24.35Mi ±  1%    24.63Mi ±  0%   +1.16% (p=0.000 n=10)
Equal/6-8                          111.1Mi ±  1%    113.5Mi ±  1%   +2.17% (p=0.000 n=10)
Equal/9-8                          157.6Mi ±  1%    159.3Mi ±  0%   +1.05% (p=0.000 n=10)
Equal/15-8                         199.2Mi ±  1%    202.7Mi ±  0%   +1.74% (p=0.000 n=10)
Equal/16-8                         220.7Mi ±  0%    223.7Mi ±  0%   +1.36% (p=0.000 n=10)
Equal/20-8                         242.7Mi ±  0%    245.8Mi ±  0%   +1.27% (p=0.000 n=10)
Equal/32-8                         734.3Mi ±  0%    741.6Mi ±  0%   +0.98% (p=0.000 n=10)
Equal/4K-8                         4.122Gi ±  0%    6.795Gi ±  1%  +64.84% (p=0.000 n=10)
Equal/4M-8                         1.258Gi ± 24%    1.586Gi ± 14%  +26.12% (p=0.000 n=10)
Equal/64M-8                        1.320Gi ± 23%    1.567Gi ± 14%  +18.69% (p=0.004 n=10)
EqualBothUnaligned/64_0-8          1.853Gi ±  1%    1.856Gi ±  1%        ~ (p=0.190 n=10)
EqualBothUnaligned/64_1-8          767.9Mi ±  0%   1265.2Mi ±  1%  +64.76% (p=0.000 n=10)
EqualBothUnaligned/64_4-8          839.4Mi ±  0%   1261.9Mi ±  1%  +50.33% (p=0.000 n=10)
EqualBothUnaligned/64_7-8          791.4Mi ±  0%   1267.5Mi ±  1%  +60.16% (p=0.000 n=10)
EqualBothUnaligned/4096_0-8        4.199Gi ±  0%    6.784Gi ±  2%  +61.54% (p=0.000 n=10)
EqualBothUnaligned/4096_1-8        3.988Gi ±  0%    6.676Gi ±  3%  +67.40% (p=0.000 n=10)
EqualBothUnaligned/4096_4-8        4.017Gi ±  0%    6.674Gi ±  3%  +66.14% (p=0.000 n=10)
EqualBothUnaligned/4096_7-8        3.998Gi ±  0%    6.673Gi ±  3%  +66.92% (p=0.000 n=10)
EqualBothUnaligned/4194304_0-8     1.332Gi ± 22%    1.468Gi ± 16%        ~ (p=0.089 n=10)
EqualBothUnaligned/4194304_1-8     1.169Gi ± 12%    1.350Gi ± 25%        ~ (p=0.075 n=10)
EqualBothUnaligned/4194304_4-8     1.222Gi ± 28%    1.165Gi ± 48%        ~ (p=0.796 n=10)
EqualBothUnaligned/4194304_7-8     1.211Gi ± 23%    1.427Gi ± 26%  +17.88% (p=0.043 n=10)
EqualBothUnaligned/67108864_0-8    1.274Gi ± 14%    1.567Gi ± 14%  +22.97% (p=0.005 n=10)
EqualBothUnaligned/67108864_1-8    1.204Gi ± 14%    1.471Gi ± 13%  +22.18% (p=0.015 n=10)
EqualBothUnaligned/67108864_4-8    1.311Gi ± 14%    1.651Gi ± 29%  +25.92% (p=0.035 n=10)
EqualBothUnaligned/67108864_7-8    1.179Gi ± 18%    1.612Gi ± 17%  +36.73% (p=0.000 n=10)
geomean                            1.870Gi          2.190Gi        +17.16%

Change-Id: I9c5270bcc6997d020a96d1e97c7e7cfc7ca7fd34
Reviewed-on: https://go-review.googlesource.com/c/go/+/646736
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Mark Freeman <markfreeman@google.com>
This commit is contained in:
Joel Sing 2025-02-12 23:41:22 +11:00
parent 17a8be7117
commit 75ea2d05c0
2 changed files with 36 additions and 4 deletions

View file

@ -11,16 +11,18 @@ import (
// Offsets into internal/cpu records for use in assembly.
const (
offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42)
offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
offsetRISCV64HasV = unsafe.Offsetof(cpu.RISCV64.HasV)
offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX)
offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX)
offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)
offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9)
offsetX86HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42)
offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
offsetX86HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
)
// MaxLen is the maximum length of the string to be searched for (argument b) in Index.

View file

@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "asm_riscv64.h"
#include "go_asm.h"
#include "textflag.h"
@ -28,6 +29,35 @@ length_check:
MOV $32, X23
BLT X12, X23, loop4_check
#ifndef hasV
MOVB internalcpu·RISCV64+const_offsetRISCV64HasV(SB), X5
BEQZ X5, equal_scalar
#endif
// Use vector if not 8 byte aligned.
OR X10, X11, X5
AND $7, X5
BNEZ X5, vector_loop
// Use scalar if 8 byte aligned and <= 64 bytes.
SUB $64, X12, X6
BLEZ X6, loop32_check
PCALIGN $16
vector_loop:
VSETVLI X12, E8, M8, TA, MA, X5
VLE8V (X10), V8
VLE8V (X11), V16
VMSNEVV V8, V16, V0
VFIRSTM V0, X6
BGEZ X6, done
ADD X5, X10
ADD X5, X11
SUB X5, X12
BNEZ X12, vector_loop
JMP done
equal_scalar:
// Check alignment - if alignment differs we have to do one byte at a time.
AND $7, X10, X9
AND $7, X11, X19