internal/bytealg: vector implementation of indexbyte for riscv64

Provide a vector implementation of indexbyte for riscv64, which is used
when compiled with the rva23u64 profile, or when the vector extension is
detected to be available at runtime. Inputs that are smaller than 24 bytes
will continue to use the non-vector path.

On a Banana Pi F3, with GORISCV64=rva23u64:

                │  indexbyte.1  │             indexbyte.2              │
                │    sec/op     │    sec/op     vs base                │
IndexByte/10-8     52.68n ±  0%   47.26n ±  0%  -10.30% (p=0.000 n=10)
IndexByte/32-8     68.62n ±  0%   47.02n ±  0%  -31.49% (p=0.000 n=10)
IndexByte/4K-8    2217.0n ±  0%   420.4n ±  0%  -81.04% (p=0.000 n=10)
IndexByte/4M-8    2624.4µ ±  0%   767.5µ ±  0%  -70.75% (p=0.000 n=10)
IndexByte/64M-8    68.08m ± 10%   47.84m ± 45%  -29.73% (p=0.004 n=10)
geomean            17.03µ         8.073µ        -52.59%

                │ indexbyte.1  │               indexbyte.2               │
                │     B/s      │      B/s        vs base                 │
IndexByte/10-8    181.0Mi ± 0%    201.8Mi ±  0%   +11.48% (p=0.000 n=10)
IndexByte/32-8    444.7Mi ± 0%    649.1Mi ±  0%   +45.97% (p=0.000 n=10)
IndexByte/4K-8    1.721Gi ± 0%    9.076Gi ±  0%  +427.51% (p=0.000 n=10)
IndexByte/4M-8    1.488Gi ± 0%    5.089Gi ±  0%  +241.93% (p=0.000 n=10)
IndexByte/64M-8   940.3Mi ± 9%   1337.8Mi ± 31%   +42.27% (p=0.004 n=10)
geomean           727.1Mi         1.498Gi        +110.94%

Change-Id: If7b0dbef38d76fa7a2021e4ecaed668a1d4b9783
Reviewed-on: https://go-review.googlesource.com/c/go/+/648856
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
Reviewed-by: Mark Freeman <markfreeman@google.com>
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
This commit is contained in:
Joel Sing 2025-02-08 01:03:23 +11:00
parent 75ea2d05c0
commit 3406a617d9

View file

@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
#include "asm_riscv64.h"
#include "go_asm.h" #include "go_asm.h"
#include "textflag.h" #include "textflag.h"
@ -11,12 +12,14 @@ TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
// X12 = b_cap (unused) // X12 = b_cap (unused)
// X13 = byte to find // X13 = byte to find
AND $0xff, X13, X12 // x12 byte to look for AND $0xff, X13, X12 // x12 byte to look for
MOV X10, X13 // store base for later
SLTI $24, X11, X14 SLTI $24, X11, X14
ADD X10, X11 // end BNEZ X14, small
BEQZ X14, bigBody JMP indexByteBig<>(SB)
small:
MOV X10, X13 // store base for later
ADD X10, X11 // end
SUB $1, X10 SUB $1, X10
loop: loop:
ADD $1, X10 ADD $1, X10
@ -31,21 +34,19 @@ notfound:
MOV $-1, X10 MOV $-1, X10
RET RET
bigBody:
JMP indexByteBig<>(SB)
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32 TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
// X10 = b_base // X10 = b_base
// X11 = b_len // X11 = b_len
// X12 = byte to find // X12 = byte to find
AND $0xff, X12 // x12 byte to look for AND $0xff, X12 // x12 byte to look for
MOV X10, X13 // store base for later
SLTI $24, X11, X14 SLTI $24, X11, X14
ADD X10, X11 // end BNEZ X14, small
BEQZ X14, bigBody JMP indexByteBig<>(SB)
small:
MOV X10, X13 // store base for later
ADD X10, X11 // end
SUB $1, X10 SUB $1, X10
loop: loop:
ADD $1, X10 ADD $1, X10
@ -60,20 +61,41 @@ notfound:
MOV $-1, X10 MOV $-1, X10
RET RET
bigBody:
JMP indexByteBig<>(SB)
TEXT indexByteBig<>(SB),NOSPLIT|NOFRAME,$0 TEXT indexByteBig<>(SB),NOSPLIT|NOFRAME,$0
// On entry // On entry:
// X10 = b_base // X10 = b_base
// X11 = end // X11 = b_len (at least 16 bytes)
// X12 = byte to find // X12 = byte to find
// X13 = b_base // On exit:
// X11 is at least 16 bytes > X10
// On exit
// X10 = index of first instance of sought byte, if found, or -1 otherwise // X10 = index of first instance of sought byte, if found, or -1 otherwise
MOV X10, X13 // store base for later
#ifndef hasV
MOVB internalcpu·RISCV64+const_offsetRISCV64HasV(SB), X5
BEQZ X5, indexbyte_scalar
#endif
PCALIGN $16
vector_loop:
VSETVLI X11, E8, M8, TA, MA, X5
VLE8V (X10), V8
VMSEQVX X12, V8, V0
VFIRSTM V0, X6
BGEZ X6, vector_found
ADD X5, X10
SUB X5, X11
BNEZ X11, vector_loop
JMP notfound
vector_found:
SUB X13, X10
ADD X6, X10
RET
indexbyte_scalar:
ADD X10, X11 // end
// Process the first few bytes until we get to an 8 byte boundary // Process the first few bytes until we get to an 8 byte boundary
// No need to check for end here as we have at least 16 bytes in // No need to check for end here as we have at least 16 bytes in
// the buffer. // the buffer.