mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
internal/bytealg: optimize Index/IndexString on loong64
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000 @ 2500.00MHz
| 3a6000.old.txt | 3a6000.new.txt |
| sec/op | sec/op vs base |
IndexRune/10 23.56n ± 1% 20.42n ± 0% -13.33% (p=0.000 n=10)
IndexRune/32 29.91n ± 1% 22.46n ± 0% -24.90% (p=0.000 n=10)
IndexRune/4K 102.45n ± 2% 72.66n ± 0% -29.08% (p=0.000 n=10)
IndexRune/4M 111.96µ ± 1% 52.50µ ± 1% -53.11% (p=0.000 n=10)
IndexRune/64M 3.653m ± 30% 3.633m ± 0% ~ (p=0.143 n=10)
IndexRuneASCII/10 8.736n ± 2% 7.206n ± 0% -17.51% (p=0.000 n=10)
IndexRuneASCII/32 10.195n ± 2% 8.008n ± 0% -21.45% (p=0.000 n=10)
IndexRuneASCII/4K 70.27n ± 2% 52.84n ± 0% -24.80% (p=0.000 n=10)
IndexRuneASCII/4M 98.15µ ± 1% 87.87µ ± 1% -10.47% (p=0.000 n=10)
IndexRuneASCII/64M 2.028m ± 0% 1.918m ± 2% -5.41% (p=0.000 n=10)
IndexRuneUnicode/Latin/10 18.80n ± 1% 13.61n ± 0% -27.59% (p=0.000 n=10)
IndexRuneUnicode/Latin/32 28.09n ± 2% 20.82n ± 0% -25.88% (p=0.000 n=10)
IndexRuneUnicode/Latin/4K 373.8n ± 1% 357.1n ± 0% -4.47% (p=0.000 n=10)
IndexRuneUnicode/Latin/4M 395.8µ ± 0% 381.0µ ± 0% -3.74% (p=0.000 n=10)
IndexRuneUnicode/Latin/64M 8.056m ± 0% 7.614m ± 0% -5.49% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/10 23.72n ± 1% 20.42n ± 0% -13.91% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/32 30.20n ± 1% 22.42n ± 0% -25.77% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/4K 1.134µ ± 1% 1.122µ ± 0% -1.06% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/4M 1.160m ± 1% 1.152m ± 0% -0.72% (p=0.005 n=10)
IndexRuneUnicode/Cyrillic/64M 20.26m ± 1% 19.61m ± 0% -3.24% (p=0.000 n=10)
IndexRuneUnicode/Han/10 30.11n ± 2% 24.82n ± 0% -17.57% (p=0.000 n=10)
IndexRuneUnicode/Han/32 36.16n ± 2% 27.20n ± 0% -24.78% (p=0.000 n=10)
IndexRuneUnicode/Han/4K 548.1n ± 0% 524.8n ± 0% -4.25% (p=0.000 n=10)
IndexRuneUnicode/Han/4M 706.7µ ± 1% 624.0µ ± 0% -11.70% (p=0.000 n=10)
IndexRuneUnicode/Han/64M 12.50m ± 1% 10.84m ± 1% -13.24% (p=0.000 n=10)
Index/10 42.03n ± 2% 10.01n ± 0% -76.18% (p=0.000 n=10)
Index/32 133.15n ± 1% 40.03n ± 0% -69.94% (p=0.000 n=10)
Index/4K 11.647µ ± 1% 2.493µ ± 0% -78.60% (p=0.000 n=10)
Index/4M 11.536m ± 0% 2.519m ± 0% -78.16% (p=0.000 n=10)
Index/64M 184.60m ± 1% 40.42m ± 0% -78.10% (p=0.000 n=10)
IndexEasy/10 17.290n ± 2% 9.608n ± 0% -44.43% (p=0.000 n=10)
IndexEasy/32 23.71n ± 2% 16.61n ± 0% -29.95% (p=0.000 n=10)
IndexEasy/4K 95.64n ± 2% 68.25n ± 0% -28.64% (p=0.000 n=10)
IndexEasy/4M 105.04µ ± 1% 91.94µ ± 0% -12.47% (p=0.000 n=10)
IndexEasy/64M 4.280m ± 0% 4.264m ± 0% -0.38% (p=0.002 n=10)
Count/10 53.09n ± 1% 16.81n ± 0% -68.33% (p=0.000 n=10)
Count/32 142.20n ± 2% 46.44n ± 0% -67.34% (p=0.000 n=10)
Count/4K 11.428µ ± 1% 2.500µ ± 1% -78.12% (p=0.000 n=10)
Count/4M 11.536m ± 1% 2.520m ± 0% -78.16% (p=0.000 n=10)
Count/64M 183.80m ± 1% 40.42m ± 0% -78.01% (p=0.000 n=10)
IndexHard1 2906.4µ ± 1% 420.4µ ± 0% -85.54% (p=0.000 n=10)
IndexHard2 2918.0µ ± 1% 421.1µ ± 1% -85.57% (p=0.000 n=10)
IndexHard3 2912.8µ ± 1% 440.2µ ± 0% -84.89% (p=0.000 n=10)
IndexHard4 2909.6µ ± 1% 840.4µ ± 0% -71.12% (p=0.000 n=10)
LastIndexHard1 2.939m ± 1% 2.621m ± 0% -10.83% (p=0.000 n=10)
LastIndexHard2 2.924m ± 1% 2.624m ± 0% -10.26% (p=0.000 n=10)
LastIndexHard3 2.936m ± 1% 2.580m ± 1% -12.12% (p=0.000 n=10)
CountHard1 2900.4µ ± 1% 420.0µ ± 0% -85.52% (p=0.000 n=10)
CountHard2 2915.6µ ± 1% 420.0µ ± 0% -85.59% (p=0.000 n=10)
CountHard3 2905.0µ ± 0% 440.0µ ± 0% -84.85% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic2 181.95µ ± 1% 26.28µ ± 0% -85.56% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic4 182.59µ ± 1% 26.29µ ± 0% -85.60% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic8 183.9µ ± 1% 108.2µ ± 0% -41.14% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic16 58.24µ ± 0% 56.58µ ± 0% -2.86% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic32 30.82µ ± 0% 29.62µ ± 0% -3.92% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic64 16.59µ ± 0% 15.00µ ± 0% -9.62% (p=0.000 n=10)
geomean 22.69µ 11.59µ -48.92%
Change-Id: Iacc9e686027f99bb0413b566cfc8ee6cd873d2d9
Reviewed-on: https://go-review.googlesource.com/c/go/+/693878
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Mark Freeman <markfreeman@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
parent
73ff6d1480
commit
dd3abf6bc5
4 changed files with 335 additions and 2 deletions
|
|
@ -2,7 +2,7 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64
|
||||
//go:build !amd64 && !arm64 && !loong64 && !s390x && !ppc64le && !ppc64
|
||||
|
||||
package bytealg
|
||||
|
||||
|
|
|
|||
30
src/internal/bytealg/index_loong64.go
Normal file
30
src/internal/bytealg/index_loong64.go
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package bytealg
|
||||
|
||||
import "internal/cpu"
|
||||
|
||||
// Empirical data shows that using Index can get better
|
||||
// performance when len(s) <= 16.
|
||||
const MaxBruteForce = 16
|
||||
|
||||
func init() {
|
||||
// If SIMD is supported, optimize the cases where the substring length is less than 64 bytes,
|
||||
// otherwise, the cases where the length is less than 32 bytes are optimized.
|
||||
if cpu.Loong64.HasLASX || cpu.Loong64.HasLSX {
|
||||
MaxLen = 64
|
||||
} else {
|
||||
MaxLen = 32
|
||||
}
|
||||
}
|
||||
|
||||
// Cutover reports the number of failures of IndexByte we should tolerate
|
||||
// before switching over to Index.
|
||||
// n is the number of bytes processed so far.
|
||||
// See the bytes.Index implementation for details.
|
||||
func Cutover(n int) int {
|
||||
// 1 error per 8 characters, plus a little slop to start.
|
||||
return (n + 16) / 8
|
||||
}
|
||||
303
src/internal/bytealg/index_loong64.s
Normal file
303
src/internal/bytealg/index_loong64.s
Normal file
|
|
@ -0,0 +1,303 @@
|
|||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "go_asm.h"
|
||||
#include "textflag.h"
|
||||
|
||||
TEXT ·Index<ABIInternal>(SB),NOSPLIT,$0-56
|
||||
MOVV R7, R6 // R6 = separator pointer
|
||||
MOVV R8, R7 // R7 = separator length
|
||||
JMP indexbody<>(SB)
|
||||
|
||||
TEXT ·IndexString<ABIInternal>(SB),NOSPLIT,$0-40
|
||||
JMP indexbody<>(SB)
|
||||
|
||||
// input:
|
||||
// R4 = string
|
||||
// R5 = length
|
||||
// R6 = separator pointer
|
||||
// R7 = separator length (2 <= len <= 64)
|
||||
TEXT indexbody<>(SB),NOSPLIT,$0
|
||||
// main idea is to load 'sep' into separate register(s)
|
||||
// to avoid re-loading it again and again
|
||||
// for subsequent substring comparisons
|
||||
SUBV R7, R5, R8
|
||||
ADDV R4, R8 // R8 contains the start of last substring for comparison
|
||||
ADDV $1, R4, R9 // store base for later
|
||||
|
||||
MOVV $8, R5
|
||||
BGE R7, R5, len_gt_or_eq_8
|
||||
len_2_7:
|
||||
AND $0x4, R7, R5
|
||||
BNE R5, len_4_7
|
||||
|
||||
len_2_3:
|
||||
AND $0x1, R7, R5
|
||||
BNE R5, len_3
|
||||
|
||||
len_2:
|
||||
MOVHU (R6), R10
|
||||
loop_2:
|
||||
BLT R8, R4, not_found
|
||||
MOVHU (R4), R11
|
||||
ADDV $1, R4
|
||||
BNE R10, R11, loop_2
|
||||
JMP found
|
||||
|
||||
len_3:
|
||||
MOVHU (R6), R10
|
||||
MOVBU 2(R6), R11
|
||||
loop_3:
|
||||
BLT R8, R4, not_found
|
||||
MOVHU (R4), R12
|
||||
ADDV $1, R4
|
||||
BNE R10, R12, loop_3
|
||||
MOVBU 1(R4), R13
|
||||
BNE R11, R13, loop_3
|
||||
JMP found
|
||||
|
||||
len_4_7:
|
||||
AND $0x2, R7, R5
|
||||
BNE R5, len_6_7
|
||||
AND $0x1, R7, R5
|
||||
BNE R5, len_5
|
||||
len_4:
|
||||
MOVWU (R6), R10
|
||||
loop_4:
|
||||
BLT R8, R4, not_found
|
||||
MOVWU (R4), R11
|
||||
ADDV $1, R4
|
||||
BNE R10, R11, loop_4
|
||||
JMP found
|
||||
|
||||
len_5:
|
||||
MOVWU (R6), R10
|
||||
MOVBU 4(R6), R11
|
||||
loop_5:
|
||||
BLT R8, R4, not_found
|
||||
MOVWU (R4), R12
|
||||
ADDV $1, R4
|
||||
BNE R10, R12, loop_5
|
||||
MOVBU 3(R4), R13
|
||||
BNE R11, R13, loop_5
|
||||
JMP found
|
||||
|
||||
len_6_7:
|
||||
AND $0x1, R7, R5
|
||||
BNE R5, len_7
|
||||
len_6:
|
||||
MOVWU (R6), R10
|
||||
MOVHU 4(R6), R11
|
||||
loop_6:
|
||||
BLT R8, R4, not_found
|
||||
MOVWU (R4), R12
|
||||
ADDV $1, R4
|
||||
BNE R10, R12, loop_6
|
||||
MOVHU 3(R4), R13
|
||||
BNE R11, R13, loop_6
|
||||
JMP found
|
||||
|
||||
len_7:
|
||||
MOVWU (R6), R10
|
||||
MOVWU 3(R6), R11
|
||||
loop_7:
|
||||
BLT R8, R4, not_found
|
||||
MOVWU (R4), R12
|
||||
ADDV $1, R4
|
||||
BNE R10, R12, loop_7
|
||||
MOVWU 2(R4), R13
|
||||
BNE R11, R13, loop_7
|
||||
JMP found
|
||||
|
||||
len_gt_or_eq_8:
|
||||
BEQ R5, R7, len_8
|
||||
MOVV $17, R5
|
||||
BGE R7, R5, len_gt_or_eq_17
|
||||
JMP len_9_16
|
||||
len_8:
|
||||
MOVV (R6), R10
|
||||
loop_8:
|
||||
BLT R8, R4, not_found
|
||||
MOVV (R4), R11
|
||||
ADDV $1, R4
|
||||
BNE R10, R11, loop_8
|
||||
JMP found
|
||||
|
||||
len_9_16:
|
||||
MOVV (R6), R10
|
||||
SUBV $8, R7
|
||||
MOVV (R6)(R7), R11
|
||||
SUBV $1, R7
|
||||
loop_9_16:
|
||||
BLT R8, R4, not_found
|
||||
MOVV (R4), R12
|
||||
ADDV $1, R4
|
||||
BNE R10, R12, loop_9_16
|
||||
MOVV (R4)(R7), R13
|
||||
BNE R11, R13, loop_9_16
|
||||
JMP found
|
||||
|
||||
len_gt_or_eq_17:
|
||||
MOVV $25, R5
|
||||
BGE R7, R5, len_gt_or_eq_25
|
||||
len_17_24:
|
||||
MOVV 0(R6), R10
|
||||
MOVV 8(R6), R11
|
||||
SUBV $8, R7
|
||||
MOVV (R6)(R7), R12
|
||||
SUBV $1, R7
|
||||
loop_17_24:
|
||||
BLT R8, R4, not_found
|
||||
MOVV (R4), R13
|
||||
ADDV $1, R4
|
||||
BNE R10, R13, loop_17_24
|
||||
MOVV 7(R4), R14
|
||||
BNE R11, R14, loop_17_24
|
||||
MOVV (R4)(R7), R15
|
||||
BNE R12, R15, loop_17_24
|
||||
JMP found
|
||||
|
||||
len_gt_or_eq_25:
|
||||
MOVV $33, R5
|
||||
BGE R7, R5, len_gt_or_eq_33
|
||||
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R10
|
||||
BNE R10, lsx_len_25_32
|
||||
len_25_32:
|
||||
MOVV 0(R6), R10
|
||||
MOVV 8(R6), R11
|
||||
MOVV 16(R6), R12
|
||||
SUBV $8, R7
|
||||
MOVV (R6)(R7), R13
|
||||
SUBV $1, R7
|
||||
loop_25_32:
|
||||
BLT R8, R4, not_found
|
||||
MOVV (R4), R14
|
||||
ADDV $1, R4
|
||||
BNE R10, R14, loop_25_32
|
||||
MOVV 7(R4), R15
|
||||
BNE R11, R15, loop_25_32
|
||||
MOVV 15(R4), R16
|
||||
BNE R12, R16, loop_25_32
|
||||
MOVV (R4)(R7), R17
|
||||
BNE R13, R17, loop_25_32
|
||||
JMP found
|
||||
|
||||
// On loong64, LSX is included if LASX is supported.
|
||||
lasx_len_25_32:
|
||||
lsx_len_25_32:
|
||||
VMOVQ 0(R6), V0
|
||||
SUBV $16, R7
|
||||
VMOVQ (R6)(R7), V1
|
||||
SUBV $1, R7
|
||||
lsx_loop_25_32:
|
||||
BLT R8, R4, not_found
|
||||
VMOVQ (R4), V2
|
||||
ADDV $1, R4
|
||||
VSEQV V0, V2, V2
|
||||
VSETANYEQV V2, FCC0
|
||||
BFPT FCC0, lsx_loop_25_32
|
||||
|
||||
VMOVQ (R4)(R7), V3
|
||||
VSEQV V1, V3, V3
|
||||
VSETANYEQV V3, FCC1
|
||||
BFPT FCC1, lsx_loop_25_32
|
||||
JMP found
|
||||
|
||||
len_gt_or_eq_33:
|
||||
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R10
|
||||
MOVV $49, R5
|
||||
BGE R7, R5, len_gt_or_eq_49
|
||||
len_33_48:
|
||||
BNE R10, lasx_len_33_48
|
||||
JMP lsx_len_33_48
|
||||
|
||||
len_gt_or_eq_49:
|
||||
len_49_64:
|
||||
BNE R10, lasx_len_49_64
|
||||
JMP lsx_len_49_64
|
||||
|
||||
lsx_len_33_48:
|
||||
VMOVQ 0(R6), V0
|
||||
VMOVQ 16(R6), V1
|
||||
SUBV $16, R7
|
||||
VMOVQ (R6)(R7), V2
|
||||
SUBV $1, R7
|
||||
lsx_loop_33_48:
|
||||
BLT R8, R4, not_found
|
||||
VMOVQ 0(R4), V3
|
||||
ADDV $1, R4
|
||||
VSEQV V0, V3, V3
|
||||
VSETANYEQV V3, FCC0
|
||||
BFPT FCC0, lsx_loop_33_48
|
||||
|
||||
VMOVQ 15(R4), V4
|
||||
VSEQV V1, V4, V4
|
||||
VSETANYEQV V4, FCC1
|
||||
BFPT FCC1, lsx_loop_33_48
|
||||
|
||||
VMOVQ (R4)(R7), V5
|
||||
VSEQV V2, V5, V5
|
||||
VSETANYEQV V5, FCC2
|
||||
BFPT FCC2, lsx_loop_33_48
|
||||
JMP found
|
||||
|
||||
lsx_len_49_64:
|
||||
VMOVQ 0(R6), V0
|
||||
VMOVQ 16(R6), V1
|
||||
VMOVQ 32(R6), V2
|
||||
SUBV $16, R7
|
||||
VMOVQ (R6)(R7), V3
|
||||
SUBV $1, R7
|
||||
lsx_loop_49_64:
|
||||
BLT R8, R4, not_found
|
||||
VMOVQ 0(R4), V4
|
||||
ADDV $1, R4
|
||||
VSEQV V0, V4, V4
|
||||
VSETANYEQV V4, FCC0
|
||||
BFPT FCC0, lsx_loop_49_64
|
||||
|
||||
VMOVQ 15(R4), V5
|
||||
VSEQV V1, V5, V5
|
||||
VSETANYEQV V5, FCC1
|
||||
BFPT FCC1, lsx_loop_49_64
|
||||
|
||||
VMOVQ 31(R4), V6
|
||||
VSEQV V2, V6, V6
|
||||
VSETANYEQV V6, FCC2
|
||||
BFPT FCC2, lsx_loop_49_64
|
||||
|
||||
VMOVQ (R4)(R7), V7
|
||||
VSEQV V3, V7, V7
|
||||
VSETANYEQV V7, FCC3
|
||||
BFPT FCC3, lsx_loop_49_64
|
||||
JMP found
|
||||
|
||||
lasx_len_33_48:
|
||||
lasx_len_49_64:
|
||||
lasx_len_33_64:
|
||||
XVMOVQ (R6), X0
|
||||
SUBV $32, R7
|
||||
XVMOVQ (R6)(R7), X1
|
||||
SUBV $1, R7
|
||||
lasx_loop_33_64:
|
||||
BLT R8, R4, not_found
|
||||
XVMOVQ (R4), X2
|
||||
ADDV $1, R4
|
||||
XVSEQV X0, X2, X3
|
||||
XVSETANYEQV X3, FCC0
|
||||
BFPT FCC0, lasx_loop_33_64
|
||||
|
||||
XVMOVQ (R4)(R7), X4
|
||||
XVSEQV X1, X4, X5
|
||||
XVSETANYEQV X5, FCC1
|
||||
BFPT FCC1, lasx_loop_33_64
|
||||
JMP found
|
||||
|
||||
found:
|
||||
SUBV R9, R4
|
||||
RET
|
||||
|
||||
not_found:
|
||||
MOVV $-1, R4
|
||||
RET
|
||||
|
|
@ -2,7 +2,7 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build amd64 || arm64 || s390x || ppc64le || ppc64
|
||||
//go:build amd64 || arm64 || loong64 || s390x || ppc64le || ppc64
|
||||
|
||||
package bytealg
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue