mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
strings: optimize Count for amd64
Move optimized Count implementation from bytes to runtime. Use in both bytes and strings packages. Add CountByte benchmark to strings. Strings benchmarks: name old time/op new time/op delta CountHard1-4 226µs ± 1% 226µs ± 2% ~ (p=0.247 n=10+10) CountHard2-4 316µs ± 1% 315µs ± 0% ~ (p=0.133 n=9+10) CountHard3-4 919µs ± 1% 920µs ± 1% ~ (p=0.968 n=10+9) CountTorture-4 15.4µs ± 1% 15.7µs ± 1% +2.47% (p=0.000 n=10+9) CountTortureOverlapping-4 9.60ms ± 0% 9.65ms ± 1% ~ (p=0.247 n=10+10) CountByte/10-4 26.3ns ± 1% 10.9ns ± 1% -58.71% (p=0.000 n=9+9) CountByte/32-4 42.7ns ± 0% 14.2ns ± 0% -66.64% (p=0.000 n=10+10) CountByte/4096-4 3.07µs ± 0% 0.31µs ± 2% -89.99% (p=0.000 n=9+10) CountByte/4194304-4 3.48ms ± 1% 0.34ms ± 1% -90.09% (p=0.000 n=10+9) CountByte/67108864-4 55.6ms ± 1% 7.0ms ± 0% -87.49% (p=0.000 n=9+8) name old speed new speed delta CountByte/10-4 380MB/s ± 1% 919MB/s ± 1% +142.21% (p=0.000 n=9+9) CountByte/32-4 750MB/s ± 0% 2247MB/s ± 0% +199.62% (p=0.000 n=10+10) CountByte/4096-4 1.33GB/s ± 0% 13.32GB/s ± 2% +898.13% (p=0.000 n=9+10) CountByte/4194304-4 1.21GB/s ± 1% 12.17GB/s ± 1% +908.87% (p=0.000 n=10+9) CountByte/67108864-4 1.21GB/s ± 1% 9.65GB/s ± 0% +699.29% (p=0.000 n=9+8) Fixes #19411 Change-Id: I8d2d409f0fa6df6d03b60790aa86e540b4a4e3b0 Reviewed-on: https://go-review.googlesource.com/38693 Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
03d1aa6024
commit
d206af1e6c
9 changed files with 247 additions and 194 deletions
|
|
@ -11,6 +11,7 @@ package bytes
|
|||
func indexShortStr(s, c []byte) int // ../runtime/asm_$GOARCH.s
|
||||
func supportAVX2() bool // ../runtime/asm_$GOARCH.s
|
||||
func supportPOPCNT() bool // ../runtime/asm_$GOARCH.s
|
||||
func countByte(s []byte, c byte) int // ../runtime/asm_$GOARCH.s
|
||||
|
||||
var shortStringLen int
|
||||
|
||||
|
|
@ -95,9 +96,6 @@ func Index(s, sep []byte) int {
|
|||
return -1
|
||||
}
|
||||
|
||||
// Special case for when we must count occurrences of a single byte.
|
||||
func countByte(s []byte, c byte) int
|
||||
|
||||
// Count counts the number of non-overlapping instances of sep in s.
|
||||
// If sep is an empty slice, Count returns 1 + the number of Unicode code points in s.
|
||||
func Count(s, sep []byte) int {
|
||||
|
|
|
|||
|
|
@ -1,183 +0,0 @@
|
|||
// Copyright 2017 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// We use:
|
||||
// SI: data
|
||||
// BX: data len
|
||||
// AL: byte sought
|
||||
// This requires the POPCNT instruction
|
||||
TEXT ·countByte(SB),NOSPLIT,$0-40
|
||||
MOVQ s+0(FP), SI
|
||||
MOVQ s_len+8(FP), BX
|
||||
MOVB c+24(FP), AL
|
||||
|
||||
// Shuffle X0 around so that each byte contains
|
||||
// the character we're looking for.
|
||||
MOVD AX, X0
|
||||
PUNPCKLBW X0, X0
|
||||
PUNPCKLBW X0, X0
|
||||
PSHUFL $0, X0, X0
|
||||
|
||||
CMPQ BX, $16
|
||||
JLT small
|
||||
|
||||
MOVQ $0, R12 // Accumulator
|
||||
|
||||
MOVQ SI, DI
|
||||
|
||||
CMPQ BX, $32
|
||||
JA avx2
|
||||
sse:
|
||||
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
|
||||
JMP sseloopentry
|
||||
|
||||
sseloop:
|
||||
// Move the next 16-byte chunk of the data into X1.
|
||||
MOVOU (DI), X1
|
||||
// Compare bytes in X0 to X1.
|
||||
PCMPEQB X0, X1
|
||||
// Take the top bit of each byte in X1 and put the result in DX.
|
||||
PMOVMSKB X1, DX
|
||||
// Count number of matching bytes
|
||||
POPCNTL DX, DX
|
||||
// Accumulate into R12
|
||||
ADDQ DX, R12
|
||||
// Advance to next block.
|
||||
ADDQ $16, DI
|
||||
sseloopentry:
|
||||
CMPQ DI, AX
|
||||
JBE sseloop
|
||||
|
||||
// Get the number of bytes to consider in the last 16 bytes
|
||||
ANDQ $15, BX
|
||||
JZ end
|
||||
|
||||
// Create mask to ignore overlap between previous 16 byte block
|
||||
// and the next.
|
||||
MOVQ $16,CX
|
||||
SUBQ BX, CX
|
||||
MOVQ $0xFFFF, R10
|
||||
SARQ CL, R10
|
||||
SALQ CL, R10
|
||||
|
||||
// Process the last 16-byte chunk. This chunk may overlap with the
|
||||
// chunks we've already searched so we need to mask part of it.
|
||||
MOVOU (AX), X1
|
||||
PCMPEQB X0, X1
|
||||
PMOVMSKB X1, DX
|
||||
// Apply mask
|
||||
ANDQ R10, DX
|
||||
POPCNTL DX, DX
|
||||
ADDQ DX, R12
|
||||
end:
|
||||
MOVQ R12, ret+32(FP)
|
||||
RET
|
||||
|
||||
// handle for lengths < 16
|
||||
small:
|
||||
TESTQ BX, BX
|
||||
JEQ endzero
|
||||
|
||||
// Check if we'll load across a page boundary.
|
||||
LEAQ 16(SI), AX
|
||||
TESTW $0xff0, AX
|
||||
JEQ endofpage
|
||||
|
||||
// We must ignore high bytes as they aren't part of our slice.
|
||||
// Create mask.
|
||||
MOVB BX, CX
|
||||
MOVQ $1, R10
|
||||
SALQ CL, R10
|
||||
SUBQ $1, R10
|
||||
|
||||
// Load data
|
||||
MOVOU (SI), X1
|
||||
// Compare target byte with each byte in data.
|
||||
PCMPEQB X0, X1
|
||||
// Move result bits to integer register.
|
||||
PMOVMSKB X1, DX
|
||||
// Apply mask
|
||||
ANDQ R10, DX
|
||||
POPCNTL DX, DX
|
||||
// Directly return DX, we don't need to accumulate
|
||||
// since we have <16 bytes.
|
||||
MOVQ DX, ret+32(FP)
|
||||
RET
|
||||
endzero:
|
||||
MOVQ $0, ret+32(FP)
|
||||
RET
|
||||
|
||||
endofpage:
|
||||
// We must ignore low bytes as they aren't part of our slice.
|
||||
MOVQ $16,CX
|
||||
SUBQ BX, CX
|
||||
MOVQ $0xFFFF, R10
|
||||
SARQ CL, R10
|
||||
SALQ CL, R10
|
||||
|
||||
// Load data into the high end of X1.
|
||||
MOVOU -16(SI)(BX*1), X1
|
||||
// Compare target byte with each byte in data.
|
||||
PCMPEQB X0, X1
|
||||
// Move result bits to integer register.
|
||||
PMOVMSKB X1, DX
|
||||
// Apply mask
|
||||
ANDQ R10, DX
|
||||
// Directly return DX, we don't need to accumulate
|
||||
// since we have <16 bytes.
|
||||
POPCNTL DX, DX
|
||||
MOVQ DX, ret+32(FP)
|
||||
RET
|
||||
|
||||
avx2:
|
||||
CMPB runtime·support_avx2(SB), $1
|
||||
JNE sse
|
||||
MOVD AX, X0
|
||||
LEAQ -32(SI)(BX*1), R11
|
||||
VPBROADCASTB X0, Y1
|
||||
avx2_loop:
|
||||
VMOVDQU (DI), Y2
|
||||
VPCMPEQB Y1, Y2, Y3
|
||||
VPMOVMSKB Y3, DX
|
||||
POPCNTL DX, DX
|
||||
ADDQ DX, R12
|
||||
ADDQ $32, DI
|
||||
CMPQ DI, R11
|
||||
JLE avx2_loop
|
||||
|
||||
// If last block is already processed,
|
||||
// skip to the end.
|
||||
CMPQ DI, R11
|
||||
JEQ endavx
|
||||
|
||||
// Load address of the last 32 bytes.
|
||||
// There is an overlap with the previous block.
|
||||
MOVQ R11, DI
|
||||
VMOVDQU (DI), Y2
|
||||
VPCMPEQB Y1, Y2, Y3
|
||||
VPMOVMSKB Y3, DX
|
||||
// Exit AVX mode.
|
||||
VZEROUPPER
|
||||
|
||||
// Create mask to ignore overlap between previous 32 byte block
|
||||
// and the next.
|
||||
ANDQ $31, BX
|
||||
MOVQ $32,CX
|
||||
SUBQ BX, CX
|
||||
MOVQ $0xFFFFFFFF, R10
|
||||
SARQ CL, R10
|
||||
SALQ CL, R10
|
||||
// Apply mask
|
||||
ANDQ R10, DX
|
||||
POPCNTL DX, DX
|
||||
ADDQ DX, R12
|
||||
MOVQ R12, ret+32(FP)
|
||||
RET
|
||||
endavx:
|
||||
// Exit AVX mode.
|
||||
VZEROUPPER
|
||||
MOVQ R12, ret+32(FP)
|
||||
RET
|
||||
|
|
@ -17,7 +17,10 @@ runtime/asm_amd64.s: [GOARCH] cannot check cross-package assembly function: Comp
|
|||
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package bytes
|
||||
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportAVX2 is in package strings
|
||||
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportAVX2 is in package bytes
|
||||
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportPOPCNT is in package strings
|
||||
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportPOPCNT is in package bytes
|
||||
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: countByte is in package strings
|
||||
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: countByte is in package bytes
|
||||
|
||||
// Intentionally missing declarations. These are special assembly routines.
|
||||
// Some are jumped into from other routines, with values in specific registers.
|
||||
|
|
|
|||
|
|
@ -1702,6 +1702,11 @@ TEXT bytes·supportAVX2(SB),NOSPLIT,$0-1
|
|||
MOVB AX, ret+0(FP)
|
||||
RET
|
||||
|
||||
TEXT strings·supportPOPCNT(SB),NOSPLIT,$0-1
|
||||
MOVBLZX runtime·support_popcnt(SB), AX
|
||||
MOVB AX, ret+0(FP)
|
||||
RET
|
||||
|
||||
TEXT bytes·supportPOPCNT(SB),NOSPLIT,$0-1
|
||||
MOVBLZX runtime·support_popcnt(SB), AX
|
||||
MOVB AX, ret+0(FP)
|
||||
|
|
@ -2131,6 +2136,196 @@ eqret:
|
|||
MOVB $0, ret+48(FP)
|
||||
RET
|
||||
|
||||
|
||||
TEXT bytes·countByte(SB),NOSPLIT,$0-40
|
||||
MOVQ s+0(FP), SI
|
||||
MOVQ s_len+8(FP), BX
|
||||
MOVB c+24(FP), AL
|
||||
LEAQ ret+32(FP), R8
|
||||
JMP runtime·countByte(SB)
|
||||
|
||||
TEXT strings·countByte(SB),NOSPLIT,$0-32
|
||||
MOVQ s+0(FP), SI
|
||||
MOVQ s_len+8(FP), BX
|
||||
MOVB c+16(FP), AL
|
||||
LEAQ ret+24(FP), R8
|
||||
JMP runtime·countByte(SB)
|
||||
|
||||
// input:
|
||||
// SI: data
|
||||
// BX: data len
|
||||
// AL: byte sought
|
||||
// R8: address to put result
|
||||
// This requires the POPCNT instruction
|
||||
TEXT runtime·countByte(SB),NOSPLIT,$0
|
||||
// Shuffle X0 around so that each byte contains
|
||||
// the character we're looking for.
|
||||
MOVD AX, X0
|
||||
PUNPCKLBW X0, X0
|
||||
PUNPCKLBW X0, X0
|
||||
PSHUFL $0, X0, X0
|
||||
|
||||
CMPQ BX, $16
|
||||
JLT small
|
||||
|
||||
MOVQ $0, R12 // Accumulator
|
||||
|
||||
MOVQ SI, DI
|
||||
|
||||
CMPQ BX, $32
|
||||
JA avx2
|
||||
sse:
|
||||
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
|
||||
JMP sseloopentry
|
||||
|
||||
sseloop:
|
||||
// Move the next 16-byte chunk of the data into X1.
|
||||
MOVOU (DI), X1
|
||||
// Compare bytes in X0 to X1.
|
||||
PCMPEQB X0, X1
|
||||
// Take the top bit of each byte in X1 and put the result in DX.
|
||||
PMOVMSKB X1, DX
|
||||
// Count number of matching bytes
|
||||
POPCNTL DX, DX
|
||||
// Accumulate into R12
|
||||
ADDQ DX, R12
|
||||
// Advance to next block.
|
||||
ADDQ $16, DI
|
||||
sseloopentry:
|
||||
CMPQ DI, AX
|
||||
JBE sseloop
|
||||
|
||||
// Get the number of bytes to consider in the last 16 bytes
|
||||
ANDQ $15, BX
|
||||
JZ end
|
||||
|
||||
// Create mask to ignore overlap between previous 16 byte block
|
||||
// and the next.
|
||||
MOVQ $16,CX
|
||||
SUBQ BX, CX
|
||||
MOVQ $0xFFFF, R10
|
||||
SARQ CL, R10
|
||||
SALQ CL, R10
|
||||
|
||||
// Process the last 16-byte chunk. This chunk may overlap with the
|
||||
// chunks we've already searched so we need to mask part of it.
|
||||
MOVOU (AX), X1
|
||||
PCMPEQB X0, X1
|
||||
PMOVMSKB X1, DX
|
||||
// Apply mask
|
||||
ANDQ R10, DX
|
||||
POPCNTL DX, DX
|
||||
ADDQ DX, R12
|
||||
end:
|
||||
MOVQ R12, (R8)
|
||||
RET
|
||||
|
||||
// handle for lengths < 16
|
||||
small:
|
||||
TESTQ BX, BX
|
||||
JEQ endzero
|
||||
|
||||
// Check if we'll load across a page boundary.
|
||||
LEAQ 16(SI), AX
|
||||
TESTW $0xff0, AX
|
||||
JEQ endofpage
|
||||
|
||||
// We must ignore high bytes as they aren't part of our slice.
|
||||
// Create mask.
|
||||
MOVB BX, CX
|
||||
MOVQ $1, R10
|
||||
SALQ CL, R10
|
||||
SUBQ $1, R10
|
||||
|
||||
// Load data
|
||||
MOVOU (SI), X1
|
||||
// Compare target byte with each byte in data.
|
||||
PCMPEQB X0, X1
|
||||
// Move result bits to integer register.
|
||||
PMOVMSKB X1, DX
|
||||
// Apply mask
|
||||
ANDQ R10, DX
|
||||
POPCNTL DX, DX
|
||||
// Directly return DX, we don't need to accumulate
|
||||
// since we have <16 bytes.
|
||||
MOVQ DX, (R8)
|
||||
RET
|
||||
endzero:
|
||||
MOVQ $0, (R8)
|
||||
RET
|
||||
|
||||
endofpage:
|
||||
// We must ignore low bytes as they aren't part of our slice.
|
||||
MOVQ $16,CX
|
||||
SUBQ BX, CX
|
||||
MOVQ $0xFFFF, R10
|
||||
SARQ CL, R10
|
||||
SALQ CL, R10
|
||||
|
||||
// Load data into the high end of X1.
|
||||
MOVOU -16(SI)(BX*1), X1
|
||||
// Compare target byte with each byte in data.
|
||||
PCMPEQB X0, X1
|
||||
// Move result bits to integer register.
|
||||
PMOVMSKB X1, DX
|
||||
// Apply mask
|
||||
ANDQ R10, DX
|
||||
// Directly return DX, we don't need to accumulate
|
||||
// since we have <16 bytes.
|
||||
POPCNTL DX, DX
|
||||
MOVQ DX, (R8)
|
||||
RET
|
||||
|
||||
avx2:
|
||||
CMPB runtime·support_avx2(SB), $1
|
||||
JNE sse
|
||||
MOVD AX, X0
|
||||
LEAQ -32(SI)(BX*1), R11
|
||||
VPBROADCASTB X0, Y1
|
||||
avx2_loop:
|
||||
VMOVDQU (DI), Y2
|
||||
VPCMPEQB Y1, Y2, Y3
|
||||
VPMOVMSKB Y3, DX
|
||||
POPCNTL DX, DX
|
||||
ADDQ DX, R12
|
||||
ADDQ $32, DI
|
||||
CMPQ DI, R11
|
||||
JLE avx2_loop
|
||||
|
||||
// If last block is already processed,
|
||||
// skip to the end.
|
||||
CMPQ DI, R11
|
||||
JEQ endavx
|
||||
|
||||
// Load address of the last 32 bytes.
|
||||
// There is an overlap with the previous block.
|
||||
MOVQ R11, DI
|
||||
VMOVDQU (DI), Y2
|
||||
VPCMPEQB Y1, Y2, Y3
|
||||
VPMOVMSKB Y3, DX
|
||||
// Exit AVX mode.
|
||||
VZEROUPPER
|
||||
|
||||
// Create mask to ignore overlap between previous 32 byte block
|
||||
// and the next.
|
||||
ANDQ $31, BX
|
||||
MOVQ $32,CX
|
||||
SUBQ BX, CX
|
||||
MOVQ $0xFFFFFFFF, R10
|
||||
SARQ CL, R10
|
||||
SALQ CL, R10
|
||||
// Apply mask
|
||||
ANDQ R10, DX
|
||||
POPCNTL DX, DX
|
||||
ADDQ DX, R12
|
||||
MOVQ R12, (R8)
|
||||
RET
|
||||
endavx:
|
||||
// Exit AVX mode.
|
||||
VZEROUPPER
|
||||
MOVQ R12, (R8)
|
||||
RET
|
||||
|
||||
TEXT runtime·return0(SB), NOSPLIT, $0
|
||||
MOVL $0, AX
|
||||
RET
|
||||
|
|
|
|||
|
|
@ -72,9 +72,8 @@ func hashStrRev(sep string) (uint32, uint32) {
|
|||
return hash, pow
|
||||
}
|
||||
|
||||
// Count counts the number of non-overlapping instances of substr in s.
|
||||
// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
|
||||
func Count(s, substr string) int {
|
||||
// countGeneric implements Count.
|
||||
func countGeneric(s, substr string) int {
|
||||
// special case
|
||||
if len(substr) == 0 {
|
||||
return utf8.RuneCountInString(s) + 1
|
||||
|
|
|
|||
|
|
@ -10,6 +10,8 @@ package strings
|
|||
// indexShortStr requires 2 <= len(c) <= shortStringLen
|
||||
func indexShortStr(s, c string) int // ../runtime/asm_$GOARCH.s
|
||||
func supportAVX2() bool // ../runtime/asm_$GOARCH.s
|
||||
func supportPOPCNT() bool // ../runtime/asm_$GOARCH.s
|
||||
func countByte(s string, c byte) int // ../runtime/asm_$GOARCH.s
|
||||
|
||||
var shortStringLen int
|
||||
|
||||
|
|
@ -93,3 +95,12 @@ func Index(s, substr string) int {
|
|||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// Count counts the number of non-overlapping instances of substr in s.
|
||||
// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
|
||||
func Count(s, substr string) int {
|
||||
if len(substr) == 1 && supportPOPCNT() {
|
||||
return countByte(s, byte(substr[0]))
|
||||
}
|
||||
return countGeneric(s, substr)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,3 +45,9 @@ func Index(s, substr string) int {
|
|||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// Count counts the number of non-overlapping instances of substr in s.
|
||||
// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
|
||||
func Count(s, substr string) int {
|
||||
return countGeneric(s, substr)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -96,3 +96,9 @@ func Index(s, substr string) int {
|
|||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// Count counts the number of non-overlapping instances of substr in s.
|
||||
// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
|
||||
func Count(s, substr string) int {
|
||||
return countGeneric(s, substr)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1457,6 +1457,24 @@ func BenchmarkCountTortureOverlapping(b *testing.B) {
|
|||
}
|
||||
}
|
||||
|
||||
func BenchmarkCountByte(b *testing.B) {
|
||||
indexSizes := []int{10, 32, 4 << 10, 4 << 20, 64 << 20}
|
||||
benchStr := Repeat(benchmarkString,
|
||||
(indexSizes[len(indexSizes)-1]+len(benchmarkString)-1)/len(benchmarkString))
|
||||
benchFunc := func(b *testing.B, benchStr string) {
|
||||
b.SetBytes(int64(len(benchStr)))
|
||||
for i := 0; i < b.N; i++ {
|
||||
Count(benchStr, "=")
|
||||
}
|
||||
}
|
||||
for _, size := range indexSizes {
|
||||
b.Run(fmt.Sprintf("%d", size), func(b *testing.B) {
|
||||
benchFunc(b, benchStr[:size])
|
||||
})
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
var makeFieldsInput = func() string {
|
||||
x := make([]byte, 1<<20)
|
||||
// Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue