mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
runtime: avoid using REP prefix for IndexByte
REP-prefixed instructions have a large startup cost.
Avoid them like the plague.

benchmark                  old ns/op     new ns/op     delta
BenchmarkIndexByte10-8     22.4          5.34          -76.16%

Fixes #13983

Change-Id: I857e956e240fc9681d053f2584ccf24c1b272bb3
Reviewed-on: https://go-review.googlesource.com/18703
Reviewed-by: Minux Ma <minux@golang.org>
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
This commit is contained in:
parent
a337e30620
commit
687abca1ea
2 changed files with 110 additions and 71 deletions
|
|
@ -335,6 +335,41 @@ func TestIndexByteBig(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// test a small index across all page offsets
|
||||||
|
func TestIndexByteSmall(t *testing.T) {
|
||||||
|
b := make([]byte, 5015) // bigger than a page
|
||||||
|
// Make sure we find the correct byte even when straddling a page.
|
||||||
|
for i := 0; i <= len(b)-15; i++ {
|
||||||
|
for j := 0; j < 15; j++ {
|
||||||
|
b[i+j] = byte(100 + j)
|
||||||
|
}
|
||||||
|
for j := 0; j < 15; j++ {
|
||||||
|
p := IndexByte(b[i:i+15], byte(100+j))
|
||||||
|
if p != j {
|
||||||
|
t.Errorf("IndexByte(%q, %d) = %d", b[i:i+15], 100+j, p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for j := 0; j < 15; j++ {
|
||||||
|
b[i+j] = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Make sure matches outside the slice never trigger.
|
||||||
|
for i := 0; i <= len(b)-15; i++ {
|
||||||
|
for j := 0; j < 15; j++ {
|
||||||
|
b[i+j] = 1
|
||||||
|
}
|
||||||
|
for j := 0; j < 15; j++ {
|
||||||
|
p := IndexByte(b[i:i+15], byte(0))
|
||||||
|
if p != -1 {
|
||||||
|
t.Errorf("IndexByte(%q, %d) = %d", b[i:i+15], 0, p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for j := 0; j < 15; j++ {
|
||||||
|
b[i+j] = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestIndexRune(t *testing.T) {
|
func TestIndexRune(t *testing.T) {
|
||||||
for _, tt := range indexRuneTests {
|
for _, tt := range indexRuneTests {
|
||||||
a := []byte(tt.a)
|
a := []byte(tt.a)
|
||||||
|
|
@ -348,10 +383,12 @@ func TestIndexRune(t *testing.T) {
|
||||||
|
|
||||||
var bmbuf []byte
|
var bmbuf []byte
|
||||||
|
|
||||||
|
func BenchmarkIndexByte10(b *testing.B) { bmIndexByte(b, IndexByte, 10) }
|
||||||
func BenchmarkIndexByte32(b *testing.B) { bmIndexByte(b, IndexByte, 32) }
|
func BenchmarkIndexByte32(b *testing.B) { bmIndexByte(b, IndexByte, 32) }
|
||||||
func BenchmarkIndexByte4K(b *testing.B) { bmIndexByte(b, IndexByte, 4<<10) }
|
func BenchmarkIndexByte4K(b *testing.B) { bmIndexByte(b, IndexByte, 4<<10) }
|
||||||
func BenchmarkIndexByte4M(b *testing.B) { bmIndexByte(b, IndexByte, 4<<20) }
|
func BenchmarkIndexByte4M(b *testing.B) { bmIndexByte(b, IndexByte, 4<<20) }
|
||||||
func BenchmarkIndexByte64M(b *testing.B) { bmIndexByte(b, IndexByte, 64<<20) }
|
func BenchmarkIndexByte64M(b *testing.B) { bmIndexByte(b, IndexByte, 64<<20) }
|
||||||
|
func BenchmarkIndexBytePortable10(b *testing.B) { bmIndexByte(b, IndexBytePortable, 10) }
|
||||||
func BenchmarkIndexBytePortable32(b *testing.B) { bmIndexByte(b, IndexBytePortable, 32) }
|
func BenchmarkIndexBytePortable32(b *testing.B) { bmIndexByte(b, IndexBytePortable, 32) }
|
||||||
func BenchmarkIndexBytePortable4K(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<10) }
|
func BenchmarkIndexBytePortable4K(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<10) }
|
||||||
func BenchmarkIndexBytePortable4M(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<20) }
|
func BenchmarkIndexBytePortable4M(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<20) }
|
||||||
|
|
|
||||||
|
|
@ -1838,80 +1838,98 @@ TEXT strings·IndexByte(SB),NOSPLIT,$0-32
|
||||||
// AL: byte sought
|
// AL: byte sought
|
||||||
// R8: address to put result
|
// R8: address to put result
|
||||||
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
|
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
|
||||||
MOVQ SI, DI
|
// Shuffle X0 around so that each byte contains
|
||||||
|
// the character we're looking for.
|
||||||
CMPQ BX, $16
|
|
||||||
JLT small
|
|
||||||
|
|
||||||
CMPQ BX, $32
|
|
||||||
JA avx2
|
|
||||||
no_avx2:
|
|
||||||
// round up to first 16-byte boundary
|
|
||||||
TESTQ $15, SI
|
|
||||||
JZ aligned
|
|
||||||
MOVQ SI, CX
|
|
||||||
ANDQ $~15, CX
|
|
||||||
ADDQ $16, CX
|
|
||||||
|
|
||||||
// search the beginning
|
|
||||||
SUBQ SI, CX
|
|
||||||
REPN; SCASB
|
|
||||||
JZ success
|
|
||||||
|
|
||||||
// DI is 16-byte aligned; get ready to search using SSE instructions
|
|
||||||
aligned:
|
|
||||||
// round down to last 16-byte boundary
|
|
||||||
MOVQ BX, R11
|
|
||||||
ADDQ SI, R11
|
|
||||||
ANDQ $~15, R11
|
|
||||||
|
|
||||||
// shuffle X0 around so that each byte contains c
|
|
||||||
MOVD AX, X0
|
MOVD AX, X0
|
||||||
PUNPCKLBW X0, X0
|
PUNPCKLBW X0, X0
|
||||||
PUNPCKLBW X0, X0
|
PUNPCKLBW X0, X0
|
||||||
PSHUFL $0, X0, X0
|
PSHUFL $0, X0, X0
|
||||||
JMP condition
|
|
||||||
|
|
||||||
|
CMPQ BX, $16
|
||||||
|
JLT small
|
||||||
|
|
||||||
|
MOVQ SI, DI
|
||||||
|
|
||||||
|
CMPQ BX, $32
|
||||||
|
JA avx2
|
||||||
sse:
|
sse:
|
||||||
// move the next 16-byte chunk of the buffer into X1
|
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
|
||||||
MOVO (DI), X1
|
JMP sseloopentry
|
||||||
// compare bytes in X0 to X1
|
|
||||||
|
sseloop:
|
||||||
|
// Move the next 16-byte chunk of the data into X1.
|
||||||
|
MOVOU (DI), X1
|
||||||
|
// Compare bytes in X0 to X1.
|
||||||
PCMPEQB X0, X1
|
PCMPEQB X0, X1
|
||||||
// take the top bit of each byte in X1 and put the result in DX
|
// Take the top bit of each byte in X1 and put the result in DX.
|
||||||
PMOVMSKB X1, DX
|
PMOVMSKB X1, DX
|
||||||
TESTL DX, DX
|
// Find first set bit, if any.
|
||||||
|
BSFL DX, DX
|
||||||
JNZ ssesuccess
|
JNZ ssesuccess
|
||||||
|
// Advance to next block.
|
||||||
ADDQ $16, DI
|
ADDQ $16, DI
|
||||||
|
sseloopentry:
|
||||||
|
CMPQ DI, AX
|
||||||
|
JB sseloop
|
||||||
|
|
||||||
condition:
|
// Search the last 16-byte chunk. This chunk may overlap with the
|
||||||
CMPQ DI, R11
|
// chunks we've already searched, but that's ok.
|
||||||
JLT sse
|
MOVQ AX, DI
|
||||||
|
MOVOU (AX), X1
|
||||||
// search the end
|
PCMPEQB X0, X1
|
||||||
MOVQ SI, CX
|
PMOVMSKB X1, DX
|
||||||
ADDQ BX, CX
|
BSFL DX, DX
|
||||||
SUBQ R11, CX
|
JNZ ssesuccess
|
||||||
// if CX == 0, the zero flag will be set and we'll end up
|
|
||||||
// returning a false success
|
|
||||||
JZ failure
|
|
||||||
REPN; SCASB
|
|
||||||
JZ success
|
|
||||||
|
|
||||||
failure:
|
failure:
|
||||||
MOVQ $-1, (R8)
|
MOVQ $-1, (R8)
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
// We've found a chunk containing the byte.
|
||||||
|
// The chunk was loaded from DI.
|
||||||
|
// The index of the matching byte in the chunk is DX.
|
||||||
|
// The start of the data is SI.
|
||||||
|
ssesuccess:
|
||||||
|
SUBQ SI, DI // Compute offset of chunk within data.
|
||||||
|
ADDQ DX, DI // Add offset of byte within chunk.
|
||||||
|
MOVQ DI, (R8)
|
||||||
|
RET
|
||||||
|
|
||||||
// handle for lengths < 16
|
// handle for lengths < 16
|
||||||
small:
|
small:
|
||||||
MOVQ BX, CX
|
TESTQ BX, BX
|
||||||
REPN; SCASB
|
JEQ failure
|
||||||
JZ success
|
|
||||||
MOVQ $-1, (R8)
|
// Check if we'll load across a page boundary.
|
||||||
|
LEAQ 16(SI), AX
|
||||||
|
TESTW $0xff0, AX
|
||||||
|
JEQ endofpage
|
||||||
|
|
||||||
|
MOVOU (SI), X1 // Load data
|
||||||
|
PCMPEQB X0, X1 // Compare target byte with each byte in data.
|
||||||
|
PMOVMSKB X1, DX // Move result bits to integer register.
|
||||||
|
BSFL DX, DX // Find first set bit.
|
||||||
|
JZ failure // No set bit, failure.
|
||||||
|
CMPL DX, BX
|
||||||
|
JAE failure // Match is past end of data.
|
||||||
|
MOVQ DX, (R8)
|
||||||
|
RET
|
||||||
|
|
||||||
|
endofpage:
|
||||||
|
MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
|
||||||
|
PCMPEQB X0, X1 // Compare target byte with each byte in data.
|
||||||
|
PMOVMSKB X1, DX // Move result bits to integer register.
|
||||||
|
MOVL BX, CX
|
||||||
|
SHLL CX, DX
|
||||||
|
SHRL $16, DX // Shift desired bits down to bottom of register.
|
||||||
|
BSFL DX, DX // Find first set bit.
|
||||||
|
JZ failure // No set bit, failure.
|
||||||
|
MOVQ DX, (R8)
|
||||||
RET
|
RET
|
||||||
|
|
||||||
avx2:
|
avx2:
|
||||||
CMPB runtime·support_avx2(SB), $1
|
CMPB runtime·support_avx2(SB), $1
|
||||||
JNE no_avx2
|
JNE sse
|
||||||
MOVD AX, X0
|
MOVD AX, X0
|
||||||
LEAQ -32(SI)(BX*1), R11
|
LEAQ -32(SI)(BX*1), R11
|
||||||
VPBROADCASTB X0, Y1
|
VPBROADCASTB X0, Y1
|
||||||
|
|
@ -1941,22 +1959,6 @@ avx2success:
|
||||||
VZEROUPPER
|
VZEROUPPER
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// we've found the chunk containing the byte
|
|
||||||
// now just figure out which specific byte it is
|
|
||||||
ssesuccess:
|
|
||||||
// get the index of the least significant set bit
|
|
||||||
BSFW DX, DX
|
|
||||||
SUBQ SI, DI
|
|
||||||
ADDQ DI, DX
|
|
||||||
MOVQ DX, (R8)
|
|
||||||
RET
|
|
||||||
|
|
||||||
success:
|
|
||||||
SUBQ SI, DI
|
|
||||||
SUBL $1, DI
|
|
||||||
MOVQ DI, (R8)
|
|
||||||
RET
|
|
||||||
|
|
||||||
TEXT bytes·Equal(SB),NOSPLIT,$0-49
|
TEXT bytes·Equal(SB),NOSPLIT,$0-49
|
||||||
MOVQ a_len+8(FP), BX
|
MOVQ a_len+8(FP), BX
|
||||||
MOVQ b_len+32(FP), CX
|
MOVQ b_len+32(FP), CX
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue