mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
runtime: avoid using REP prefix for IndexByte
REP-prefixed instructions have a large startup cost. Avoid them like the plague.

benchmark                  old ns/op     new ns/op     delta
BenchmarkIndexByte10-8     22.4          5.34          -76.16%

Fixes #13983

Change-Id: I857e956e240fc9681d053f2584ccf24c1b272bb3
Reviewed-on: https://go-review.googlesource.com/18703
Reviewed-by: Minux Ma <minux@golang.org>
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
This commit is contained in:
parent
a337e30620
commit
687abca1ea
2 changed files with 110 additions and 71 deletions
|
|
@ -335,6 +335,41 @@ func TestIndexByteBig(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// test a small index across all page offsets
|
||||
// TestIndexByteSmall tests a small index across all page offsets.
//
// It slides a 15-byte window over a buffer that is bigger than a page and
// checks IndexByte on every window position in two phases:
//
//  1. Every byte placed inside the window is found at its exact index.
//  2. A byte value that exists only outside the window is never reported.
//
// The buffer is restored to all zeros after each window so that the
// positions are independent of one another.
func TestIndexByteSmall(t *testing.T) {
	const window = 15 // length of the slice handed to IndexByte

	b := make([]byte, 5015) // bigger than a page

	// Make sure we find the correct byte even when straddling a page.
	for i := 0; i <= len(b)-window; i++ {
		s := b[i : i+window]
		// Give every position in the window a distinct value.
		for j := range s {
			s[j] = byte(100 + j)
		}
		for j := range s {
			p := IndexByte(s, byte(100+j))
			if p != j {
				t.Errorf("IndexByte(%q, %d) = %d", s, 100+j, p)
			}
		}
		// Reset the window so the next position starts from zeros.
		for j := range s {
			s[j] = 0
		}
	}

	// Make sure matches outside the slice never trigger.
	for i := 0; i <= len(b)-window; i++ {
		s := b[i : i+window]
		for j := range s {
			s[j] = 1
		}
		// Everything in b outside s is zero here and everything inside s
		// is one, so a match for 0 can only come from outside the slice.
		// The arguments do not vary, so one call per window is enough.
		if p := IndexByte(s, byte(0)); p != -1 {
			t.Errorf("IndexByte(%q, %d) = %d", s, 0, p)
		}
		for j := range s {
			s[j] = 0
		}
	}
}
|
||||
|
||||
func TestIndexRune(t *testing.T) {
|
||||
for _, tt := range indexRuneTests {
|
||||
a := []byte(tt.a)
|
||||
|
|
@ -348,10 +383,12 @@ func TestIndexRune(t *testing.T) {
|
|||
|
||||
var bmbuf []byte
|
||||
|
||||
func BenchmarkIndexByte10(b *testing.B) { bmIndexByte(b, IndexByte, 10) }
|
||||
func BenchmarkIndexByte32(b *testing.B) { bmIndexByte(b, IndexByte, 32) }
|
||||
func BenchmarkIndexByte4K(b *testing.B) { bmIndexByte(b, IndexByte, 4<<10) }
|
||||
func BenchmarkIndexByte4M(b *testing.B) { bmIndexByte(b, IndexByte, 4<<20) }
|
||||
func BenchmarkIndexByte64M(b *testing.B) { bmIndexByte(b, IndexByte, 64<<20) }
|
||||
func BenchmarkIndexBytePortable10(b *testing.B) { bmIndexByte(b, IndexBytePortable, 10) }
|
||||
func BenchmarkIndexBytePortable32(b *testing.B) { bmIndexByte(b, IndexBytePortable, 32) }
|
||||
func BenchmarkIndexBytePortable4K(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<10) }
|
||||
func BenchmarkIndexBytePortable4M(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<20) }
|
||||
|
|
|
|||
|
|
@ -1838,80 +1838,98 @@ TEXT strings·IndexByte(SB),NOSPLIT,$0-32
|
|||
// AL: byte sought
|
||||
// R8: address to put result
|
||||
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
|
||||
MOVQ SI, DI
|
||||
|
||||
CMPQ BX, $16
|
||||
JLT small
|
||||
|
||||
CMPQ BX, $32
|
||||
JA avx2
|
||||
no_avx2:
|
||||
// round up to first 16-byte boundary
|
||||
TESTQ $15, SI
|
||||
JZ aligned
|
||||
MOVQ SI, CX
|
||||
ANDQ $~15, CX
|
||||
ADDQ $16, CX
|
||||
|
||||
// search the beginning
|
||||
SUBQ SI, CX
|
||||
REPN; SCASB
|
||||
JZ success
|
||||
|
||||
// DI is 16-byte aligned; get ready to search using SSE instructions
|
||||
aligned:
|
||||
// round down to last 16-byte boundary
|
||||
MOVQ BX, R11
|
||||
ADDQ SI, R11
|
||||
ANDQ $~15, R11
|
||||
|
||||
// shuffle X0 around so that each byte contains c
|
||||
// Shuffle X0 around so that each byte contains
|
||||
// the character we're looking for.
|
||||
MOVD AX, X0
|
||||
PUNPCKLBW X0, X0
|
||||
PUNPCKLBW X0, X0
|
||||
PSHUFL $0, X0, X0
|
||||
JMP condition
|
||||
|
||||
CMPQ BX, $16
|
||||
JLT small
|
||||
|
||||
MOVQ SI, DI
|
||||
|
||||
CMPQ BX, $32
|
||||
JA avx2
|
||||
sse:
|
||||
// move the next 16-byte chunk of the buffer into X1
|
||||
MOVO (DI), X1
|
||||
// compare bytes in X0 to X1
|
||||
PCMPEQB X0, X1
|
||||
// take the top bit of each byte in X1 and put the result in DX
|
||||
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
|
||||
JMP sseloopentry
|
||||
|
||||
sseloop:
|
||||
// Move the next 16-byte chunk of the data into X1.
|
||||
MOVOU (DI), X1
|
||||
// Compare bytes in X0 to X1.
|
||||
PCMPEQB X0, X1
|
||||
// Take the top bit of each byte in X1 and put the result in DX.
|
||||
PMOVMSKB X1, DX
|
||||
TESTL DX, DX
|
||||
JNZ ssesuccess
|
||||
ADDQ $16, DI
|
||||
// Find first set bit, if any.
|
||||
BSFL DX, DX
|
||||
JNZ ssesuccess
|
||||
// Advance to next block.
|
||||
ADDQ $16, DI
|
||||
sseloopentry:
|
||||
CMPQ DI, AX
|
||||
JB sseloop
|
||||
|
||||
condition:
|
||||
CMPQ DI, R11
|
||||
JLT sse
|
||||
|
||||
// search the end
|
||||
MOVQ SI, CX
|
||||
ADDQ BX, CX
|
||||
SUBQ R11, CX
|
||||
// if CX == 0, the zero flag will be set and we'll end up
|
||||
// returning a false success
|
||||
JZ failure
|
||||
REPN; SCASB
|
||||
JZ success
|
||||
// Search the last 16-byte chunk. This chunk may overlap with the
|
||||
// chunks we've already searched, but that's ok.
|
||||
MOVQ AX, DI
|
||||
MOVOU (AX), X1
|
||||
PCMPEQB X0, X1
|
||||
PMOVMSKB X1, DX
|
||||
BSFL DX, DX
|
||||
JNZ ssesuccess
|
||||
|
||||
failure:
|
||||
MOVQ $-1, (R8)
|
||||
RET
|
||||
|
||||
// We've found a chunk containing the byte.
|
||||
// The chunk was loaded from DI.
|
||||
// The index of the matching byte in the chunk is DX.
|
||||
// The start of the data is SI.
|
||||
ssesuccess:
|
||||
SUBQ SI, DI // Compute offset of chunk within data.
|
||||
ADDQ DX, DI // Add offset of byte within chunk.
|
||||
MOVQ DI, (R8)
|
||||
RET
|
||||
|
||||
// handle for lengths < 16
|
||||
small:
|
||||
MOVQ BX, CX
|
||||
REPN; SCASB
|
||||
JZ success
|
||||
MOVQ $-1, (R8)
|
||||
TESTQ BX, BX
|
||||
JEQ failure
|
||||
|
||||
// Check if we'll load across a page boundary.
|
||||
LEAQ 16(SI), AX
|
||||
TESTW $0xff0, AX
|
||||
JEQ endofpage
|
||||
|
||||
MOVOU (SI), X1 // Load data
|
||||
PCMPEQB X0, X1 // Compare target byte with each byte in data.
|
||||
PMOVMSKB X1, DX // Move result bits to integer register.
|
||||
BSFL DX, DX // Find first set bit.
|
||||
JZ failure // No set bit, failure.
|
||||
CMPL DX, BX
|
||||
JAE failure // Match is past end of data.
|
||||
MOVQ DX, (R8)
|
||||
RET
|
||||
|
||||
endofpage:
|
||||
MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
|
||||
PCMPEQB X0, X1 // Compare target byte with each byte in data.
|
||||
PMOVMSKB X1, DX // Move result bits to integer register.
|
||||
MOVL BX, CX
|
||||
SHLL CX, DX
|
||||
SHRL $16, DX // Shift desired bits down to bottom of register.
|
||||
BSFL DX, DX // Find first set bit.
|
||||
JZ failure // No set bit, failure.
|
||||
MOVQ DX, (R8)
|
||||
RET
|
||||
|
||||
avx2:
|
||||
CMPB runtime·support_avx2(SB), $1
|
||||
JNE no_avx2
|
||||
JNE sse
|
||||
MOVD AX, X0
|
||||
LEAQ -32(SI)(BX*1), R11
|
||||
VPBROADCASTB X0, Y1
|
||||
|
|
@ -1941,22 +1959,6 @@ avx2success:
|
|||
VZEROUPPER
|
||||
RET
|
||||
|
||||
// we've found the chunk containing the byte
|
||||
// now just figure out which specific byte it is
|
||||
ssesuccess:
|
||||
// get the index of the least significant set bit
|
||||
BSFW DX, DX
|
||||
SUBQ SI, DI
|
||||
ADDQ DI, DX
|
||||
MOVQ DX, (R8)
|
||||
RET
|
||||
|
||||
success:
|
||||
SUBQ SI, DI
|
||||
SUBL $1, DI
|
||||
MOVQ DI, (R8)
|
||||
RET
|
||||
|
||||
TEXT bytes·Equal(SB),NOSPLIT,$0-49
|
||||
MOVQ a_len+8(FP), BX
|
||||
MOVQ b_len+32(FP), CX
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue