mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
strings: better mean complexity for Count and Index.
The O(n+m) complexity is obtained probabilistically by using Rabin-Karp algorithm, which provides the needed complexity unless exceptional collisions occur, without memory allocation. benchmark old ns/op new ns/op delta BenchmarkIndexHard1 6532331 4045886 -38.06% BenchmarkIndexHard2 8178173 4038975 -50.61% BenchmarkIndexHard3 6973687 4042591 -42.03% BenchmarkCountHard1 6270864 4071090 -35.08% BenchmarkCountHard2 7838039 4072853 -48.04% BenchmarkCountHard3 6697828 4071964 -39.20% BenchmarkIndexTorture 2730546 28934 -98.94% BenchmarkCountTorture 2729622 29064 -98.94% Fixes #4600. R=rsc, donovanhide, remyoudompheng CC=golang-dev https://golang.org/cl/7314095
This commit is contained in:
parent
f1037f0e86
commit
23093f86ee
2 changed files with 109 additions and 8 deletions
|
|
@ -36,15 +36,35 @@ func explode(s string, n int) []string {
|
||||||
return a
|
return a
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// primeRK is the prime base used in Rabin-Karp algorithm.
|
||||||
|
const primeRK = 16777619
|
||||||
|
|
||||||
|
// hashstr returns the hash and the appropriate multiplicative
|
||||||
|
// factor for use in Rabin-Karp algorithm.
|
||||||
|
func hashstr(sep string) (uint32, uint32) {
|
||||||
|
hash := uint32(0)
|
||||||
|
for i := 0; i < len(sep); i++ {
|
||||||
|
hash = hash*primeRK + uint32(sep[i])
|
||||||
|
|
||||||
|
}
|
||||||
|
var pow, sq uint32 = 1, primeRK
|
||||||
|
for i := len(sep); i > 0; i >>= 1 {
|
||||||
|
if i&1 != 0 {
|
||||||
|
pow *= sq
|
||||||
|
}
|
||||||
|
sq *= sq
|
||||||
|
}
|
||||||
|
return hash, pow
|
||||||
|
}
|
||||||
|
|
||||||
// Count counts the number of non-overlapping instances of sep in s.
|
// Count counts the number of non-overlapping instances of sep in s.
|
||||||
func Count(s, sep string) int {
|
func Count(s, sep string) int {
|
||||||
if sep == "" {
|
if sep == "" {
|
||||||
return utf8.RuneCountInString(s) + 1
|
return utf8.RuneCountInString(s) + 1
|
||||||
}
|
}
|
||||||
c := sep[0]
|
c := sep[0]
|
||||||
l := len(sep)
|
|
||||||
n := 0
|
n := 0
|
||||||
if l == 1 {
|
if len(sep) == 1 {
|
||||||
// special case worth making fast
|
// special case worth making fast
|
||||||
for i := 0; i < len(s); i++ {
|
for i := 0; i < len(s); i++ {
|
||||||
if s[i] == c {
|
if s[i] == c {
|
||||||
|
|
@ -53,11 +73,26 @@ func Count(s, sep string) int {
|
||||||
}
|
}
|
||||||
return n
|
return n
|
||||||
}
|
}
|
||||||
for i := 0; i+l <= len(s); i++ {
|
if len(sep) > len(s) {
|
||||||
if s[i] == c && s[i:i+l] == sep {
|
return 0
|
||||||
|
}
|
||||||
|
hashsep, pow := hashstr(sep)
|
||||||
|
h := uint32(0)
|
||||||
|
for i := 0; i < len(sep); i++ {
|
||||||
|
h = h*primeRK + uint32(s[i])
|
||||||
|
}
|
||||||
|
lastmatch := 0
|
||||||
|
for i := len(sep); ; i++ {
|
||||||
|
// Invariant: h = hash(s[i-l : i])
|
||||||
|
if h == hashsep && lastmatch <= i-len(sep) && s[i-len(sep):i] == sep {
|
||||||
n++
|
n++
|
||||||
i += l - 1
|
lastmatch = i
|
||||||
}
|
}
|
||||||
|
if i >= len(s) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
h = h*primeRK + uint32(s[i])
|
||||||
|
h -= pow * uint32(s[i-len(sep)])
|
||||||
}
|
}
|
||||||
return n
|
return n
|
||||||
}
|
}
|
||||||
|
|
@ -94,10 +129,25 @@ func Index(s, sep string) int {
|
||||||
return -1
|
return -1
|
||||||
}
|
}
|
||||||
// n > 1
|
// n > 1
|
||||||
for i := 0; i+n <= len(s); i++ {
|
if n > len(s) {
|
||||||
if s[i] == c && s[i:i+n] == sep {
|
return -1
|
||||||
return i
|
}
|
||||||
|
// Hash sep.
|
||||||
|
hashsep, pow := hashstr(sep)
|
||||||
|
var h uint32
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
h = h*primeRK + uint32(s[i])
|
||||||
|
}
|
||||||
|
for i := n; ; i++ {
|
||||||
|
// Invariant: h = hash(s[i-n : i])
|
||||||
|
if h == hashsep && s[i-n:i] == sep {
|
||||||
|
return i - n
|
||||||
}
|
}
|
||||||
|
if i >= len(s) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
h = h*primeRK + uint32(s[i])
|
||||||
|
h -= pow * uint32(s[i-n])
|
||||||
}
|
}
|
||||||
return -1
|
return -1
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1001,6 +1001,57 @@ func TestEqualFold(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func makeBenchInputHard() string {
|
||||||
|
tokens := [...]string{
|
||||||
|
"<a>", "<p>", "<b>", "<strong>",
|
||||||
|
"</a>", "</p>", "</b>", "</strong>",
|
||||||
|
"hello", "world",
|
||||||
|
}
|
||||||
|
x := make([]byte, 0, 1<<20)
|
||||||
|
for len(x) < 1<<20 {
|
||||||
|
i := rand.Intn(len(tokens))
|
||||||
|
x = append(x, tokens[i]...)
|
||||||
|
}
|
||||||
|
return string(x)
|
||||||
|
}
|
||||||
|
|
||||||
|
var benchInputHard = makeBenchInputHard()
|
||||||
|
|
||||||
|
func benchmarkIndexHard(b *testing.B, sep string) {
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
Index(benchInputHard, sep)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func benchmarkCountHard(b *testing.B, sep string) {
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
Count(benchInputHard, sep)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkIndexHard1(b *testing.B) { benchmarkIndexHard(b, "<>") }
|
||||||
|
func BenchmarkIndexHard2(b *testing.B) { benchmarkIndexHard(b, "</pre>") }
|
||||||
|
func BenchmarkIndexHard3(b *testing.B) { benchmarkIndexHard(b, "<b>hello world</b>") }
|
||||||
|
|
||||||
|
func BenchmarkCountHard1(b *testing.B) { benchmarkCountHard(b, "<>") }
|
||||||
|
func BenchmarkCountHard2(b *testing.B) { benchmarkCountHard(b, "</pre>") }
|
||||||
|
func BenchmarkCountHard3(b *testing.B) { benchmarkCountHard(b, "<b>hello world</b>") }
|
||||||
|
|
||||||
|
var benchInputTorture = Repeat("ABC", 1<<10) + "123" + Repeat("ABC", 1<<10)
|
||||||
|
var benchNeedleTorture = Repeat("ABC", 1<<10+1)
|
||||||
|
|
||||||
|
func BenchmarkIndexTorture(b *testing.B) {
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
Index(benchInputTorture, benchNeedleTorture)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BenchmarkCountTorture(b *testing.B) {
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
Count(benchInputTorture, benchNeedleTorture)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var makeFieldsInput = func() string {
|
var makeFieldsInput = func() string {
|
||||||
x := make([]byte, 1<<20)
|
x := make([]byte, 1<<20)
|
||||||
// Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space.
|
// Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue