| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | // Copyright 2010 The Go Authors. All rights reserved. | 
					
						
							|  |  |  | // Use of this source code is governed by a BSD-style | 
					
						
							|  |  |  | // license that can be found in the LICENSE file. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | package suffixarray | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import ( | 
					
						
							| 
									
										
										
										
											2011-01-11 21:46:50 -08:00
										 |  |  | 	"bytes" | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 	"regexp" | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 	"sort" | 
					
						
							|  |  |  | 	"strings" | 
					
						
							|  |  |  | 	"testing" | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							// testCase describes one indexing scenario: a text to build an Index from
// and the patterns that are subsequently searched in it.
type testCase struct {
	// name identifies the test case in failure messages.
	name string
	// source is the text that gets indexed.
	source string
	// patterns are looked up in source as literal strings and, when they
	// compile, as regular expressions.
	patterns []string
}
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | var testCases = []testCase{ | 
					
						
							| 
									
										
										
										
											2010-10-22 10:06:33 -07:00
										 |  |  | 	{ | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 		"empty string", | 
					
						
							|  |  |  | 		"", | 
					
						
							|  |  |  | 		[]string{ | 
					
						
							|  |  |  | 			"", | 
					
						
							|  |  |  | 			"foo", | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 			"(foo)", | 
					
						
							|  |  |  | 			".*", | 
					
						
							|  |  |  | 			"a*", | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 		}, | 
					
						
							|  |  |  | 	}, | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-10-22 10:06:33 -07:00
										 |  |  | 	{ | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 		"all a's", | 
					
						
							|  |  |  | 		"aaaaaaaaaa", // 10 a's | 
					
						
							|  |  |  | 		[]string{ | 
					
						
							|  |  |  | 			"", | 
					
						
							|  |  |  | 			"a", | 
					
						
							|  |  |  | 			"aa", | 
					
						
							|  |  |  | 			"aaa", | 
					
						
							|  |  |  | 			"aaaa", | 
					
						
							|  |  |  | 			"aaaaa", | 
					
						
							|  |  |  | 			"aaaaaa", | 
					
						
							|  |  |  | 			"aaaaaaa", | 
					
						
							|  |  |  | 			"aaaaaaaa", | 
					
						
							|  |  |  | 			"aaaaaaaaa", | 
					
						
							|  |  |  | 			"aaaaaaaaaa", | 
					
						
							|  |  |  | 			"aaaaaaaaaaa", // 11 a's | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 			".", | 
					
						
							|  |  |  | 			".*", | 
					
						
							|  |  |  | 			"a+", | 
					
						
							|  |  |  | 			"aa+", | 
					
						
							|  |  |  | 			"aaaa[b]?", | 
					
						
							|  |  |  | 			"aaa*", | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 		}, | 
					
						
							|  |  |  | 	}, | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-10-22 10:06:33 -07:00
										 |  |  | 	{ | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 		"abc", | 
					
						
							|  |  |  | 		"abc", | 
					
						
							|  |  |  | 		[]string{ | 
					
						
							|  |  |  | 			"a", | 
					
						
							|  |  |  | 			"b", | 
					
						
							|  |  |  | 			"c", | 
					
						
							|  |  |  | 			"ab", | 
					
						
							|  |  |  | 			"bc", | 
					
						
							|  |  |  | 			"abc", | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 			"a.c", | 
					
						
							|  |  |  | 			"a(b|c)", | 
					
						
							|  |  |  | 			"abc?", | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 		}, | 
					
						
							|  |  |  | 	}, | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-10-22 10:06:33 -07:00
										 |  |  | 	{ | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 		"barbara*3", | 
					
						
							|  |  |  | 		"barbarabarbarabarbara", | 
					
						
							|  |  |  | 		[]string{ | 
					
						
							|  |  |  | 			"a", | 
					
						
							|  |  |  | 			"bar", | 
					
						
							|  |  |  | 			"rab", | 
					
						
							|  |  |  | 			"arab", | 
					
						
							|  |  |  | 			"barbar", | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 			"bara?bar", | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 		}, | 
					
						
							|  |  |  | 	}, | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-10-22 10:06:33 -07:00
										 |  |  | 	{ | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 		"typing drill", | 
					
						
							|  |  |  | 		"Now is the time for all good men to come to the aid of their country.", | 
					
						
							|  |  |  | 		[]string{ | 
					
						
							|  |  |  | 			"Now", | 
					
						
							|  |  |  | 			"the time", | 
					
						
							|  |  |  | 			"to come the aid", | 
					
						
							|  |  |  | 			"is the time for all good men to come to the aid of their", | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 			"to (come|the)?", | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 		}, | 
					
						
							|  |  |  | 	}, | 
					
						
							| 
									
										
										
										
											2011-01-31 13:13:02 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	{ | 
					
						
							|  |  |  | 		"godoc simulation", | 
					
						
							|  |  |  | 		"package main\n\nimport(\n    \"rand\"\n    ", | 
					
						
							|  |  |  | 		[]string{}, | 
					
						
							|  |  |  | 	}, | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-05-30 18:02:59 +10:00
										 // find returns the start offsets, in increasing order, of the occurrences
// of s in src, including overlapping ones. At most n offsets are reported;
// a negative n means no limit. The result is nil when s is empty or n is 0.
func find(src, s string, n int) []int {
	if s == "" || n == 0 {
		return nil
	}

	var offsets []int
	// start is the first position at which the next occurrence may begin.
	for start := 0; n < 0 || len(offsets) < n; {
		rel := strings.Index(src[start:], s)
		if rel < 0 {
			break
		}
		pos := start + rel
		offsets = append(offsets, pos)
		// Resume one byte past the match so overlapping occurrences are found.
		start = pos + 1
	}
	return offsets
}
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | func testLookup(t *testing.T, tc *testCase, x *Index, s string, n int) { | 
					
						
							|  |  |  | 	res := x.Lookup([]byte(s), n) | 
					
						
							|  |  |  | 	exp := find(tc.source, s, n) | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 	// check that the lengths match | 
					
						
							|  |  |  | 	if len(res) != len(exp) { | 
					
						
							|  |  |  | 		t.Errorf("test %q, lookup %q (n = %d): expected %d results; got %d", tc.name, s, n, len(exp), len(res)) | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 	// if n >= 0 the number of results is limited --- unless n >= all results, | 
					
						
							|  |  |  | 	// we may obtain different positions from the Index and from find (because | 
					
						
							|  |  |  | 	// Index may not find the results in the same order as find) => in general | 
					
						
							|  |  |  | 	// we cannot simply check that the res and exp lists are equal | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// check that each result is in fact a correct match and there are no duplicates | 
					
						
							| 
									
										
										
										
											2011-07-08 10:52:50 +10:00
										 |  |  | 	sort.Ints(res) | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 	for i, r := range res { | 
					
						
							|  |  |  | 		if r < 0 || len(tc.source) <= r { | 
					
						
							|  |  |  | 			t.Errorf("test %q, lookup %q, result %d (n = %d): index %d out of range [0, %d[", tc.name, s, i, n, r, len(tc.source)) | 
					
						
							|  |  |  | 		} else if !strings.HasPrefix(tc.source[r:], s) { | 
					
						
							|  |  |  | 			t.Errorf("test %q, lookup %q, result %d (n = %d): index %d not a match", tc.name, s, i, n, r) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 		if i > 0 && res[i-1] == r { | 
					
						
							|  |  |  | 			t.Errorf("test %q, lookup %q, result %d (n = %d): found duplicate index %d", tc.name, s, i, n, r) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 	if n < 0 { | 
					
						
							|  |  |  | 		// all results computed - sorted res and exp must be equal | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 		for i, r := range res { | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 			e := exp[i] | 
					
						
							|  |  |  | 			if r != e { | 
					
						
							|  |  |  | 				t.Errorf("test %q, lookup %q, result %d: expected index %d; got %d", tc.name, s, i, e, r) | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 			} | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func testFindAllIndex(t *testing.T, tc *testCase, x *Index, rx *regexp.Regexp, n int) { | 
					
						
							|  |  |  | 	res := x.FindAllIndex(rx, n) | 
					
						
							|  |  |  | 	exp := rx.FindAllStringIndex(tc.source, n) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// check that the lengths match | 
					
						
							|  |  |  | 	if len(res) != len(exp) { | 
					
						
							|  |  |  | 		t.Errorf("test %q, FindAllIndex %q (n = %d): expected %d results; got %d", tc.name, rx, n, len(exp), len(res)) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// if n >= 0 the number of results is limited --- unless n >= all results, | 
					
						
							|  |  |  | 	// we may obtain different positions from the Index and from regexp (because | 
					
						
							|  |  |  | 	// Index may not find the results in the same order as regexp) => in general | 
					
						
							|  |  |  | 	// we cannot simply check that the res and exp lists are equal | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// check that each result is in fact a correct match and the result is sorted | 
					
						
							|  |  |  | 	for i, r := range res { | 
					
						
							|  |  |  | 		if r[0] < 0 || r[0] > r[1] || len(tc.source) < r[1] { | 
					
						
							|  |  |  | 			t.Errorf("test %q, FindAllIndex %q, result %d (n == %d): illegal match [%d, %d]", tc.name, rx, i, n, r[0], r[1]) | 
					
						
							|  |  |  | 		} else if !rx.MatchString(tc.source[r[0]:r[1]]) { | 
					
						
							|  |  |  | 			t.Errorf("test %q, FindAllIndex %q, result %d (n = %d): [%d, %d] not a match", tc.name, rx, i, n, r[0], r[1]) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 	if n < 0 { | 
					
						
							|  |  |  | 		// all results computed - sorted res and exp must be equal | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 		for i, r := range res { | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 			e := exp[i] | 
					
						
							|  |  |  | 			if r[0] != e[0] || r[1] != e[1] { | 
					
						
							|  |  |  | 				t.Errorf("test %q, FindAllIndex %q, result %d: expected match [%d, %d]; got [%d, %d]", | 
					
						
							|  |  |  | 					tc.name, rx, i, e[0], e[1], r[0], r[1]) | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 			} | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | func testLookups(t *testing.T, tc *testCase, x *Index, n int) { | 
					
						
							|  |  |  | 	for _, pat := range tc.patterns { | 
					
						
							|  |  |  | 		testLookup(t, tc, x, pat, n) | 
					
						
							|  |  |  | 		if rx, err := regexp.Compile(pat); err == nil { | 
					
						
							|  |  |  | 			testFindAllIndex(t, tc, x, rx, n) | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-01-11 21:46:50 -08:00
										 |  |  | // index is used to hide the sort.Interface | 
					
						
							|  |  |  | type index Index | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func (x *index) Len() int           { return len(x.sa) } | 
					
						
							|  |  |  | func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 } | 
					
						
							|  |  |  | func (x *index) Swap(i, j int)      { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] } | 
					
						
							|  |  |  | func (a *index) at(i int) []byte    { return a.data[a.sa[i]:] } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | func testConstruction(t *testing.T, tc *testCase, x *Index) { | 
					
						
							|  |  |  | 	if !sort.IsSorted((*index)(x)) { | 
					
						
							|  |  |  | 		t.Errorf("testConstruction failed %s", tc.name) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | func TestIndex(t *testing.T) { | 
					
						
							|  |  |  | 	for _, tc := range testCases { | 
					
						
							|  |  |  | 		x := New([]byte(tc.source)) | 
					
						
							| 
									
										
										
										
											2011-01-11 21:46:50 -08:00
										 |  |  | 		testConstruction(t, &tc, x) | 
					
						
							| 
									
										
										
										
											2010-12-17 14:00:46 -08:00
										 |  |  | 		testLookups(t, &tc, x, 0) | 
					
						
							|  |  |  | 		testLookups(t, &tc, x, 1) | 
					
						
							|  |  |  | 		testLookups(t, &tc, x, 10) | 
					
						
							|  |  |  | 		testLookups(t, &tc, x, 2e9) | 
					
						
							|  |  |  | 		testLookups(t, &tc, x, -1) | 
					
						
							| 
									
										
										
										
											2010-09-21 23:12:57 -07:00
										 |  |  | 	} | 
					
						
							|  |  |  | } |