regexp: new regularized methods for matching.

The previous set was spotty, incomplete, and confusing.
This CL proposes a regular, clean set with clearer names.
It's also complete.  Many existing methods will be deprecated,
but not in this CL.  Ditto for the tests.

R=rsc, gri
CC=golang-dev, rog
https://golang.org/cl/1946041
This commit is contained in:
Rob Pike 2010-08-12 14:41:52 +10:00
parent d75074974a
commit 6610d79eda
2 changed files with 770 additions and 13 deletions

View file

@ -19,11 +19,41 @@
// '[' [ '^' ] { character-range } ']'
// '(' regexp ')'
// character-range:
// character '-' character
// character [ '-' character ]
//
// All characters are UTF-8-encoded code points.
// Backslashes escape special characters, including inside
// character classes.
// All characters are UTF-8-encoded code points. Backslashes escape special
// characters, including inside character classes.
//
// There are 16 methods of Regexp that match a regular expression and identify
// the matched text. Their names are matched by this regular expression:
//
// Find(All)?(String)?(Submatch)?(Index)?
//
// If 'All' is present, the routine matches successive non-overlapping
// matches of the entire expression. Empty matches abutting a preceding
// match are ignored. The return value is a slice containing the successive
// return values of the corresponding non-'All' routine. These routines take
// an extra integer argument, n; if n >= 0, the function returns at most n
// matches/submatches.
//
// If 'String' is present, the argument is a string; otherwise it is a slice
// of bytes; return values are adjusted as appropriate.
//
// If 'Submatch' is present, the return value is a slice identifying the
// successive submatches of the expression. Submatches are matches of
// parenthesized subexpressions within the regular expression, numbered from
// left to right in order of opening parenthesis. Submatch 0 is the match of
// the entire expression, submatch 1 the match of the first parenthesized
// subexpression, and so on.
//
// If 'Index' is present, matches and submatches are identified by byte index
// pairs within the input string: result[2*n:2*n+2] identifies the indexes of
// the nth submatch. The pair for n==0 identifies the match of the entire
// expression. If 'Index' is not present, the match is identified by the
// text of the match/submatch. If an index is negative, it means that
// subexpression did not match any string in the input.
//
// (There are a few other methods that do not match this pattern.)
//
package regexp
@ -814,7 +844,7 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
advance = bytes.Index(bytestr[pos:], re.prefixBytes)
}
if advance == -1 {
return []int{}
return nil
}
pos += advance + len(re.prefix)
prefixed = true
@ -914,6 +944,7 @@ func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
// s[a[2*i]:a[2*i+1]] for i > 0 is the substring matched by the ith parenthesized subexpression.
// A negative value means the subexpression did not match any element of the string.
// An empty array means "no match".
// Deprecated; use FindStringSubmatchIndex.
func (re *Regexp) ExecuteString(s string) (a []int) {
return re.doExecute(s, nil, 0)
}
@ -926,6 +957,7 @@ func (re *Regexp) ExecuteString(s string) (a []int) {
// b[a[2*i]:a[2*i+1]] for i > 0 is the subslice matched by the ith parenthesized subexpression.
// A negative value means the subexpression did not match any element of the slice.
// An empty array means "no match".
// Deprecated; use FindSubmatchIndex.
func (re *Regexp) Execute(b []byte) (a []int) { return re.doExecute("", b, 0) }
@ -944,6 +976,7 @@ func (re *Regexp) Match(b []byte) bool { return len(re.doExecute("", b, 0)) > 0
// a[0] is the substring matched by the entire expression.
// a[i] for i > 0 is the substring matched by the ith parenthesized subexpression.
// An empty array means ``no match''.
// Deprecated; use FindStringSubmatch.
func (re *Regexp) MatchStrings(s string) (a []string) {
r := re.doExecute(s, nil, 0)
if r == nil {
@ -963,6 +996,7 @@ func (re *Regexp) MatchStrings(s string) (a []string) {
// a[0] is the subslice matched by the entire expression.
// a[i] for i > 0 is the subslice matched by the ith parenthesized subexpression.
// An empty array means ``no match''.
// Deprecated; use FindSubmatch.
func (re *Regexp) MatchSlices(b []byte) (a [][]byte) {
r := re.doExecute("", b, 0)
if r == nil {
@ -1123,7 +1157,7 @@ func QuoteMeta(s string) string {
}
// Find matches in slice b if b is non-nil, otherwise find matches in string s.
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func(int, int)) {
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
var end int
if b == nil {
end = len(s)
@ -1162,7 +1196,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func(int, int))
prevMatchEnd = matches[1]
if accept {
deliver(matches[0], matches[1])
deliver(matches)
i++
}
}
@ -1173,14 +1207,18 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func(int, int))
// matches. Text that does not match the expression will be skipped. Empty
// matches abutting a preceding match are ignored. The function returns a slice
// containing the matching substrings.
// Deprecated; use FindAll.
func (re *Regexp) AllMatches(b []byte, n int) [][]byte {
if n <= 0 {
n = len(b) + 1
}
// TODO: here and below, grow the result slice on demand
// to avoid allocating a huge slice for a small result and
// wasting memory.
result := make([][]byte, n)
i := 0
re.allMatches("", b, n, func(start, end int) {
result[i] = b[start:end]
re.allMatches("", b, n, func(match []int) {
result[i] = b[match[0]:match[1]]
i++
})
return result[0:i]
@ -1191,14 +1229,15 @@ func (re *Regexp) AllMatches(b []byte, n int) [][]byte {
// matches. Text that does not match the expression will be skipped. Empty
// matches abutting a preceding match are ignored. The function returns a slice
// containing the matching substrings.
// Deprecated; use FindAllString.
func (re *Regexp) AllMatchesString(s string, n int) []string {
if n <= 0 {
n = len(s) + 1
}
result := make([]string, n)
i := 0
re.allMatches(s, nil, n, func(start, end int) {
result[i] = s[start:end]
re.allMatches(s, nil, n, func(match []int) {
result[i] = s[match[0]:match[1]]
i++
})
return result[0:i]
@ -1215,7 +1254,7 @@ func (re *Regexp) AllMatchesIter(b []byte, n int) <-chan []byte {
}
c := make(chan []byte, 10)
go func() {
re.allMatches("", b, n, func(start, end int) { c <- b[start:end] })
re.allMatches("", b, n, func(match []int) { c <- b[match[0]:match[1]] })
close(c)
}()
return c
@ -1232,8 +1271,284 @@ func (re *Regexp) AllMatchesStringIter(s string, n int) <-chan string {
}
c := make(chan string, 10)
go func() {
re.allMatches(s, nil, n, func(start, end int) { c <- s[start:end] })
re.allMatches(s, nil, n, func(match []int) { c <- s[match[0]:match[1]] })
close(c)
}()
return c
}
// Find locates the leftmost match of the regular expression in b and returns
// the matched text as a subslice of b.
// A nil result means the expression did not match.
func (re *Regexp) Find(b []byte) []byte {
	if loc := re.doExecute("", b, 0); loc != nil {
		// loc[0] and loc[1] delimit the match of the entire expression.
		return b[loc[0]:loc[1]]
	}
	return nil
}
// FindIndex locates the leftmost match of the regular expression in b and
// returns its position as a two-element slice of integers; the matched text
// is b[loc[0]:loc[1]].
// A nil result means the expression did not match.
func (re *Regexp) FindIndex(b []byte) (loc []int) {
	if pairs := re.doExecute("", b, 0); pairs != nil {
		// Keep only the first pair, which describes the whole match.
		loc = pairs[0:2]
	}
	return loc
}
// FindString locates the leftmost match of the regular expression in s and
// returns the matched text. The result is the empty string when there is no
// match, but also when the expression successfully matches an empty string;
// callers that must tell these cases apart should use FindStringIndex or
// FindStringSubmatch instead.
func (re *Regexp) FindString(s string) string {
	if loc := re.doExecute(s, nil, 0); loc != nil {
		// loc[0] and loc[1] delimit the match of the entire expression.
		return s[loc[0]:loc[1]]
	}
	return ""
}
// FindStringIndex locates the leftmost match of the regular expression in s
// and returns its position as a two-element slice of integers, loc; the
// matched text is s[loc[0]:loc[1]].
// A nil result means the expression did not match.
func (re *Regexp) FindStringIndex(s string) []int {
	if pairs := re.doExecute(s, nil, 0); pairs != nil {
		// Keep only the first pair, which describes the whole match.
		return pairs[0:2]
	}
	return nil
}
// FindSubmatch locates the leftmost match of the regular expression in b and
// returns the text of that match followed by the text of each of its
// subexpressions, as defined by the 'Submatch' description in the package
// comment. Entries for subexpressions that did not match are left nil.
// A nil result means the expression did not match.
func (re *Regexp) FindSubmatch(b []byte) [][]byte {
	idx := re.doExecute("", b, 0)
	if idx == nil {
		return nil
	}
	// Every submatch occupies one pair of indexes in idx.
	groups := make([][]byte, len(idx)/2)
	for g := 0; g < len(groups); g++ {
		lo, hi := idx[2*g], idx[2*g+1]
		if lo < 0 {
			// Negative start: this subexpression took no part in the match.
			continue
		}
		groups[g] = b[lo:hi]
	}
	return groups
}
// FindSubmatchIndex locates the leftmost match of the regular expression in
// b and returns the index pairs for that match and for each of its
// subexpressions, as defined by the 'Submatch' and 'Index' descriptions in
// the package comment.
// A nil result means the expression did not match.
func (re *Regexp) FindSubmatchIndex(b []byte) []int {
	// The matcher's result is returned unchanged.
	pairs := re.doExecute("", b, 0)
	return pairs
}
// FindStringSubmatch locates the leftmost match of the regular expression in
// s and returns the text of that match followed by the text of each of its
// subexpressions, as defined by the 'Submatch' description in the package
// comment. Entries for subexpressions that did not match are left as the
// empty string.
// A nil result means the expression did not match.
func (re *Regexp) FindStringSubmatch(s string) []string {
	idx := re.doExecute(s, nil, 0)
	if idx == nil {
		return nil
	}
	// Every submatch occupies one pair of indexes in idx.
	groups := make([]string, len(idx)/2)
	for g := 0; g < len(groups); g++ {
		lo, hi := idx[2*g], idx[2*g+1]
		if lo < 0 {
			// Negative start: this subexpression took no part in the match.
			continue
		}
		groups[g] = s[lo:hi]
	}
	return groups
}
// FindStringSubmatchIndex locates the leftmost match of the regular
// expression in s and returns the index pairs for that match and for each of
// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
// in the package comment.
// A nil result means the expression did not match.
func (re *Regexp) FindStringSubmatchIndex(s string) []int {
	// The matcher's result is returned unchanged.
	pairs := re.doExecute(s, nil, 0)
	return pairs
}
// FindAll is the 'All' version of Find; it returns a slice of all successive
// matches of the expression, as defined by the 'All' description in the
// package comment. Each element is a subslice of b holding the text of one
// match. If n >= 0, at most n matches are returned; if n < 0, the number of
// matches is not limited.
// A return value of nil indicates no match.
func (re *Regexp) FindAll(b []byte, n int) [][]byte {
	// len(b)+1 is the bound the n < 0 ("all matches") case already uses to
	// size the result. Clamp larger values of n to it so that a caller
	// passing a big "effectively unlimited" n does not allocate a huge slice
	// for a small input.
	if limit := len(b) + 1; n < 0 || n > limit {
		n = limit
	}
	result := make([][]byte, n)
	i := 0
	re.allMatches("", b, n, func(match []int) {
		// match[0] and match[1] delimit the match of the entire expression.
		result[i] = b[match[0]:match[1]]
		i++
	})
	if i == 0 {
		return nil
	}
	return result[0:i]
}
// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
// successive matches of the expression, as defined by the 'All' description
// in the package comment. Each element is a two-element slice giving the
// location of one match in b. If n >= 0, at most n matches are returned; if
// n < 0, the number of matches is not limited.
// A return value of nil indicates no match.
func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
	// len(b)+1 is the bound the n < 0 ("all matches") case already uses to
	// size the result. Clamp larger values of n to it so that a caller
	// passing a big "effectively unlimited" n does not allocate a huge slice
	// for a small input.
	if limit := len(b) + 1; n < 0 || n > limit {
		n = limit
	}
	result := make([][]int, n)
	i := 0
	re.allMatches("", b, n, func(match []int) {
		// Keep only the first pair, which describes the whole match.
		result[i] = match[0:2]
		i++
	})
	if i == 0 {
		return nil
	}
	return result[0:i]
}
// FindAllString is the 'All' version of FindString; it returns a slice of all
// successive matches of the expression, as defined by the 'All' description
// in the package comment. Each element is the text of one match in s. If
// n >= 0, at most n matches are returned; if n < 0, the number of matches is
// not limited.
// A return value of nil indicates no match.
func (re *Regexp) FindAllString(s string, n int) []string {
	// len(s)+1 is the bound the n < 0 ("all matches") case already uses to
	// size the result. Clamp larger values of n to it so that a caller
	// passing a big "effectively unlimited" n does not allocate a huge slice
	// for a small input.
	if limit := len(s) + 1; n < 0 || n > limit {
		n = limit
	}
	result := make([]string, n)
	i := 0
	re.allMatches(s, nil, n, func(match []int) {
		// match[0] and match[1] delimit the match of the entire expression.
		result[i] = s[match[0]:match[1]]
		i++
	})
	if i == 0 {
		return nil
	}
	return result[0:i]
}
// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
// slice of all successive matches of the expression, as defined by the 'All'
// description in the package comment. Each element is a two-element slice
// giving the location of one match in s. If n >= 0, at most n matches are
// returned; if n < 0, the number of matches is not limited.
// A return value of nil indicates no match.
func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
	// len(s)+1 is the bound the n < 0 ("all matches") case already uses to
	// size the result. Clamp larger values of n to it so that a caller
	// passing a big "effectively unlimited" n does not allocate a huge slice
	// for a small input.
	if limit := len(s) + 1; n < 0 || n > limit {
		n = limit
	}
	result := make([][]int, n)
	i := 0
	re.allMatches(s, nil, n, func(match []int) {
		// Keep only the first pair, which describes the whole match.
		result[i] = match[0:2]
		i++
	})
	if i == 0 {
		return nil
	}
	return result[0:i]
}
// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
// of all successive matches of the expression, as defined by the 'All'
// description in the package comment. Each element holds the text of one
// match and of its subexpressions, as subslices of b; entries for
// subexpressions that did not match are nil. If n >= 0, at most n matches
// are returned; if n < 0, the number of matches is not limited.
// A return value of nil indicates no match.
func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
	// len(b)+1 is the bound the n < 0 ("all matches") case already uses to
	// size the result. Clamp larger values of n to it so that a caller
	// passing a big "effectively unlimited" n does not allocate a huge slice
	// for a small input.
	if limit := len(b) + 1; n < 0 || n > limit {
		n = limit
	}
	result := make([][][]byte, n)
	i := 0
	re.allMatches("", b, n, func(match []int) {
		// Every submatch occupies one pair of indexes in match.
		slice := make([][]byte, len(match)/2)
		for j := range slice {
			// A negative index means the subexpression did not match;
			// its entry stays nil.
			if match[2*j] >= 0 {
				slice[j] = b[match[2*j]:match[2*j+1]]
			}
		}
		result[i] = slice
		i++
	})
	if i == 0 {
		return nil
	}
	return result[0:i]
}
// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
// a slice of all successive matches of the expression, as defined by the
// 'All' description in the package comment. Each element holds the index
// pairs for one match in b and for its subexpressions. If n >= 0, at most n
// matches are returned; if n < 0, the number of matches is not limited.
// A return value of nil indicates no match.
func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
	// len(b)+1 is the bound the n < 0 ("all matches") case already uses to
	// size the result. Clamp larger values of n to it so that a caller
	// passing a big "effectively unlimited" n does not allocate a huge slice
	// for a small input.
	if limit := len(b) + 1; n < 0 || n > limit {
		n = limit
	}
	result := make([][]int, n)
	i := 0
	re.allMatches("", b, n, func(match []int) {
		// Store the delivered index slice as is.
		result[i] = match
		i++
	})
	if i == 0 {
		return nil
	}
	return result[0:i]
}
// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
// returns a slice of all successive matches of the expression, as defined by
// the 'All' description in the package comment. Each element holds the text
// of one match in s and of its subexpressions; entries for subexpressions
// that did not match are empty strings. If n >= 0, at most n matches are
// returned; if n < 0, the number of matches is not limited.
// A return value of nil indicates no match.
func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
	// len(s)+1 is the bound the n < 0 ("all matches") case already uses to
	// size the result. Clamp larger values of n to it so that a caller
	// passing a big "effectively unlimited" n does not allocate a huge slice
	// for a small input.
	if limit := len(s) + 1; n < 0 || n > limit {
		n = limit
	}
	result := make([][]string, n)
	i := 0
	re.allMatches(s, nil, n, func(match []int) {
		// Every submatch occupies one pair of indexes in match.
		slice := make([]string, len(match)/2)
		for j := range slice {
			// A negative index means the subexpression did not match;
			// its entry stays the empty string.
			if match[2*j] >= 0 {
				slice[j] = s[match[2*j]:match[2*j+1]]
			}
		}
		result[i] = slice
		i++
	})
	if i == 0 {
		return nil
	}
	return result[0:i]
}
// FindAllStringSubmatchIndex is the 'All' version of
// FindStringSubmatchIndex; it returns a slice of all successive matches of
// the expression, as defined by the 'All' description in the package
// comment. Each element holds the index pairs for one match in s and for its
// subexpressions. If n >= 0, at most n matches are returned; if n < 0, the
// number of matches is not limited.
// A return value of nil indicates no match.
func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
	// len(s)+1 is the bound the n < 0 ("all matches") case already uses to
	// size the result. Clamp larger values of n to it so that a caller
	// passing a big "effectively unlimited" n does not allocate a huge slice
	// for a small input.
	if limit := len(s) + 1; n < 0 || n > limit {
		n = limit
	}
	result := make([][]int, n)
	i := 0
	re.allMatches(s, nil, n, func(match []int) {
		// Store the delivered index slice as is.
		result[i] = match
		i++
	})
	if i == 0 {
		return nil
	}
	return result[0:i]
}