// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package syntax import ( "os" "sort" "unicode" "utf8" ) // An Error describes a failure to parse a regular expression // and gives the offending expression. type Error struct { Code ErrorCode Expr string } func (e *Error) String() string { return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" } // An ErrorCode describes a failure to parse a regular expression. type ErrorCode string const ( // Unexpected error ErrInternalError ErrorCode = "regexp/syntax: internal error" // Parse errors ErrInvalidCharClass ErrorCode = "invalid character class" ErrInvalidCharRange ErrorCode = "invalid character class range" ErrInvalidEscape ErrorCode = "invalid escape sequence" ErrInvalidNamedCapture ErrorCode = "invalid named capture" ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax" ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator" ErrInvalidRepeatSize ErrorCode = "invalid repeat count" ErrInvalidUTF8 ErrorCode = "invalid UTF-8" ErrMissingBracket ErrorCode = "missing closing ]" ErrMissingParen ErrorCode = "missing closing )" ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" ) func (e ErrorCode) String() string { return string(e) } // Flags control the behavior of the parser and record information about regexp context. type Flags uint16 const ( FoldCase Flags = 1 << iota // case-insensitive match Literal // treat pattern as literal string ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline DotNL // allow . to match newline OneLine // treat ^ and $ as only matching at beginning and end of text NonGreedy // make repetition operators default to non-greedy PerlX // allow Perl extensions UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation WasDollar // regexp OpEndText was $, not \z Simple // regexp contains no counted repetition MatchNL = ClassNL | DotNL Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible POSIX Flags = 0 // POSIX syntax ) // Pseudo-ops for parsing stack. const ( opLeftParen = opPseudo + iota opVerticalBar ) type parser struct { flags Flags // parse mode flags stack []*Regexp // stack of parsed expressions numCap int // number of capturing groups seen wholeRegexp string } // Parse stack manipulation. // push pushes the regexp re onto the parse stack and returns the regexp. func (p *parser) push(re *Regexp) *Regexp { // TODO: automatic concatenation // TODO: turn character class into literal // TODO: compute simple p.stack = append(p.stack, re) return re } // newLiteral returns a new OpLiteral Regexp with the given flags func newLiteral(r int, flags Flags) *Regexp { re := &Regexp{ Op: OpLiteral, Flags: flags, } re.Rune0[0] = r re.Rune = re.Rune0[:1] return re } // literal pushes a literal regexp for the rune r on the stack // and returns that regexp. func (p *parser) literal(r int) *Regexp { return p.push(newLiteral(r, p.flags)) } // op pushes a regexp with the given op onto the stack // and returns that regexp. func (p *parser) op(op Op) *Regexp { return p.push(&Regexp{Op: op, Flags: p.flags}) } // repeat replaces the top stack element with itself repeated // according to op. func (p *parser) repeat(op Op, opstr string) os.Error { n := len(p.stack) if n == 0 { return &Error{ErrMissingRepeatArgument, opstr} } sub := p.stack[n-1] re := &Regexp{ Op: op, } re.Sub = re.Sub0[:1] re.Sub[0] = sub p.stack[n-1] = re return nil } // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. func (p *parser) concat() *Regexp { // TODO: Flatten concats. // Scan down to find pseudo-operator | or (. i := len(p.stack) for i > 0 && p.stack[i-1].Op < opPseudo { i-- } sub := p.stack[i:] p.stack = p.stack[:i] var re *Regexp switch len(sub) { case 0: re = &Regexp{Op: OpEmptyMatch} case 1: re = sub[0] default: re = &Regexp{Op: OpConcat} re.Sub = append(re.Sub0[:0], sub...) } return p.push(re) } // alternate replaces the top of the stack (above the topmost '(') with its alternation. func (p *parser) alternate() *Regexp { // TODO: Flatten alternates. // Scan down to find pseudo-operator (. // There are no | above (. i := len(p.stack) for i > 0 && p.stack[i-1].Op < opPseudo { i-- } sub := p.stack[i:] p.stack = p.stack[:i] var re *Regexp switch len(sub) { case 0: re = &Regexp{Op: OpNoMatch} case 1: re = sub[0] default: re = &Regexp{Op: OpAlternate} re.Sub = append(re.Sub0[:0], sub...) } return p.push(re) } // Parsing. func Parse(s string, flags Flags) (*Regexp, os.Error) { if flags&Literal != 0 { // Trivial parser for literal string. if err := checkUTF8(s); err != nil { return nil, err } re := &Regexp{ Op: OpLiteral, Flags: flags, } re.Rune = re.Rune0[:0] // use local storage for small strings for _, c := range s { if len(re.Rune) >= cap(re.Rune) { // string is too long to fit in Rune0. let Go handle it re.Rune = []int(s) break } re.Rune = append(re.Rune, c) } return re, nil } // Otherwise, must do real work. var ( p parser err os.Error c int op Op ) p.flags = flags p.wholeRegexp = s t := s for t != "" { switch t[0] { default: if c, t, err = nextRune(t); err != nil { return nil, err } p.literal(c) case '(': // TODO: Actual Perl flag parsing. if len(t) >= 3 && t[1] == '?' && t[2] == ':' { // non-capturing paren p.op(opLeftParen) t = t[3:] break } p.numCap++ p.op(opLeftParen).Cap = p.numCap t = t[1:] case '|': p.concat() if err = p.parseVerticalBar(); err != nil { return nil, err } t = t[1:] case ')': if err = p.parseRightParen(); err != nil { return nil, err } t = t[1:] case '^': if p.flags&OneLine != 0 { p.op(OpBeginText) } else { p.op(OpBeginLine) } t = t[1:] case '$': if p.flags&OneLine != 0 { p.op(OpEndText).Flags |= WasDollar } else { p.op(OpEndLine) } t = t[1:] case '.': if p.flags&DotNL != 0 { p.op(OpAnyChar) } else { p.op(OpAnyCharNotNL) } t = t[1:] case '[': if t, err = p.parseClass(t); err != nil { return nil, err } case '*', '+', '?': switch t[0] { case '*': op = OpStar case '+': op = OpPlus case '?': op = OpQuest } // TODO: greedy if err = p.repeat(op, t[0:1]); err != nil { return nil, err } t = t[1:] case '{': return nil, os.NewError("repeat not implemented") case '\\': return nil, os.NewError("escape not implemented") } } p.concat() if p.swapVerticalBar() { // pop vertical bar p.stack = p.stack[:len(p.stack)-1] } p.alternate() n := len(p.stack) if n != 1 { return nil, &Error{ErrMissingParen, s} } return p.stack[0], nil } // parseVerticalBar handles a | in the input. func (p *parser) parseVerticalBar() os.Error { p.concat() // The concatenation we just parsed is on top of the stack. // If it sits above an opVerticalBar, swap it below // (things below an opVerticalBar become an alternation). // Otherwise, push a new vertical bar. if !p.swapVerticalBar() { p.op(opVerticalBar) } return nil } // If the top of the stack is an element followed by an opVerticalBar // swapVerticalBar swaps the two and returns true. // Otherwise it returns false. func (p *parser) swapVerticalBar() bool { if n := len(p.stack); n >= 2 { re1 := p.stack[n-1] re2 := p.stack[n-2] if re2.Op == opVerticalBar { p.stack[n-2] = re1 p.stack[n-1] = re2 return true } } return false } // parseRightParen handles a ) in the input. func (p *parser) parseRightParen() os.Error { p.concat() if p.swapVerticalBar() { // pop vertical bar p.stack = p.stack[:len(p.stack)-1] } p.alternate() n := len(p.stack) if n < 2 { return &Error{ErrInternalError, ""} } re1 := p.stack[n-1] re2 := p.stack[n-2] p.stack = p.stack[:n-2] if re2.Op != opLeftParen { return &Error{ErrMissingParen, p.wholeRegexp} } if re2.Cap == 0 { // Just for grouping. p.push(re1) } else { re2.Op = OpCapture re2.Sub = re2.Sub0[:1] re2.Sub[0] = re1 p.push(re2) } return nil } // parseClassChar parses a character class character at the beginning of s // and returns it. func (p *parser) parseClassChar(s, wholeClass string) (r int, rest string, err os.Error) { if s == "" { return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} } // TODO: Escapes return nextRune(s) } // parseClass parses a character class at the beginning of s // and pushes it onto the parse stack. func (p *parser) parseClass(s string) (rest string, err os.Error) { t := s[1:] // chop [ re := &Regexp{Op: OpCharClass, Flags: p.flags} re.Rune = re.Rune0[:0] sign := +1 if t != "" && t[0] == '^' { sign = -1 t = t[1:] // If character class does not match \n, add it here, // so that negation later will do the right thing. if p.flags&ClassNL == 0 { re.Rune = append(re.Rune, '\n', '\n') } } class := re.Rune first := true // ] and - are okay as first char in class for t == "" || t[0] != ']' || first { // POSIX: - is only okay unescaped as first or last in class. // Perl: - is okay anywhere. if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { _, size := utf8.DecodeRuneInString(t[1:]) return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} } first = false // TODO: Look for [:alnum:] // TODO: Look for Unicode group. // TODO: Look for Perl group. // Single character or simple range. rng := t var lo, hi int if lo, t, err = p.parseClassChar(t, s); err != nil { return "", err } hi = lo // [a-] means (a|-) so check for final ]. if len(t) >= 2 && t[0] == '-' && t[1] != ']' { t = t[1:] if hi, t, err = p.parseClassChar(t, s); err != nil { return "", err } if hi < lo { rng = rng[:len(rng)-len(t)] return "", &Error{Code: ErrInvalidCharRange, Expr: rng} } } // Expand last range if overlaps or abuts. if n := len(class); n > 0 { clo, chi := class[n-2], class[n-1] if lo <= chi+1 && clo <= hi+1 { if lo < clo { class[n-2] = lo } if hi > chi { class[n-1] = hi } continue } } class = append(class, lo, hi) } t = t[1:] // chop ] // Use &re.Rune instead of &class to avoid allocation. re.Rune = class class = cleanClass(&re.Rune) if sign < 0 { class = negateClass(class) } re.Rune = class p.push(re) return t, nil } // cleanClass sorts the ranges (pairs of elements of r), // merges them, and eliminates duplicates. func cleanClass(rp *[]int) []int { // Sort by lo increasing, hi decreasing to break ties. sort.Sort(ranges{rp}) r := *rp // Merge abutting, overlapping. w := 2 // write index for i := 2; i < len(r); i += 2 { lo, hi := r[i], r[i+1] if lo <= r[w-1]+1 { // merge with previous range if hi > r[w-1] { r[w-1] = hi } continue } // new disjoint range r[w] = lo r[w+1] = hi w += 2 } return r[:w] } // negateClass overwrites r and returns r's negation. // It assumes the class r is already clean. func negateClass(r []int) []int { nextLo := 0 // lo end of next class to add w := 0 // write index for i := 0; i < len(r); i += 2 { lo, hi := r[i], r[i+1] if nextLo <= lo-1 { r[w] = nextLo r[w+1] = lo - 1 w += 2 } nextLo = hi + 1 } if nextLo <= unicode.MaxRune { // It's possible for the negation to have one more // range - this one - than the original class, so use append. r = append(r[:w], nextLo, unicode.MaxRune) } return r } // ranges implements sort.Interface on a []rune. // The choice of receiver type definition is strange // but avoids an allocation since we already have // a *[]int. type ranges struct { p *[]int } func (ra ranges) Less(i, j int) bool { p := *ra.p i *= 2 j *= 2 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] } func (ra ranges) Len() int { return len(*ra.p) / 2 } func (ra ranges) Swap(i, j int) { p := *ra.p i *= 2 j *= 2 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] } func checkUTF8(s string) os.Error { for s != "" { rune, size := utf8.DecodeRuneInString(s) if rune == utf8.RuneError && size == 1 { return &Error{Code: ErrInvalidUTF8, Expr: s} } s = s[size:] } return nil } func nextRune(s string) (c int, t string, err os.Error) { c, size := utf8.DecodeRuneInString(s) if c == utf8.RuneError && size == 1 { return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} } return c, s[size:], nil }