mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
exp/regexp/syntax: syntax data structures, parser
Parser is a work in progress but can populate most of the interesting parts of the data structure, so a good checkpoint. All the complicated Perl syntax is missing, as are various important optimizations made during parsing to the syntax tree. The plan is that exp/regexp's API will mimic regexp, and exp/regexp/syntax provides the parser directly for programs that need it (and for implementing exp/regexp). Once finished, exp/regexp will replace regexp. R=r, sam.thorogood, kevlar, edsrzf CC=golang-dev https://golang.org/cl/4538123
This commit is contained in:
parent
b4bb970e18
commit
b96c3477f8
5 changed files with 1050 additions and 0 deletions
210
src/pkg/exp/regexp/syntax/regexp.go
Normal file
210
src/pkg/exp/regexp/syntax/regexp.go
Normal file
|
|
@ -0,0 +1,210 @@
|
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package syntax parses regular expressions into syntax trees.
|
||||
// WORK IN PROGRESS.
|
||||
package syntax
|
||||
|
||||
// Note to implementers:
|
||||
// In this package, re is always a *Regexp and r is always a rune.
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// A Regexp is a node in a regular expression syntax tree.
|
||||
type Regexp struct {
|
||||
Op Op // operator
|
||||
Flags Flags
|
||||
Sub []*Regexp // subexpressions, if any
|
||||
Sub0 [1]*Regexp // storage for short Sub
|
||||
Rune []int // matched runes, for OpLiteral, OpCharClass
|
||||
Rune0 [2]int // storage for short Rune
|
||||
Min, Max int // min, max for OpRepeat
|
||||
Cap int // capturing index, for OpCapture
|
||||
Name string // capturing name, for OpCapture
|
||||
}
|
||||
|
||||
// An Op is a single regular expression operator.
|
||||
type Op uint8
|
||||
|
||||
// Operators are listed in precedence order, tightest binding to weakest.
|
||||
|
||||
const (
|
||||
OpNoMatch Op = 1 + iota // matches no strings
|
||||
OpEmptyMatch // matches empty string
|
||||
OpLiteral // matches Runes sequence
|
||||
OpCharClass // matches Runes interpreted as range pair list
|
||||
OpAnyCharNotNL // matches any character
|
||||
OpAnyChar // matches any character
|
||||
OpBeginLine // matches empty string at beginning of line
|
||||
OpEndLine // matches empty string at end of line
|
||||
OpBeginText // matches empty string at beginning of text
|
||||
OpEndText // matches empty string at end of text
|
||||
OpWordBoundary // matches word boundary `\b`
|
||||
OpNoWordBoundary // matches word non-boundary `\B`
|
||||
OpCapture // capturing subexpression with index Cap, optional name Name
|
||||
OpStar // matches Sub[0] zero or more times
|
||||
OpPlus // matches Sub[0] one or more times
|
||||
OpQuest // matches Sub[0] zero or one times
|
||||
OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
|
||||
OpConcat // matches concatenation of Subs
|
||||
OpAlternate // matches alternation of Subs
|
||||
)
|
||||
|
||||
const opPseudo Op = 128 // where pseudo-ops start
|
||||
|
||||
// writeRegexp writes the Perl syntax for the regular expression re to b.
|
||||
func writeRegexp(b *bytes.Buffer, re *Regexp) {
|
||||
switch re.Op {
|
||||
default:
|
||||
b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
|
||||
case OpNoMatch:
|
||||
b.WriteString(`[^\x00-\x{10FFFF}]`)
|
||||
case OpEmptyMatch:
|
||||
b.WriteString(`(?:)`)
|
||||
case OpLiteral:
|
||||
for _, r := range re.Rune {
|
||||
escape(b, r, false)
|
||||
}
|
||||
case OpCharClass:
|
||||
if len(re.Rune)%2 != 0 {
|
||||
b.WriteString(`[invalid char class]`)
|
||||
break
|
||||
}
|
||||
b.WriteRune('[')
|
||||
if len(re.Rune) > 0 && re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
|
||||
// Contains 0 and MaxRune. Probably a negated class.
|
||||
// Print the gaps.
|
||||
b.WriteRune('^')
|
||||
for i := 1; i < len(re.Rune)-1; i += 2 {
|
||||
lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
|
||||
escape(b, lo, lo == '-')
|
||||
if lo != hi {
|
||||
b.WriteRune('-')
|
||||
escape(b, hi, hi == '-')
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for i := 0; i < len(re.Rune); i += 2 {
|
||||
lo, hi := re.Rune[i], re.Rune[i+1]
|
||||
escape(b, lo, lo == '-')
|
||||
if lo != hi {
|
||||
b.WriteRune('-')
|
||||
escape(b, hi, hi == '-')
|
||||
}
|
||||
}
|
||||
}
|
||||
b.WriteRune(']')
|
||||
case OpAnyCharNotNL:
|
||||
b.WriteString(`[^\n]`)
|
||||
case OpAnyChar:
|
||||
b.WriteRune('.')
|
||||
case OpBeginLine:
|
||||
b.WriteRune('^')
|
||||
case OpEndLine:
|
||||
b.WriteRune('$')
|
||||
case OpBeginText:
|
||||
b.WriteString(`\A`)
|
||||
case OpEndText:
|
||||
b.WriteString(`\z`)
|
||||
case OpWordBoundary:
|
||||
b.WriteString(`\b`)
|
||||
case OpNoWordBoundary:
|
||||
b.WriteString(`\B`)
|
||||
case OpCapture:
|
||||
if re.Name != "" {
|
||||
b.WriteString(`(?P<`)
|
||||
b.WriteString(re.Name)
|
||||
b.WriteRune('>')
|
||||
} else {
|
||||
b.WriteRune('(')
|
||||
}
|
||||
writeRegexp(b, re.Sub[0])
|
||||
b.WriteRune(')')
|
||||
case OpStar, OpPlus, OpQuest, OpRepeat:
|
||||
if sub := re.Sub[0]; sub.Op > OpCapture {
|
||||
b.WriteString(`(?:`)
|
||||
writeRegexp(b, sub)
|
||||
b.WriteString(`)`)
|
||||
} else {
|
||||
writeRegexp(b, sub)
|
||||
}
|
||||
switch re.Op {
|
||||
case OpStar:
|
||||
b.WriteRune('*')
|
||||
case OpPlus:
|
||||
b.WriteRune('+')
|
||||
case OpQuest:
|
||||
b.WriteRune('?')
|
||||
case OpRepeat:
|
||||
b.WriteRune('{')
|
||||
b.WriteString(strconv.Itoa(re.Min))
|
||||
if re.Max != re.Min {
|
||||
b.WriteRune(',')
|
||||
if re.Max >= 0 {
|
||||
b.WriteString(strconv.Itoa(re.Max))
|
||||
}
|
||||
}
|
||||
b.WriteRune('}')
|
||||
}
|
||||
case OpConcat:
|
||||
for _, sub := range re.Sub {
|
||||
if sub.Op == OpAlternate {
|
||||
b.WriteString(`(?:`)
|
||||
writeRegexp(b, sub)
|
||||
b.WriteString(`)`)
|
||||
} else {
|
||||
writeRegexp(b, sub)
|
||||
}
|
||||
}
|
||||
case OpAlternate:
|
||||
for i, sub := range re.Sub {
|
||||
if i > 0 {
|
||||
b.WriteRune('|')
|
||||
}
|
||||
writeRegexp(b, sub)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (re *Regexp) String() string {
|
||||
var b bytes.Buffer
|
||||
writeRegexp(&b, re)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
const meta = `\.+*?()|[]{}^$`
|
||||
|
||||
func escape(b *bytes.Buffer, r int, force bool) {
|
||||
if unicode.IsPrint(r) {
|
||||
if strings.IndexRune(meta, r) >= 0 || force {
|
||||
b.WriteRune('\\')
|
||||
}
|
||||
b.WriteRune(r)
|
||||
return
|
||||
}
|
||||
|
||||
switch r {
|
||||
case '\a':
|
||||
b.WriteString(`\a`)
|
||||
case '\f':
|
||||
b.WriteString(`\f`)
|
||||
case '\n':
|
||||
b.WriteString(`\n`)
|
||||
case '\r':
|
||||
b.WriteString(`\r`)
|
||||
case '\t':
|
||||
b.WriteString(`\t`)
|
||||
case '\v':
|
||||
b.WriteString(`\v`)
|
||||
default:
|
||||
b.WriteString(`\x{`)
|
||||
b.WriteString(strconv.Itob(r, 16))
|
||||
b.WriteString(`}`)
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue