22b955cca5
Reviewed-on: https://go-review.googlesource.com/25150 From-SVN: r238662
320 lines
7.6 KiB
Go
320 lines
7.6 KiB
Go
// Copyright 2011 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package syntax
|
|
|
|
// Note to implementers:
|
|
// In this package, re is always a *Regexp and r is always a rune.
|
|
|
|
import (
|
|
"bytes"
|
|
"strconv"
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
// A Regexp is a node in a regular expression syntax tree.
|
|
type Regexp struct {
|
|
Op Op // operator
|
|
Flags Flags
|
|
Sub []*Regexp // subexpressions, if any
|
|
Sub0 [1]*Regexp // storage for short Sub
|
|
Rune []rune // matched runes, for OpLiteral, OpCharClass
|
|
Rune0 [2]rune // storage for short Rune
|
|
Min, Max int // min, max for OpRepeat
|
|
Cap int // capturing index, for OpCapture
|
|
Name string // capturing name, for OpCapture
|
|
}
|
|
|
|
// An Op is a single regular expression operator.
|
|
type Op uint8
|
|
|
|
// Operators are listed in precedence order, tightest binding to weakest.
|
|
// Character class operators are listed simplest to most complex
|
|
// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
|
|
|
|
const (
|
|
OpNoMatch Op = 1 + iota // matches no strings
|
|
OpEmptyMatch // matches empty string
|
|
OpLiteral // matches Runes sequence
|
|
OpCharClass // matches Runes interpreted as range pair list
|
|
OpAnyCharNotNL // matches any character except newline
|
|
OpAnyChar // matches any character
|
|
OpBeginLine // matches empty string at beginning of line
|
|
OpEndLine // matches empty string at end of line
|
|
OpBeginText // matches empty string at beginning of text
|
|
OpEndText // matches empty string at end of text
|
|
OpWordBoundary // matches word boundary `\b`
|
|
OpNoWordBoundary // matches word non-boundary `\B`
|
|
OpCapture // capturing subexpression with index Cap, optional name Name
|
|
OpStar // matches Sub[0] zero or more times
|
|
OpPlus // matches Sub[0] one or more times
|
|
OpQuest // matches Sub[0] zero or one times
|
|
OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
|
|
OpConcat // matches concatenation of Subs
|
|
OpAlternate // matches alternation of Subs
|
|
)
|
|
|
|
const opPseudo Op = 128 // where pseudo-ops start
|
|
|
|
// Equal returns true if x and y have identical structure.
|
|
func (x *Regexp) Equal(y *Regexp) bool {
|
|
if x == nil || y == nil {
|
|
return x == y
|
|
}
|
|
if x.Op != y.Op {
|
|
return false
|
|
}
|
|
switch x.Op {
|
|
case OpEndText:
|
|
// The parse flags remember whether this is \z or \Z.
|
|
if x.Flags&WasDollar != y.Flags&WasDollar {
|
|
return false
|
|
}
|
|
|
|
case OpLiteral, OpCharClass:
|
|
if len(x.Rune) != len(y.Rune) {
|
|
return false
|
|
}
|
|
for i, r := range x.Rune {
|
|
if r != y.Rune[i] {
|
|
return false
|
|
}
|
|
}
|
|
|
|
case OpAlternate, OpConcat:
|
|
if len(x.Sub) != len(y.Sub) {
|
|
return false
|
|
}
|
|
for i, sub := range x.Sub {
|
|
if !sub.Equal(y.Sub[i]) {
|
|
return false
|
|
}
|
|
}
|
|
|
|
case OpStar, OpPlus, OpQuest:
|
|
if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
|
|
return false
|
|
}
|
|
|
|
case OpRepeat:
|
|
if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
|
|
return false
|
|
}
|
|
|
|
case OpCapture:
|
|
if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// writeRegexp writes the Perl syntax for the regular expression re to b.
|
|
func writeRegexp(b *bytes.Buffer, re *Regexp) {
|
|
switch re.Op {
|
|
default:
|
|
b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
|
|
case OpNoMatch:
|
|
b.WriteString(`[^\x00-\x{10FFFF}]`)
|
|
case OpEmptyMatch:
|
|
b.WriteString(`(?:)`)
|
|
case OpLiteral:
|
|
if re.Flags&FoldCase != 0 {
|
|
b.WriteString(`(?i:`)
|
|
}
|
|
for _, r := range re.Rune {
|
|
escape(b, r, false)
|
|
}
|
|
if re.Flags&FoldCase != 0 {
|
|
b.WriteString(`)`)
|
|
}
|
|
case OpCharClass:
|
|
if len(re.Rune)%2 != 0 {
|
|
b.WriteString(`[invalid char class]`)
|
|
break
|
|
}
|
|
b.WriteRune('[')
|
|
if len(re.Rune) == 0 {
|
|
b.WriteString(`^\x00-\x{10FFFF}`)
|
|
} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
|
|
// Contains 0 and MaxRune. Probably a negated class.
|
|
// Print the gaps.
|
|
b.WriteRune('^')
|
|
for i := 1; i < len(re.Rune)-1; i += 2 {
|
|
lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
|
|
escape(b, lo, lo == '-')
|
|
if lo != hi {
|
|
b.WriteRune('-')
|
|
escape(b, hi, hi == '-')
|
|
}
|
|
}
|
|
} else {
|
|
for i := 0; i < len(re.Rune); i += 2 {
|
|
lo, hi := re.Rune[i], re.Rune[i+1]
|
|
escape(b, lo, lo == '-')
|
|
if lo != hi {
|
|
b.WriteRune('-')
|
|
escape(b, hi, hi == '-')
|
|
}
|
|
}
|
|
}
|
|
b.WriteRune(']')
|
|
case OpAnyCharNotNL:
|
|
b.WriteString(`(?-s:.)`)
|
|
case OpAnyChar:
|
|
b.WriteString(`(?s:.)`)
|
|
case OpBeginLine:
|
|
b.WriteString(`(?m:^)`)
|
|
case OpEndLine:
|
|
b.WriteString(`(?m:$)`)
|
|
case OpBeginText:
|
|
b.WriteString(`\A`)
|
|
case OpEndText:
|
|
if re.Flags&WasDollar != 0 {
|
|
b.WriteString(`(?-m:$)`)
|
|
} else {
|
|
b.WriteString(`\z`)
|
|
}
|
|
case OpWordBoundary:
|
|
b.WriteString(`\b`)
|
|
case OpNoWordBoundary:
|
|
b.WriteString(`\B`)
|
|
case OpCapture:
|
|
if re.Name != "" {
|
|
b.WriteString(`(?P<`)
|
|
b.WriteString(re.Name)
|
|
b.WriteRune('>')
|
|
} else {
|
|
b.WriteRune('(')
|
|
}
|
|
if re.Sub[0].Op != OpEmptyMatch {
|
|
writeRegexp(b, re.Sub[0])
|
|
}
|
|
b.WriteRune(')')
|
|
case OpStar, OpPlus, OpQuest, OpRepeat:
|
|
if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
|
|
b.WriteString(`(?:`)
|
|
writeRegexp(b, sub)
|
|
b.WriteString(`)`)
|
|
} else {
|
|
writeRegexp(b, sub)
|
|
}
|
|
switch re.Op {
|
|
case OpStar:
|
|
b.WriteRune('*')
|
|
case OpPlus:
|
|
b.WriteRune('+')
|
|
case OpQuest:
|
|
b.WriteRune('?')
|
|
case OpRepeat:
|
|
b.WriteRune('{')
|
|
b.WriteString(strconv.Itoa(re.Min))
|
|
if re.Max != re.Min {
|
|
b.WriteRune(',')
|
|
if re.Max >= 0 {
|
|
b.WriteString(strconv.Itoa(re.Max))
|
|
}
|
|
}
|
|
b.WriteRune('}')
|
|
}
|
|
if re.Flags&NonGreedy != 0 {
|
|
b.WriteRune('?')
|
|
}
|
|
case OpConcat:
|
|
for _, sub := range re.Sub {
|
|
if sub.Op == OpAlternate {
|
|
b.WriteString(`(?:`)
|
|
writeRegexp(b, sub)
|
|
b.WriteString(`)`)
|
|
} else {
|
|
writeRegexp(b, sub)
|
|
}
|
|
}
|
|
case OpAlternate:
|
|
for i, sub := range re.Sub {
|
|
if i > 0 {
|
|
b.WriteRune('|')
|
|
}
|
|
writeRegexp(b, sub)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (re *Regexp) String() string {
|
|
var b bytes.Buffer
|
|
writeRegexp(&b, re)
|
|
return b.String()
|
|
}
|
|
|
|
const meta = `\.+*?()|[]{}^$`
|
|
|
|
func escape(b *bytes.Buffer, r rune, force bool) {
|
|
if unicode.IsPrint(r) {
|
|
if strings.ContainsRune(meta, r) || force {
|
|
b.WriteRune('\\')
|
|
}
|
|
b.WriteRune(r)
|
|
return
|
|
}
|
|
|
|
switch r {
|
|
case '\a':
|
|
b.WriteString(`\a`)
|
|
case '\f':
|
|
b.WriteString(`\f`)
|
|
case '\n':
|
|
b.WriteString(`\n`)
|
|
case '\r':
|
|
b.WriteString(`\r`)
|
|
case '\t':
|
|
b.WriteString(`\t`)
|
|
case '\v':
|
|
b.WriteString(`\v`)
|
|
default:
|
|
if r < 0x100 {
|
|
b.WriteString(`\x`)
|
|
s := strconv.FormatInt(int64(r), 16)
|
|
if len(s) == 1 {
|
|
b.WriteRune('0')
|
|
}
|
|
b.WriteString(s)
|
|
break
|
|
}
|
|
b.WriteString(`\x{`)
|
|
b.WriteString(strconv.FormatInt(int64(r), 16))
|
|
b.WriteString(`}`)
|
|
}
|
|
}
|
|
|
|
// MaxCap walks the regexp to find the maximum capture index.
|
|
func (re *Regexp) MaxCap() int {
|
|
m := 0
|
|
if re.Op == OpCapture {
|
|
m = re.Cap
|
|
}
|
|
for _, sub := range re.Sub {
|
|
if n := sub.MaxCap(); m < n {
|
|
m = n
|
|
}
|
|
}
|
|
return m
|
|
}
|
|
|
|
// CapNames walks the regexp to find the names of capturing groups.
|
|
func (re *Regexp) CapNames() []string {
|
|
names := make([]string, re.MaxCap()+1)
|
|
re.capNames(names)
|
|
return names
|
|
}
|
|
|
|
func (re *Regexp) capNames(names []string) {
|
|
if re.Op == OpCapture {
|
|
names[re.Cap] = re.Name
|
|
}
|
|
for _, sub := range re.Sub {
|
|
sub.capNames(names)
|
|
}
|
|
}
|