// Copyright 2010 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package regexp import ( "bufio" "compress/bzip2" "fmt" "io" "math/rand" "os" "path/filepath" "regexp/syntax" "strconv" "strings" "testing" "unicode/utf8" ) // TestRE2 tests this package's regexp API against test cases // considered during RE2's exhaustive tests, which run all possible // regexps over a given set of atoms and operators, up to a given // complexity, over all possible strings over a given alphabet, // up to a given size. Rather than try to link with RE2, we read a // log file containing the test cases and the expected matches. // The log file, re2.txt, is generated by running 'make exhaustive-log' // in the open source RE2 distribution. http://code.google.com/p/re2/ // // The test file format is a sequence of stanzas like: // // strings // "abc" // "123x" // regexps // "[a-z]+" // 0-3;0-3 // -;- // "([0-9])([0-9])([0-9])" // -;- // -;0-3 0-1 1-2 2-3 // // The stanza begins by defining a set of strings, quoted // using Go double-quote syntax, one per line. Then the // regexps section gives a sequence of regexps to run on // the strings. In the block that follows a regexp, each line // gives the semicolon-separated match results of running // the regexp on the corresponding string. // Each match result is either a single -, meaning no match, or a // space-separated sequence of pairs giving the match and // submatch indices. An unmatched subexpression formats // its pair as a single - (not illustrated above). For now // each regexp run produces two match results, one for a // ``full match'' that restricts the regexp to matching the entire // string or nothing, and one for a ``partial match'' that gives // the leftmost first match found in the string. // // Lines beginning with # are comments. Lines beginning with // a capital letter are test names printed during RE2's test suite // and are echoed into t but otherwise ignored. // // At time of writing, re2.txt is 32 MB but compresses to 760 kB, // so we store re2.txt.gz in the repository and decompress it on the fly. // func TestRE2Search(t *testing.T) { testRE2(t, "testdata/re2-search.txt") } func TestRE2Exhaustive(t *testing.T) { if testing.Short() { t.Log("skipping TestRE2Exhaustive during short test") return } testRE2(t, "testdata/re2-exhaustive.txt.bz2") } func testRE2(t *testing.T, file string) { f, err := os.Open(file) if err != nil { t.Fatal(err) } defer f.Close() var txt io.Reader if strings.HasSuffix(file, ".bz2") { z := bzip2.NewReader(f) txt = z file = file[:len(file)-len(".bz2")] // for error messages } else { txt = f } lineno := 0 r := bufio.NewReader(txt) var ( str []string input []string inStrings bool re *Regexp refull *Regexp nfail int ncase int ) for { line, err := r.ReadString('\n') if err != nil { if err == io.EOF { break } t.Fatalf("%s:%d: %v", file, lineno, err) } line = line[:len(line)-1] // chop \n lineno++ switch { case line == "": t.Fatalf("%s:%d: unexpected blank line", file, lineno) case line[0] == '#': continue case 'A' <= line[0] && line[0] <= 'Z': // Test name. t.Logf("%s\n", line) continue case line == "strings": str = str[:0] inStrings = true case line == "regexps": inStrings = false case line[0] == '"': q, err := strconv.Unquote(line) if err != nil { // Fatal because we'll get out of sync. t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err) } if inStrings { str = append(str, q) continue } // Is a regexp. if len(input) != 0 { t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q) } re, err = tryCompile(q) if err != nil { if err.Error() == "error parsing regexp: invalid escape sequence: `\\C`" { // We don't and likely never will support \C; keep going. continue } t.Errorf("%s:%d: compile %#q: %v", file, lineno, q, err) if nfail++; nfail >= 100 { t.Fatalf("stopping after %d errors", nfail) } continue } full := `\A(?:` + q + `)\z` refull, err = tryCompile(full) if err != nil { // Fatal because q worked, so this should always work. t.Fatalf("%s:%d: compile full %#q: %v", file, lineno, full, err) } input = str case line[0] == '-' || '0' <= line[0] && line[0] <= '9': // A sequence of match results. ncase++ if re == nil { // Failed to compile: skip results. continue } if len(input) == 0 { t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno) } var text string text, input = input[0], input[1:] if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) { // RE2's \B considers every byte position, // so it sees 'not word boundary' in the // middle of UTF-8 sequences. This package // only considers the positions between runes, // so it disagrees. Skip those cases. continue } res := strings.Split(line, ";") if len(res) != len(run) { t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run)) } for i := range res { have, suffix := run[i](re, refull, text) want := parseResult(t, file, lineno, res[i]) if !same(have, want) { t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, re, suffix, text, have, want) if nfail++; nfail >= 100 { t.Fatalf("stopping after %d errors", nfail) } continue } b, suffix := match[i](re, refull, text) if b != (want != nil) { t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, re, suffix, text, b, !b) if nfail++; nfail >= 100 { t.Fatalf("stopping after %d errors", nfail) } continue } } default: t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line) } } if len(input) != 0 { t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input)) } t.Logf("%d cases tested", ncase) } var run = []func(*Regexp, *Regexp, string) ([]int, string){ runFull, runPartial, runFullLongest, runPartialLongest, } func runFull(re, refull *Regexp, text string) ([]int, string) { refull.longest = false return refull.FindStringSubmatchIndex(text), "[full]" } func runPartial(re, refull *Regexp, text string) ([]int, string) { re.longest = false return re.FindStringSubmatchIndex(text), "" } func runFullLongest(re, refull *Regexp, text string) ([]int, string) { refull.longest = true return refull.FindStringSubmatchIndex(text), "[full,longest]" } func runPartialLongest(re, refull *Regexp, text string) ([]int, string) { re.longest = true return re.FindStringSubmatchIndex(text), "[longest]" } var match = []func(*Regexp, *Regexp, string) (bool, string){ matchFull, matchPartial, matchFullLongest, matchPartialLongest, } func matchFull(re, refull *Regexp, text string) (bool, string) { refull.longest = false return refull.MatchString(text), "[full]" } func matchPartial(re, refull *Regexp, text string) (bool, string) { re.longest = false return re.MatchString(text), "" } func matchFullLongest(re, refull *Regexp, text string) (bool, string) { refull.longest = true return refull.MatchString(text), "[full,longest]" } func matchPartialLongest(re, refull *Regexp, text string) (bool, string) { re.longest = true return re.MatchString(text), "[longest]" } func isSingleBytes(s string) bool { for _, c := range s { if c >= utf8.RuneSelf { return false } } return true } func tryCompile(s string) (re *Regexp, err error) { // Protect against panic during Compile. defer func() { if r := recover(); r != nil { err = fmt.Errorf("panic: %v", r) } }() return Compile(s) } func parseResult(t *testing.T, file string, lineno int, res string) []int { // A single - indicates no match. if res == "-" { return nil } // Otherwise, a space-separated list of pairs. n := 1 for j := 0; j < len(res); j++ { if res[j] == ' ' { n++ } } out := make([]int, 2*n) i := 0 n = 0 for j := 0; j <= len(res); j++ { if j == len(res) || res[j] == ' ' { // Process a single pair. - means no submatch. pair := res[i:j] if pair == "-" { out[n] = -1 out[n+1] = -1 } else { k := strings.Index(pair, "-") if k < 0 { t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair) } lo, err1 := strconv.Atoi(pair[:k]) hi, err2 := strconv.Atoi(pair[k+1:]) if err1 != nil || err2 != nil || lo > hi { t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair) } out[n] = lo out[n+1] = hi } n += 2 i = j + 1 } } return out } func same(x, y []int) bool { if len(x) != len(y) { return false } for i, xi := range x { if xi != y[i] { return false } } return true } // TestFowler runs this package's regexp API against the // POSIX regular expression tests collected by Glenn Fowler // at http://www2.research.att.com/~gsf/testregex/. func TestFowler(t *testing.T) { files, err := filepath.Glob("testdata/*.dat") if err != nil { t.Fatal(err) } for _, file := range files { t.Log(file) testFowler(t, file) } } var notab = MustCompilePOSIX(`[^\t]+`) func testFowler(t *testing.T, file string) { f, err := os.Open(file) if err != nil { t.Error(err) return } defer f.Close() b := bufio.NewReader(f) lineno := 0 lastRegexp := "" Reading: for { lineno++ line, err := b.ReadString('\n') if err != nil { if err != io.EOF { t.Errorf("%s:%d: %v", file, lineno, err) } break Reading } // http://www2.research.att.com/~gsf/man/man1/testregex.html // // INPUT FORMAT // Input lines may be blank, a comment beginning with #, or a test // specification. A specification is five fields separated by one // or more tabs. NULL denotes the empty string and NIL denotes the // 0 pointer. if line[0] == '#' || line[0] == '\n' { continue Reading } line = line[:len(line)-1] field := notab.FindAllString(line, -1) for i, f := range field { if f == "NULL" { field[i] = "" } if f == "NIL" { t.Logf("%s:%d: skip: %s", file, lineno, line) continue Reading } } if len(field) == 0 { continue Reading } // Field 1: the regex(3) flags to apply, one character per REG_feature // flag. The test is skipped if REG_feature is not supported by the // implementation. If the first character is not [BEASKLP] then the // specification is a global control line. One or more of [BEASKLP] may be // specified; the test will be repeated for each mode. // // B basic BRE (grep, ed, sed) // E REG_EXTENDED ERE (egrep) // A REG_AUGMENTED ARE (egrep with negation) // S REG_SHELL SRE (sh glob) // K REG_SHELL|REG_AUGMENTED KRE (ksh glob) // L REG_LITERAL LRE (fgrep) // // a REG_LEFT|REG_RIGHT implicit ^...$ // b REG_NOTBOL lhs does not match ^ // c REG_COMMENT ignore space and #...\n // d REG_SHELL_DOT explicit leading . match // e REG_NOTEOL rhs does not match $ // f REG_MULTIPLE multiple \n separated patterns // g FNM_LEADING_DIR testfnmatch only -- match until / // h REG_MULTIREF multiple digit backref // i REG_ICASE ignore case // j REG_SPAN . matches \n // k REG_ESCAPE \ to ecape [...] delimiter // l REG_LEFT implicit ^... // m REG_MINIMAL minimal match // n REG_NEWLINE explicit \n match // o REG_ENCLOSED (|&) magic inside [@|&](...) // p REG_SHELL_PATH explicit / match // q REG_DELIMITED delimited pattern // r REG_RIGHT implicit ...$ // s REG_SHELL_ESCAPED \ not special // t REG_MUSTDELIM all delimiters must be specified // u standard unspecified behavior -- errors not counted // v REG_CLASS_ESCAPE \ special inside [...] // w REG_NOSUB no subexpression match array // x REG_LENIENT let some errors slide // y REG_LEFT regexec() implicit ^... // z REG_NULL NULL subexpressions ok // $ expand C \c escapes in fields 2 and 3 // / field 2 is a regsubcomp() expression // = field 3 is a regdecomp() expression // // Field 1 control lines: // // C set LC_COLLATE and LC_CTYPE to locale in field 2 // // ?test ... output field 5 if passed and != EXPECTED, silent otherwise // &test ... output field 5 if current and previous passed // |test ... output field 5 if current passed and previous failed // ; ... output field 2 if previous failed // {test ... skip if failed until } // } end of skip // // : comment comment copied as output NOTE // :comment:test :comment: ignored // N[OTE] comment comment copied as output NOTE // T[EST] comment comment // // number use number for nmatch (20 by default) flag := field[0] switch flag[0] { case '?', '&', '|', ';', '{', '}': // Ignore all the control operators. // Just run everything. flag = flag[1:] if flag == "" { continue Reading } case ':': i := strings.Index(flag[1:], ":") if i < 0 { t.Logf("skip: %s", line) continue Reading } flag = flag[1+i+1:] case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': t.Logf("skip: %s", line) continue Reading } // Can check field count now that we've handled the myriad comment formats. if len(field) < 4 { t.Errorf("%s:%d: too few fields: %s", file, lineno, line) continue Reading } // Expand C escapes (a.k.a. Go escapes). if strings.Contains(flag, "$") { f := `"` + field[1] + `"` if field[1], err = strconv.Unquote(f); err != nil { t.Errorf("%s:%d: cannot unquote %s", file, lineno, f) } f = `"` + field[2] + `"` if field[2], err = strconv.Unquote(f); err != nil { t.Errorf("%s:%d: cannot unquote %s", file, lineno, f) } } // Field 2: the regular expression pattern; SAME uses the pattern from // the previous specification. // if field[1] == "SAME" { field[1] = lastRegexp } lastRegexp = field[1] // Field 3: the string to match. text := field[2] // Field 4: the test outcome... ok, shouldCompile, shouldMatch, pos := parseFowlerResult(field[3]) if !ok { t.Errorf("%s:%d: cannot parse result %#q", file, lineno, field[3]) continue Reading } // Field 5: optional comment appended to the report. Testing: // Run test once for each specified capital letter mode that we support. for _, c := range flag { pattern := field[1] syn := syntax.POSIX | syntax.ClassNL switch c { default: continue Testing case 'E': // extended regexp (what we support) case 'L': // literal pattern = QuoteMeta(pattern) } for _, c := range flag { switch c { case 'i': syn |= syntax.FoldCase } } re, err := compile(pattern, syn, true) if err != nil { if shouldCompile { t.Errorf("%s:%d: %#q did not compile", file, lineno, pattern) } continue Testing } if !shouldCompile { t.Errorf("%s:%d: %#q should not compile", file, lineno, pattern) continue Testing } match := re.MatchString(text) if match != shouldMatch { t.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file, lineno, pattern, text, match, shouldMatch) continue Testing } have := re.FindStringSubmatchIndex(text) if (len(have) > 0) != match { t.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file, lineno, pattern, text, match, pattern, text, have) continue Testing } if len(have) > len(pos) { have = have[:len(pos)] } if !same(have, pos) { t.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, pattern, text, have, pos) } } } } func parseFowlerResult(s string) (ok, compiled, matched bool, pos []int) { // Field 4: the test outcome. This is either one of the posix error // codes (with REG_ omitted) or the match array, a list of (m,n) // entries with m and n being first and last+1 positions in the // field 3 string, or NULL if REG_NOSUB is in effect and success // is expected. BADPAT is acceptable in place of any regcomp(3) // error code. The match[] array is initialized to (-2,-2) before // each test. All array elements from 0 to nmatch-1 must be specified // in the outcome. Unspecified endpoints (offset -1) are denoted by ?. // Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a // matched (?{...}) expression, where x is the text enclosed by {...}, // o is the expression ordinal counting from 1, and n is the length of // the unmatched portion of the subject string. If x starts with a // number then that is the return value of re_execf(), otherwise 0 is // returned. switch { case s == "": // Match with no position information. ok = true compiled = true matched = true return case s == "NOMATCH": // Match failure. ok = true compiled = true matched = false return case 'A' <= s[0] && s[0] <= 'Z': // All the other error codes are compile errors. ok = true compiled = false return } compiled = true var x []int for s != "" { var end byte = ')' if len(x)%2 == 0 { if s[0] != '(' { ok = false return } s = s[1:] end = ',' } i := 0 for i < len(s) && s[i] != end { i++ } if i == 0 || i == len(s) { ok = false return } var v = -1 var err error if s[:i] != "?" { v, err = strconv.Atoi(s[:i]) if err != nil { ok = false return } } x = append(x, v) s = s[i+1:] } if len(x)%2 != 0 { ok = false return } ok = true matched = true pos = x return } var text []byte func makeText(n int) []byte { if len(text) >= n { return text[:n] } text = make([]byte, n) for i := range text { if rand.Intn(30) == 0 { text[i] = '\n' } else { text[i] = byte(rand.Intn(0x7E+1-0x20) + 0x20) } } return text } func benchmark(b *testing.B, re string, n int) { r := MustCompile(re) t := makeText(n) b.ResetTimer() b.SetBytes(int64(n)) for i := 0; i < b.N; i++ { if r.Match(t) { panic("match!") } } } const ( easy0 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ$" easy1 = "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$" medium = "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$" hard = "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$" parens = "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" + "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$" ) func BenchmarkMatchEasy0_32(b *testing.B) { benchmark(b, easy0, 32<<0) } func BenchmarkMatchEasy0_1K(b *testing.B) { benchmark(b, easy0, 1<<10) } func BenchmarkMatchEasy0_32K(b *testing.B) { benchmark(b, easy0, 32<<10) } func BenchmarkMatchEasy0_1M(b *testing.B) { benchmark(b, easy0, 1<<20) } func BenchmarkMatchEasy0_32M(b *testing.B) { benchmark(b, easy0, 32<<20) } func BenchmarkMatchEasy1_32(b *testing.B) { benchmark(b, easy1, 32<<0) } func BenchmarkMatchEasy1_1K(b *testing.B) { benchmark(b, easy1, 1<<10) } func BenchmarkMatchEasy1_32K(b *testing.B) { benchmark(b, easy1, 32<<10) } func BenchmarkMatchEasy1_1M(b *testing.B) { benchmark(b, easy1, 1<<20) } func BenchmarkMatchEasy1_32M(b *testing.B) { benchmark(b, easy1, 32<<20) } func BenchmarkMatchMedium_32(b *testing.B) { benchmark(b, medium, 1<<0) } func BenchmarkMatchMedium_1K(b *testing.B) { benchmark(b, medium, 1<<10) } func BenchmarkMatchMedium_32K(b *testing.B) { benchmark(b, medium, 32<<10) } func BenchmarkMatchMedium_1M(b *testing.B) { benchmark(b, medium, 1<<20) } func BenchmarkMatchMedium_32M(b *testing.B) { benchmark(b, medium, 32<<20) } func BenchmarkMatchHard_32(b *testing.B) { benchmark(b, hard, 32<<0) } func BenchmarkMatchHard_1K(b *testing.B) { benchmark(b, hard, 1<<10) } func BenchmarkMatchHard_32K(b *testing.B) { benchmark(b, hard, 32<<10) } func BenchmarkMatchHard_1M(b *testing.B) { benchmark(b, hard, 1<<20) } func BenchmarkMatchHard_32M(b *testing.B) { benchmark(b, hard, 32<<20) }