* posix/Makefile: Add rules to build and run tst-rxspencer.
	(distribute): Add rxspencer/tests and rxspencer/COPYRIGHT.
	* posix/tst-rxspencer.c: New file.
	* posix/rxspencer/tests: New file.
	* posix/rxspencer/COPYRIGHT: New file.
	Patch mostly by Jakub Jelinek.
This commit is contained in:
Ulrich Drepper 2003-11-13 20:52:55 +00:00
parent 78d8b07a44
commit 78c81ab7b4
5 changed files with 1051 additions and 2 deletions

View File

@ -1,5 +1,12 @@
2003-11-13 Ulrich Drepper <drepper@redhat.com>
* posix/Makefile: Add rules to build and run tst-rxspencer.
(distribute): Add rxspencer/tests and rxspencer/COPYRIGHT.
* posix/tst-rxspencer.c: New file.
* posix/rxspencer/tests: New file.
* posix/rxspencer/COPYRIGHT: New file.
Patch mostly by Jakub Jelinek.
* posix/regcomp.c (parse_bracket_exp): Don't check for range if
this is no option given the first token.

View File

@ -34,7 +34,7 @@ distribute := confstr.h TESTS TESTS2C.sed testcases.h \
PTESTS PTESTS2C.sed ptestcases.h \
globtest.c globtest.sh wordexp-tst.sh annexc.c fnmatch_loop.c \
spawn_int.h tst-getconf.sh regcomp.c regexec.c regex_internal.c \
regex_internal.h fork.h
regex_internal.h fork.h rxspencer/tests rxspencer/COPYRIGHT
routines := \
uname \
@ -78,7 +78,7 @@ tests := tstgetopt testfnm runtests runptests \
bug-regex8 bug-regex9 bug-regex10 bug-regex11 bug-regex12 \
bug-regex13 bug-regex14 bug-regex15 bug-regex16 \
bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
tst-nice tst-nanosleep transbug
tst-nice tst-nanosleep transbug tst-rxspencer
ifeq (yes,$(build-shared))
test-srcs := globtest
tests += wordexp-test tst-exec tst-spawn
@ -147,6 +147,7 @@ tst-exec-ARGS = -- $(built-program-cmd)
tst-spawn-ARGS = -- $(built-program-cmd)
tst-dir-ARGS = `pwd` `cd $(common-objdir)/$(subdir); pwd` `cd $(common-objdir); pwd` $(objpfx)tst-dir
tst-chmod-ARGS = `pwd`
tst-rxspencer-ARGS = rxspencer/tests
tst-fnmatch-ENV = LOCPATH=$(common-objpfx)localedata
tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata

20
posix/rxspencer/COPYRIGHT Normal file
View File

@ -0,0 +1,20 @@
Copyright 1992, 1993, 1994, 1997 Henry Spencer. All rights reserved.
This software is not subject to any license of the American Telephone
and Telegraph Company or of the Regents of the University of California.
Permission is granted to anyone to use this software for any purpose on
any computer system, and to alter it and redistribute it, subject
to the following restrictions:
1. The author is not responsible for the consequences of use of this
software, no matter how awful, even if they arise from flaws in it.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission. Since few users ever read sources,
credits must appear in the documentation.
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software. Since few users
ever read sources, credits must appear in the documentation.
4. This notice may not be removed or altered.

506
posix/rxspencer/tests Normal file
View File

@ -0,0 +1,506 @@
# regular expression test set
# Lines are at least three fields, separated by one or more tabs. "" stands
# for an empty field. First field is an RE. Second field is flags. If
# C flag given, regcomp() is expected to fail, and the third field is the
# error name (minus the leading REG_).
#
# Otherwise it is expected to succeed, and the third field is the string to
# try matching it against. If there is no fourth field, the match is
# expected to fail. If there is a fourth field, it is the substring that
# the RE is expected to match. If there is a fifth field, it is a comma-
# separated list of what the subexpressions should match, with - indicating
# no match for that one. In both the fourth and fifth fields, a (sub)field
# starting with @ indicates that the (sub)expression is expected to match
# a null string followed by the stuff after the @; this provides a way to
# test where null strings match. The character `N' in REs and strings
# is newline, `S' is space, `T' is tab, `Z' is NUL.
#
# The full list of flags:
# - placeholder, does nothing
# b RE is a BRE, not an ERE
# & try it as both an ERE and a BRE
# C regcomp() error expected, third field is error name
# i REG_ICASE
# m ("mundane") REG_NOSPEC
# s REG_NOSUB (not really testable)
# n REG_NEWLINE
# ^ REG_NOTBOL
# $ REG_NOTEOL
# # REG_STARTEND (see below)
# p REG_PEND
#
# For REG_STARTEND, the start/end offsets are those of the substring
# enclosed in ().
# basics
a & a a
abc & abc abc
abc|de - abc abc
a|b|c - abc a
# parentheses and perversions thereof
a(b)c - abc abc
a\(b\)c b abc abc
a( C EPAREN
a( b a( a(
a\( - a( a(
a\( bC EPAREN
a\(b bC EPAREN
a(b C EPAREN
a(b b a(b a(b
# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly)
a) - a) a)
) - ) )
# end gagging (in a just world, those *should* give EPAREN)
a) b a) a)
a\) bC EPAREN
\) bC EPAREN
a()b - ab ab
a\(\)b b ab ab
# anchoring and REG_NEWLINE
^abc$ & abc abc
a^b - a^b
a^b b a^b a^b
a$b - a$b
a$b b a$b a$b
^ & abc @abc
$ & abc @
^$ & "" @
$^ - "" @
\($\)\(^\) b "" @
# stop retching, those are legitimate (although disgusting)
^^ - "" @
$$ - "" @
b$ & abNc
b$ &n abNc b
^b$ & aNbNc
^b$ &n aNbNc b
^$ &n aNNb @Nb
^$ n abc
^$ n abcN @
$^ n aNNb @Nb
\($\)\(^\) bn aNNb @Nb
^^ n^ aNNb @Nb
$$ n aNNb @NN
^a ^ a
a$ $ a
^a ^n aNb
^b ^n aNb b
a$ $n bNa
b$ $n bNa b
a*(^b$)c* - b b
a*\(^b$\)c* b b b
# certain syntax errors and non-errors
| C EMPTY
| b | |
* C BADRPT
* b * *
+ C BADRPT
? C BADRPT
"" &C EMPTY
() - abc @abc
\(\) b abc @abc
a||b C EMPTY
|ab C EMPTY
ab| C EMPTY
(|a)b C EMPTY
(a|)b C EMPTY
(*a) C BADRPT
(+a) C BADRPT
(?a) C BADRPT
({1}a) C BADRPT
\(\{1\}a\) bC BADRPT
(a|*b) C BADRPT
(a|+b) C BADRPT
(a|?b) C BADRPT
(a|{1}b) C BADRPT
^* C BADRPT
^* b * *
^+ C BADRPT
^? C BADRPT
^{1} C BADRPT
^\{1\} bC BADRPT
# metacharacters, backslashes
a.c & abc abc
a[bc]d & abd abd
a\*c & a*c a*c
a\\b & a\b a\b
a\\\*b & a\*b a\*b
# The following test is wrong. Using \b in a BRE or ERE is undefined.
# a\bc & abc abc
a\ &C EESCAPE
a\\bc & a\bc a\bc
\{ bC BADRPT
a\[b & a[b a[b
a[b &C EBRACK
# trailing $ is a peculiar special case for the BRE code
a$ & a a
a$ & a$
a\$ & a
a\$ & a$ a$
a\\$ & a
a\\$ & a$
a\\$ & a\$
a\\$ & a\ a\
# back references, ugh
a\(b\)\2c bC ESUBREG
a\(b\1\)c bC ESUBREG
a\(b*\)c\1d b abbcbbd abbcbbd bb
a\(b*\)c\1d b abbcbd
a\(b*\)c\1d b abbcbbbd
^\(.\)\1 b abc
a\([bc]\)\1d b abcdabbd abbd b
a\(\([bc]\)\2\)*d b abbccd abbccd
a\(\([bc]\)\2\)*d b abbcbd
# actually, this next one probably ought to fail, but the spec is unclear
a\(\(b\)*\2\)*d b abbbd abbbd
# here is a case that no NFA implementation does right
\(ab*\)[ab]*\1 b ababaaa ababaaa a
# check out normal matching in the presence of back refs
\(a\)\1bcd b aabcd aabcd
\(a\)\1bc*d b aabcd aabcd
\(a\)\1bc*d b aabd aabd
\(a\)\1bc*d b aabcccd aabcccd
\(a\)\1bc*[ce]d b aabcccd aabcccd
^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd
# ordinary repetitions
ab*c & abc abc
ab+c - abc abc
ab?c - abc abc
a\(*\)b b a*b a*b
a\(**\)b b ab ab
a\(***\)b bC BADRPT
*a b *a *a
**a b a a
***a bC BADRPT
# the dreaded bounded repetitions
# The following two tests are not correct:
#{ & { {
#{abc & {abc {abc
# '{' is always a special char outside bracket expressions. So test only BRE:
{ b { {
{abc b {abc {abc
{1 C BADRPT
{1} C BADRPT
# Same reason as for the two tests above:
#a{b & a{b a{b
a{b b a{b a{b
a{1}b - ab ab
a\{1\}b b ab ab
a{1,}b - ab ab
a\{1,\}b b ab ab
a{1,2}b - aab aab
a\{1,2\}b b aab aab
a{1 C EBRACE
a\{1 bC EBRACE
a{1a C EBRACE
a\{1a bC EBRACE
a{1a} C BADBR
a\{1a\} bC BADBR
# These four tests check for undefined behavior. Our implementation does
# something different.
#a{,2} - a{,2} a{,2}
#a\{,2\} bC BADBR
#a{,} - a{,} a{,}
#a\{,\} bC BADBR
a{1,x} C BADBR
a\{1,x\} bC BADBR
a{1,x C EBRACE
a\{1,x bC EBRACE
# These two tests probably fail due to an arbitrary limit on the number of
# repetitions in the other implementation.
#a{300} C BADBR
#a\{300\} bC BADBR
a{1,0} C BADBR
a\{1,0\} bC BADBR
ab{0,0}c - abcac ac
ab\{0,0\}c b abcac ac
ab{0,1}c - abcac abc
ab\{0,1\}c b abcac abc
ab{0,3}c - abbcac abbc
ab\{0,3\}c b abbcac abbc
ab{1,1}c - acabc abc
ab\{1,1\}c b acabc abc
ab{1,3}c - acabc abc
ab\{1,3\}c b acabc abc
ab{2,2}c - abcabbc abbc
ab\{2,2\}c b abcabbc abbc
ab{2,4}c - abcabbc abbc
ab\{2,4\}c b abcabbc abbc
((a{1,10}){1,10}){1,10} - a a a,a
# multiple repetitions
# Wow, there is a serious disconnect here. The ERE grammar is like this:
# ERE_expression : one_char_or_coll_elem_ERE
# | '^'
# | '$'
# | '(' extended_reg_exp ')'
# | ERE_expression ERE_dupl_symbol
# ;
# where ERE_dupl_symbol is any of the repetition methods. It is clear from
# this that consecutive repetition is OK. On top of this, the one test not
# marked as failing must fail. For BREs the situation is different, so we
# use the four tests.
#a** &C BADRPT
a** bC BADRPT
#a++ C BADRPT
#a?? C BADRPT
#a*+ C BADRPT
#a*? C BADRPT
#a+* C BADRPT
#a+? C BADRPT
#a?* C BADRPT
#a?+ C BADRPT
#a{1}{1} C BADRPT
#a*{1} C BADRPT
#a+{1} C BADRPT
#a?{1} C BADRPT
#a{1}* C BADRPT
#a{1}+ C BADRPT
#a{1}? C BADRPT
#a*{b} - a{b} a{b}
a\{1\}\{1\} bC BADRPT
a*\{1\} bC BADRPT
a\{1\}* bC BADRPT
# brackets, and numerous perversions thereof
a[b]c & abc abc
a[ab]c & abc abc
a[^ab]c & adc adc
a[]b]c & a]c a]c
a[[b]c & a[c a[c
a[-b]c & a-c a-c
a[^]b]c & adc adc
a[^-b]c & adc adc
a[b-]c & a-c a-c
a[b &C EBRACK
a[] &C EBRACK
a[1-3]c & a2c a2c
a[3-1]c &C ERANGE
a[1-3-5]c &C ERANGE
a[[.-.]--]c & a-c a-c
# I don't think the error value should be ERANGE since a[1-] would be
# valid, too. Expect EBRACK.
#a[1- &C ERANGE
a[1- &C EBRACK
a[[. &C EBRACK
a[[.x &C EBRACK
a[[.x. &C EBRACK
a[[.x.] &C EBRACK
a[[.x.]] & ax ax
a[[.x,.]] &C ECOLLATE
# XXX Doesn't work yet.
# a[[.one.]]b & a1b a1b
a[[.notdef.]]b &C ECOLLATE
a[[.].]]b & a]b a]b
a[[:alpha:]]c & abc abc
a[[:notdef:]]c &C ECTYPE
a[[: &C EBRACK
a[[:alpha &C EBRACK
a[[:alpha:] &C EBRACK
a[[:alpha,:] &C ECTYPE
a[[:]:]]b &C ECTYPE
a[[:-:]]b &C ECTYPE
a[[:alph:]] &C ECTYPE
a[[:alphabet:]] &C ECTYPE
[[:alnum:]]+ - -%@a0X- a0X
[[:alpha:]]+ - -%@aX0- aX
[[:blank:]]+ - aSSTb SST
[[:cntrl:]]+ - aNTb NT
[[:digit:]]+ - a019b 019
[[:graph:]]+ - Sa%bS a%b
[[:lower:]]+ - AabC ab
[[:print:]]+ - NaSbN aSb
[[:punct:]]+ - S%-&T %-&
[[:space:]]+ - aSNTb SNT
[[:upper:]]+ - aBCd BC
[[:xdigit:]]+ - p0f3Cq 0f3C
a[[=b=]]c & abc abc
a[[= &C EBRACK
a[[=b &C EBRACK
a[[=b= &C EBRACK
a[[=b=] &C EBRACK
a[[=b,=]] &C ECOLLATE
# XXX Doesn't work yet.
#a[[=one=]]b & a1b a1b
# complexities
a(((b)))c - abc abc
a(b|(c))d - abd abd
a(b*|c)d - abbd abbd
# just gotta have one DFA-buster, of course
a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab
# and an inline expansion in case somebody gets tricky
a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab
# and in case somebody just slips in an NFA...
a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights
# fish for anomalies as the number of states passes 32
12345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789
123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890
1234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901
12345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012
123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123
# and one really big one, beyond any plausible word width
1234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890
# fish for problems as brackets go past 8
[ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm
[ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo
[ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq
[ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq
# subtleties of matching
abc & xabcy abc
a\(b\)?c\1d b acd
aBc i Abc Abc
a[Bc]*d i abBCcd abBCcd
0[[:upper:]]1 &i 0a1 0a1
0[[:lower:]]1 &i 0A1 0A1
a[^b]c &i abc
a[^b]c &i aBc
a[^b]c &i adc adc
[a]b[c] - abc abc
[a]b[a] - aba aba
[abc]b[abc] - abc abc
[abc]b[abd] - abd abd
a(b?c)+d - accd accd
(wee|week)(knights|night) - weeknights weeknights
(we|wee|week|frob)(knights|night|day) - weeknights weeknights
a[bc]d - xyzaaabcaababdacd abd
a[ab]c - aaabc abc
abc s abc abc
a* & b @b
# Let's have some fun -- try to match a C comment.
# first the obvious, which looks okay at first glance...
/\*.*\*/ - /*x*/ /*x*/
# but...
/\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/
# okay, we must not match */ inside; try to do that...
/\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/
/\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/
# but...
/\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/
# and a still fancier version, which does it right (I think)...
/\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/
/\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/
/\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/
/\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/
/\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/
/\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/
# subexpressions
.* - abc abc -
a(b)(c)d - abcd abcd b,c
a(((b)))c - abc abc b,b,b
a(b|(c))d - abd abd b,-
a(b*|c|e)d - abbd abbd bb
a(b*|c|e)d - acd acd c
a(b*|c|e)d - ad ad @d
a(b?)c - abc abc b
a(b?)c - ac ac @c
a(b+)c - abc abc b
a(b+)c - abbbc abbbc bbb
a(b*)c - ac ac @c
(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de
# the regression tester only asks for 9 subexpressions
a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j
a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k
a([bc]?)c - abc abc b
a([bc]?)c - ac ac @c
a([bc]+)c - abc abc b
a([bc]+)c - abcc abcc bc
a([bc]+)bc - abcbc abcbc bc
a(bb+|b)b - abb abb b
a(bbb+|bb+|b)b - abb abb b
a(bbb+|bb+|b)b - abbb abbb bb
a(bbb+|bb+|b)bb - abbb abbb b
(.*).* - abcdef abcdef abcdef
(a*)* - bc @b @b
# do we get the right subexpression when it is used more than once?
a(b|c)*d - ad ad -
a(b|c)*d - abcd abcd c
a(b|c)+d - abd abd b
a(b|c)+d - abcd abcd c
a(b|c?)+d - ad ad @d
a(b|c?)+d - abcd abcd @d
a(b|c){0,0}d - ad ad -
a(b|c){0,1}d - ad ad -
a(b|c){0,1}d - abd abd b
a(b|c){0,2}d - ad ad -
a(b|c){0,2}d - abcd abcd c
a(b|c){0,}d - ad ad -
a(b|c){0,}d - abcd abcd c
a(b|c){1,1}d - abd abd b
a(b|c){1,1}d - acd acd c
a(b|c){1,2}d - abd abd b
a(b|c){1,2}d - abcd abcd c
a(b|c){1,}d - abd abd b
a(b|c){1,}d - abcd abcd c
a(b|c){2,2}d - acbd acbd b
a(b|c){2,2}d - abcd abcd c
a(b|c){2,4}d - abcd abcd c
a(b|c){2,4}d - abcbd abcbd b
a(b|c){2,4}d - abcbcd abcbcd c
a(b|c){2,}d - abcd abcd c
a(b|c){2,}d - abcbd abcbd b
a(b+|((c)*))+d - abd abd @d,@d,-
# XXX Needs to be checked.
#a(b+|((c)*))+d - abcd abcd @d,@d,-
# check out the STARTEND option
[abc] &# a(b)c b
[abc] &# a(d)c
[abc] &# a(bc)d b
[abc] &# a(dc)d c
. &# a()c
b.*c &# b(bc)c bc
b.* &# b(bc)c bc
.*c &# b(bc)c bc
# plain strings, with the NOSPEC flag
abc m abc abc
abc m xabcy abc
abc m xyz
a*b m aba*b a*b
a*b m ab
"" mC EMPTY
# cases involving NULs
aZb & a a
aZb &p a
aZb &p# (aZb) aZb
aZ*b &p# (ab) ab
a.b &# (aZb) aZb
a.* &# (aZb)c aZb
# word boundaries (ick)
[[:<:]]a & a a
[[:<:]]a & ba
[[:<:]]a & -a a
a[[:>:]] & a a
a[[:>:]] & ab
a[[:>:]] & a- a
[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc
[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc
[[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc
[[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc
[[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_
[[:<:]]a_b[[:>:]] & x_a_b
# past problems, and suspected problems
(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1
abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop
abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv
(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11
CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11
Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz
a?b - ab ab
-\{0,1\}[0-9]*$ b -5 -5
a*a*a*a*a*a*a* & aaaaaa aaaaaa

515
posix/tst-rxspencer.c Normal file
View File

@ -0,0 +1,515 @@
/* Regular expression tests.
Copyright (C) 2003 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sys/types.h>
#include <mcheck.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <locale.h>
#include <getopt.h>
/* Expand the escape letters used by the test file, in place: in STR every
   'N' becomes a newline, 'T' a tab, 'S' a space and 'Z' a NUL byte.  The
   scan keeps going past a NUL produced by 'Z', so letters after it are
   translated too.  */
static void
replace_special_chars (char *str)
{
  char *cur = strpbrk (str, "NTSZ");

  while (cur != NULL)
    {
      if (*cur == 'N')
        *cur = '\n';
      else if (*cur == 'T')
        *cur = '\t';
      else if (*cur == 'S')
        *cur = ' ';
      else if (*cur == 'Z')
        *cur = '\0';

      /* Resume just past the byte that was rewritten.  */
      cur = strpbrk (cur + 1, "NTSZ");
    }
}
/* Convert the rxspencer word-boundary notation in STR to the GNU one, in
   place: "[[:<:]]" becomes "\<" and "[[:>:]]" becomes "\>".  Any other
   "[[:" sequence is left unchanged.  */
static void
glibc_re_syntax (char *str)
{
  /* One past the terminating NUL, so that moving the tail carries the
     terminator along with it.  */
  char *limit = str + strlen (str) + 1;
  char *hit = strstr (str, "[[:");

  while (hit != NULL)
    {
      if ((hit[3] != '<' && hit[3] != '>')
          || strncmp (hit + 4, ":]]", 3) != 0)
        {
          /* Some other bracket construct: step over its opener only.  */
          hit = strstr (hit + 3, "[[:");
          continue;
        }

      /* Seven bytes shrink to two; close the five byte gap.  */
      hit[0] = '\\';
      hit[1] = hit[3];
      memmove (hit + 2, hit + 7, limit - (hit + 7));
      limit -= 5;
      hit = strstr (hit + 2, "[[:");
    }
}
/* Store at DST the two-byte UTF-8 stand-in for the ASCII letter C and
   return the position just after it.  The substitutions are:
     a/A -> a/A with acute,   b/B -> c/C with caron,
     c/C -> d/D with caron,   d/D -> e/E with acute.
   For any other C nothing is written and DST is returned unchanged.  */
static char *
mb_replace (char *dst, const char c)
{
  /* Each row: the ASCII letter, then the two bytes of its replacement.  */
  static const char table[][3] =
    {
      { 'a', '\xc3', '\xa1' },
      { 'A', '\xc3', '\x81' },
      { 'b', '\xc4', '\x8d' },
      { 'B', '\xc4', '\x8c' },
      { 'c', '\xc4', '\x8f' },
      { 'C', '\xc4', '\x8e' },
      { 'd', '\xc3', '\xa9' },
      { 'D', '\xc3', '\x89' }
    };
  unsigned int k;

  for (k = 0; k < sizeof (table) / sizeof (table[0]); ++k)
    if (table[k][0] == c)
      {
        dst[0] = table[k][1];
        dst[1] = table[k][2];
        return dst + 2;
      }

  return dst;
}
/* Return a freshly malloc'd copy of STR in which every character that
   occurs in LETTERS has been passed through mb_replace; all other
   characters are copied verbatim.  Returns NULL when STR is NULL or when
   the allocation fails.  The caller frees the result.  */
static char *
mb_frob_string (const char *str, const char *letters)
{
  if (str == NULL)
    return NULL;

  /* A replacement is two bytes long, so the output is at most twice the
     input length plus the terminator.  */
  char *out = malloc (2 * strlen (str) + 1);
  if (out == NULL)
    return NULL;

  char *wr = out;
  const char *rd = str;
  while (*rd != '\0')
    {
      char ch = *rd++;

      if (strchr (letters, ch) != NULL)
        wr = mb_replace (wr, ch);
      else
        *wr++ = ch;
    }
  *wr = '\0';

  return out;
}
/* Pattern variant of mb_frob_string: characters from LETTERS are run
   through mb_replace, except inside a bracket sub-construct.  Such a
   region starts at a '[' followed by ':', '.' or '=' and ends at the
   first ']' preceded by one of those three characters; everything in it
   is copied verbatim.  Returns a malloc'd string the caller frees, or
   NULL when STR is NULL or the allocation fails.  */
static char *
mb_frob_pattern (const char *str, const char *letters)
{
  if (str == NULL)
    return NULL;

  char *out = malloc (2 * strlen (str) + 1);
  if (out == NULL)
    return NULL;

  char *wr = out;
  int inside = 0;
  for (const char *rd = str; *rd != '\0'; ++rd)
    {
      if (inside)
        {
          /* Only a closing ":]", ".]" or "=]" leaves the region.  */
          if (*rd == ']' && strchr (":.=", rd[-1]) != NULL)
            inside = 0;
          *wr++ = *rd;
        }
      else if (strchr (letters, *rd) != NULL)
        wr = mb_replace (wr, *rd);
      else
        {
          if (*rd == '[' && strchr (":.=", rd[1]) != NULL)
            inside = 1;
          *wr++ = *rd;
        }
    }
  *wr = '\0';

  return out;
}
/* Compare the match recorded in RM[IDX] against the expectation MATCH.
   STRING is the subject the offsets refer to and FAIL is the prefix used
   for diagnostics printed to stdout.  MATCH has one of three forms:
     "-"      the (sub)expression must not have matched at all;
     "@text"  it must have matched the empty string at a position where
              STRING continues with TEXT;
     "text"   it must have matched exactly TEXT.
   Returns 0 when the expectation holds and 1 (after printing a
   diagnostic) otherwise.  */
static int
check_match (regmatch_t *rm, int idx, const char *string,
             const char *match, const char *fail)
{
  const regmatch_t *slot = &rm[idx];

  if (strcmp (match, "-") == 0)
    {
      if (slot->rm_so != -1 || slot->rm_eo != -1)
        {
          printf ("%s rm[%d] unexpectedly matched\n", fail, idx);
          return 1;
        }
      return 0;
    }

  if (slot->rm_so == -1 || slot->rm_eo == -1)
    {
      printf ("%s rm[%d] unexpectedly did not match\n", fail, idx);
      return 1;
    }

  if (match[0] == '@')
    {
      const char *follow = match + 1;

      if (slot->rm_so != slot->rm_eo)
        {
          printf ("%s rm[%d] not empty\n", fail, idx);
          return 1;
        }
      if (strncmp (string + slot->rm_so, follow, strlen (follow)) != 0)
        {
          printf ("%s rm[%d] not matching %s\n", fail, idx, match);
          return 1;
        }
      return 0;
    }

  /* Plain text: both the length and the bytes have to agree.  */
  size_t span = slot->rm_eo - slot->rm_so;
  if (span != strlen (match)
      || strncmp (string + slot->rm_so, match, span) != 0)
    {
      printf ("%s rm[%d] not matching %s\n", fail, idx, match);
      return 1;
    }

  return 0;
}
/* Run a single test vector.  PATTERN is compiled with CFLAGS.

   When EFLAGS is -1 the compilation is expected to fail and STRING holds
   the expected error name without its REG_ prefix; a compilation that
   succeeds anyway is tolerated only for the name "EMPTY".

   Otherwise STRING is matched with EFLAGS.  EXPECT is NULL when no match
   is expected, else the expectation for the whole match.  MATCHES, if not
   NULL, is a comma-separated list of expectations for subexpressions 1
   to 9; subexpressions beyond the end of the list must be unmatched.
   The list is split in place temporarily but restored afterwards.  Both
   EXPECT and the list entries use the notation check_match accepts.

   FAIL prefixes every diagnostic.  Returns 0 on success, 1 on failure.  */
static int
test (const char *pattern, int cflags, const char *string, int eflags,
      char *expect, char *matches, const char *fail)
{
  regex_t re;
  regmatch_t rm[10];
  int n, ret = 0;

  n = regcomp (&re, pattern, cflags);
  if (n != 0)
    {
      if (eflags != -1)
        {
          /* The pattern should have compiled; report why it did not.  */
          char buf[500];
          regerror (n, &re, buf, sizeof (buf));
          printf ("%s regcomp failed: %s\n", fail, buf);
          return 1;
        }

      /* A failure was expected; check that it is the right one.  */
      static struct { reg_errcode_t code; const char *name; } codes []
#define C(x) { REG_##x, #x }
        = { C(NOERROR), C(NOMATCH), C(BADPAT), C(ECOLLATE),
            C(ECTYPE), C(EESCAPE), C(ESUBREG), C(EBRACK),
            C(EPAREN), C(EBRACE), C(BADBR), C(ERANGE),
            C(ESPACE), C(BADRPT) };
      const char *got = NULL;

      for (size_t i = 0; i < sizeof (codes) / sizeof (codes[0]); ++i)
        if (n == codes[i].code)
          {
            got = codes[i].name;
            break;
          }

      if (got == NULL)
        {
          printf ("%s regcomp return value REG_%d\n", fail, n);
          return 1;
        }
      if (strcmp (string, got) != 0)
        {
          printf ("%s regcomp returned REG_%s (expected REG_%s)\n",
                  fail, got, string);
          return 1;
        }
      return 0;
    }

  if (eflags == -1)
    {
      regfree (&re);

      /* The test case file assumes something only guaranteed by the
         rxspencer regex implementation.  Namely that for empty
         expressions regcomp() return REG_EMPTY.  This is not the case
         for us and so we ignore this error.  */
      if (strcmp (string, "EMPTY") == 0)
        return 0;

      printf ("%s regcomp unexpectedly succeeded\n", fail);
      return 1;
    }

  /* The compiled pattern is not needed once regexec has filled RM.  */
  int exec_rc = regexec (&re, string, 10, rm, eflags);
  regfree (&re);

  if (exec_rc != 0)
    {
      if (expect == NULL)
        return 0;
      printf ("%s regexec failed\n", fail);
      return 1;
    }

  if (expect == NULL)
    {
      printf ("%s regexec unexpectedly succeeded\n", fail);
      return 1;
    }

  /* With REG_NOSUB there are no offsets to look at.  */
  if (cflags & REG_NOSUB)
    return 0;

  ret = check_match (rm, 0, string, expect, fail);
  if (matches == NULL)
    return ret;

  for (n = 1; ret == 0 && n < 10; ++n)
    {
      /* Cut the current entry out of the list for the comparison.  */
      char *comma = matches != NULL ? strchr (matches, ',') : NULL;
      if (comma != NULL)
        *comma = '\0';

      ret = check_match (rm, n, string, matches != NULL ? matches : "-",
                         fail);

      if (comma != NULL)
        {
          *comma = ',';
          matches = comma + 1;
        }
      else
        matches = NULL;
    }

  return ret;
}
/* Run one test vector after transliterating the characters listed in
   LETTERS to multibyte UTF-8 sequences.  PATTERN goes through
   mb_frob_pattern; STRING, EXPECT and MATCHES go through mb_frob_string.
   When EFLAGS is -1, STRING is an error name rather than subject text
   and is passed on untouched.  The transformed copies are handed to
   test () together with CFLAGS, EFLAGS and the diagnostic prefix FAIL,
   and are freed again before returning.

   Returns the result of test (), or 1 if one of the copies could not be
   allocated.  */
static int
mb_test (const char *pattern, int cflags, const char *string, int eflags,
         char *expect, const char *matches, const char *letters,
         const char *fail)
{
  char *pattern_mb = mb_frob_pattern (pattern, letters);
  const char *string_mb
    = eflags == -1 ? string : mb_frob_string (string, letters);
  char *expect_mb = mb_frob_string (expect, letters);
  char *matches_mb = mb_frob_string (matches, letters);
  int ret = 0;

  /* EXPECT and MATCHES may legitimately be NULL; a NULL copy counts as
     an allocation failure only when the input was not NULL.  */
  if (!pattern_mb || !string_mb
      || (expect && !expect_mb) || (matches && !matches_mb))
    {
      /* %m is the glibc conversion for strerror (errno).  Terminate the
         line so the message does not run into the next test's output.  */
      printf ("%s %m\n", fail);
      ret = 1;
    }
  else
    ret = test (pattern_mb, cflags, string_mb, eflags, expect_mb,
                matches_mb, fail);

  free (matches_mb);
  free (expect_mb);
  /* STRING_MB aliases the caller's STRING when EFLAGS is -1.  */
  if (string_mb != string)
    free ((char *) string_mb);
  free (pattern_mb);
  return ret;
}
/* Run mb_test once for each of the 15 non-empty subsets of the letter
   pairs a/A, b/B, c/C and d/D, using "UTF-8 <letters> FAIL" as the
   diagnostic prefix.  The remaining arguments are forwarded unchanged.
   Returns nonzero if any of the runs failed.  */
static int
mb_tests (const char *pattern, int cflags, const char *string, int eflags,
          char *expect, const char *matches)
{
  static const char pairs[4][2] =
    { { 'a', 'A' }, { 'b', 'B' }, { 'c', 'C' }, { 'd', 'D' } };
  /* LETTERS holds at most 8 letters plus NUL; FAIL holds at most
     "UTF-8 " + 8 letters + " FAIL" + NUL, which is exactly 20 bytes.  */
  char letters[9], fail[20];
  int ret = 0;

  /* The tests aren't supposed to work with xdigit, since a-dA-D are
     hex digits while their accented replacements are not.  */
  if (strstr (pattern, "[:xdigit:]") != NULL)
    return 0;

  for (int mask = 1; mask < 16; ++mask)
    {
      int used = 0;

      /* Bit N of MASK selects pair N.  */
      for (int bit = 0; bit < 4; ++bit)
        if (mask & (1 << bit))
          {
            letters[used++] = pairs[bit][0];
            letters[used++] = pairs[bit][1];
          }
      letters[used] = '\0';

      sprintf (fail, "UTF-8 %s FAIL", letters);
      ret |= mb_test (pattern, cflags, string, eflags, expect, matches,
                      letters, fail);
    }

  return ret;
}
/* Test driver.  Usage: tst-rxspencer [--utf8 | -u] TESTFILE

   Every line of TESTFILE that is neither empty nor a '#' comment is
   echoed to stdout and split at tabs into pattern, flags, subject string
   and the optional expected match and subexpression list.  Each vector
   is run in the C locale.  With --utf8 (or -u) it is then run again in
   the cs_CZ.UTF-8 locale, both as is and with letters transliterated to
   multibyte characters.  Vectors using the flags 'm', 'p' or '#' are
   skipped.

   Returns 0 if all vectors passed, 1 on any failure or usage error.  */
int
main (int argc, char **argv)
{
  int ret = 0;
  char *line = NULL;
  size_t line_len = 0;
  ssize_t len;
  FILE *f;
  int opt;
  static int test_utf8 = 0;
  static const struct option options[] =
    {
      {"utf8", no_argument, &test_utf8, 1},
      {NULL, 0, NULL, 0 }
    };

  /* --utf8 sets TEST_UTF8 through the flag pointer (getopt_long then
     returns 0).  The short form -u is returned as 'u' and has to be
     handled here, otherwise it would be accepted but have no effect.  */
  while ((opt = getopt_long (argc, argv, "u", options, NULL)) >= 0)
    if (opt == 'u')
      test_utf8 = 1;

  if (optind + 1 != argc)
    {
      fprintf (stderr, "Missing test filename\n");
      return 1;
    }

  f = fopen (argv[optind], "r");
  if (f == NULL)
    {
      /* Name the file that was actually tried; argv[1] may be an
         option.  */
      fprintf (stderr, "Couldn't open %s\n", argv[optind]);
      return 1;
    }

  while ((len = getline (&line, &line_len, f)) > 0)
    {
      char *pattern, *flagstr, *string, *expect, *matches, *p;
      int cflags = REG_EXTENDED, eflags = 0, try_bre_ere = 0;

      if (line[len - 1] == '\n')
        line[len - 1] = '\0';

      /* Skip comments and empty lines.  */
      if (*line == '#' || *line == '\0')
        continue;

      puts (line);
      fflush (stdout);

      /* Fields are separated by one or more tabs; "" denotes an empty
         field.  Lines with fewer than three fields are ignored.  */
      pattern = strtok (line, "\t");
      if (pattern == NULL)
        continue;

      if (strcmp (pattern, "\"\"") == 0)
        pattern += 2;

      flagstr = strtok (NULL, "\t");
      if (flagstr == NULL)
        continue;

      string = strtok (NULL, "\t");
      if (string == NULL)
        continue;

      if (strcmp (string, "\"\"") == 0)
        string += 2;

      for (p = flagstr; *p; ++p)
        switch (*p)
          {
          case '-':
            break;
          case 'b':
            cflags &= ~REG_EXTENDED;
            break;
          case '&':
            try_bre_ere = 1;
            break;
          case 'C':
            /* regcomp is expected to fail; STRING is the error name.  */
            eflags = -1;
            break;
          case 'i':
            cflags |= REG_ICASE;
            break;
          case 's':
            cflags |= REG_NOSUB;
            break;
          case 'n':
            cflags |= REG_NEWLINE;
            break;
          case '^':
            eflags |= REG_NOTBOL;
            break;
          case '$':
            eflags |= REG_NOTEOL;
            break;
          case 'm':
          case 'p':
          case '#':
            /* Not supported.  */
            flagstr = NULL;
            break;
          }

      if (flagstr == NULL)
        continue;

      replace_special_chars (pattern);
      glibc_re_syntax (pattern);
      /* An error name must not have its letters translated.  */
      if (eflags != -1)
        replace_special_chars (string);

      expect = strtok (NULL, "\t");
      matches = NULL;
      if (expect != NULL)
        {
          replace_special_chars (expect);
          matches = strtok (NULL, "\t");
          if (matches != NULL)
            replace_special_chars (matches);
        }

      setlocale (LC_ALL, "C");
      if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
          || (try_bre_ere
              && test (pattern, cflags & ~REG_EXTENDED, string, eflags,
                       expect, matches, "FAIL")))
        ret = 1;
      else if (test_utf8)
        {
          /* NOTE(review): the result of setlocale is not checked, so a
             missing cs_CZ.UTF-8 locale goes unnoticed here.  */
          setlocale (LC_ALL, "cs_CZ.UTF-8");
          if (test (pattern, cflags, string, eflags, expect, matches,
                    "UTF-8 FAIL")
              || (try_bre_ere
                  && test (pattern, cflags & ~REG_EXTENDED, string, eflags,
                           expect, matches, "UTF-8 FAIL")))
            ret = 1;
          else if (mb_tests (pattern, cflags, string, eflags, expect, matches)
                   || (try_bre_ere
                       && mb_tests (pattern, cflags & ~REG_EXTENDED, string,
                                    eflags, expect, matches)))
            ret = 1;
        }
    }

  /* getline allocated (and possibly grew) the line buffer.  */
  free (line);
  fclose (f);
  return ret;
}