diff --git a/ChangeLog b/ChangeLog index 3273204560..d99ae09f46 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,12 @@ 2003-11-13 Ulrich Drepper + * posix/Makefile: Add rules to build and run tst-rxspencer. + (distribute): Add rxspencer/tests and rxspencer/COPYRIGHT. + * posix/tst-rxspencer.c: New file. + * posix/rxspencer/tests: New file. + * posix/rxspencer/COPYRIGHT: New file. + Patch mostly by Jakub Jelinek. + * posix/regcomp.c (parse_bracket_exp): Don't check for range if this is no option given the first token. diff --git a/posix/Makefile b/posix/Makefile index faff565936..c305c5e6dc 100644 --- a/posix/Makefile +++ b/posix/Makefile @@ -34,7 +34,7 @@ distribute := confstr.h TESTS TESTS2C.sed testcases.h \ PTESTS PTESTS2C.sed ptestcases.h \ globtest.c globtest.sh wordexp-tst.sh annexc.c fnmatch_loop.c \ spawn_int.h tst-getconf.sh regcomp.c regexec.c regex_internal.c \ - regex_internal.h fork.h + regex_internal.h fork.h rxspencer/tests rxspencer/COPYRIGHT routines := \ uname \ @@ -78,7 +78,7 @@ tests := tstgetopt testfnm runtests runptests \ bug-regex8 bug-regex9 bug-regex10 bug-regex11 bug-regex12 \ bug-regex13 bug-regex14 bug-regex15 bug-regex16 \ bug-regex17 bug-regex18 bug-regex19 bug-regex20 \ - tst-nice tst-nanosleep transbug + tst-nice tst-nanosleep transbug tst-rxspencer ifeq (yes,$(build-shared)) test-srcs := globtest tests += wordexp-test tst-exec tst-spawn @@ -147,6 +147,7 @@ tst-exec-ARGS = -- $(built-program-cmd) tst-spawn-ARGS = -- $(built-program-cmd) tst-dir-ARGS = `pwd` `cd $(common-objdir)/$(subdir); pwd` `cd $(common-objdir); pwd` $(objpfx)tst-dir tst-chmod-ARGS = `pwd` +tst-rxspencer-ARGS = rxspencer/tests tst-fnmatch-ENV = LOCPATH=$(common-objpfx)localedata tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata diff --git a/posix/rxspencer/COPYRIGHT b/posix/rxspencer/COPYRIGHT new file mode 100644 index 0000000000..30c1f7a488 --- /dev/null +++ b/posix/rxspencer/COPYRIGHT @@ -0,0 +1,20 @@ +Copyright 1992, 1993, 1994, 1997 Henry Spencer. 
All rights reserved. +This software is not subject to any license of the American Telephone +and Telegraph Company or of the Regents of the University of California. + +Permission is granted to anyone to use this software for any purpose on +any computer system, and to alter it and redistribute it, subject +to the following restrictions: + +1. The author is not responsible for the consequences of use of this + software, no matter how awful, even if they arise from flaws in it. + +2. The origin of this software must not be misrepresented, either by + explicit claim or by omission. Since few users ever read sources, + credits must appear in the documentation. + +3. Altered versions must be plainly marked as such, and must not be + misrepresented as being the original software. Since few users + ever read sources, credits must appear in the documentation. + +4. This notice may not be removed or altered. diff --git a/posix/rxspencer/tests b/posix/rxspencer/tests new file mode 100644 index 0000000000..acd4623c74 --- /dev/null +++ b/posix/rxspencer/tests @@ -0,0 +1,506 @@ +# regular expression test set +# Lines are at least three fields, separated by one or more tabs. "" stands +# for an empty field. First field is an RE. Second field is flags. If +# C flag given, regcomp() is expected to fail, and the third field is the +# error name (minus the leading REG_). +# +# Otherwise it is expected to succeed, and the third field is the string to +# try matching it against. If there is no fourth field, the match is +# expected to fail. If there is a fourth field, it is the substring that +# the RE is expected to match. If there is a fifth field, it is a comma- +# separated list of what the subexpressions should match, with - indicating +# no match for that one. 
In both the fourth and fifth fields, a (sub)field +# starting with @ indicates that the (sub)expression is expected to match +# a null string followed by the stuff after the @; this provides a way to +# test where null strings match. The character `N' in REs and strings +# is newline, `S' is space, `T' is tab, `Z' is NUL. +# +# The full list of flags: +# - placeholder, does nothing +# b RE is a BRE, not an ERE +# & try it as both an ERE and a BRE +# C regcomp() error expected, third field is error name +# i REG_ICASE +# m ("mundane") REG_NOSPEC +# s REG_NOSUB (not really testable) +# n REG_NEWLINE +# ^ REG_NOTBOL +# $ REG_NOTEOL +# # REG_STARTEND (see below) +# p REG_PEND +# +# For REG_STARTEND, the start/end offsets are those of the substring +# enclosed in (). + +# basics +a & a a +abc & abc abc +abc|de - abc abc +a|b|c - abc a + +# parentheses and perversions thereof +a(b)c - abc abc +a\(b\)c b abc abc +a( C EPAREN +a( b a( a( +a\( - a( a( +a\( bC EPAREN +a\(b bC EPAREN +a(b C EPAREN +a(b b a(b a(b +# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly) +a) - a) a) +) - ) ) +# end gagging (in a just world, those *should* give EPAREN) +a) b a) a) +a\) bC EPAREN +\) bC EPAREN +a()b - ab ab +a\(\)b b ab ab + +# anchoring and REG_NEWLINE +^abc$ & abc abc +a^b - a^b +a^b b a^b a^b +a$b - a$b +a$b b a$b a$b +^ & abc @abc +$ & abc @ +^$ & "" @ +$^ - "" @ +\($\)\(^\) b "" @ +# stop retching, those are legitimate (although disgusting) +^^ - "" @ +$$ - "" @ +b$ & abNc +b$ &n abNc b +^b$ & aNbNc +^b$ &n aNbNc b +^$ &n aNNb @Nb +^$ n abc +^$ n abcN @ +$^ n aNNb @Nb +\($\)\(^\) bn aNNb @Nb +^^ n^ aNNb @Nb +$$ n aNNb @NN +^a ^ a +a$ $ a +^a ^n aNb +^b ^n aNb b +a$ $n bNa +b$ $n bNa b +a*(^b$)c* - b b +a*\(^b$\)c* b b b + +# certain syntax errors and non-errors +| C EMPTY +| b | | +* C BADRPT +* b * * ++ C BADRPT +? 
C BADRPT +"" &C EMPTY +() - abc @abc +\(\) b abc @abc +a||b C EMPTY +|ab C EMPTY +ab| C EMPTY +(|a)b C EMPTY +(a|)b C EMPTY +(*a) C BADRPT +(+a) C BADRPT +(?a) C BADRPT +({1}a) C BADRPT +\(\{1\}a\) bC BADRPT +(a|*b) C BADRPT +(a|+b) C BADRPT +(a|?b) C BADRPT +(a|{1}b) C BADRPT +^* C BADRPT +^* b * * +^+ C BADRPT +^? C BADRPT +^{1} C BADRPT +^\{1\} bC BADRPT + +# metacharacters, backslashes +a.c & abc abc +a[bc]d & abd abd +a\*c & a*c a*c +a\\b & a\b a\b +a\\\*b & a\*b a\*b +# The following test is wrong. Using \b in an BRE or ERE is undefined. +# a\bc & abc abc +a\ &C EESCAPE +a\\bc & a\bc a\bc +\{ bC BADRPT +a\[b & a[b a[b +a[b &C EBRACK +# trailing $ is a peculiar special case for the BRE code +a$ & a a +a$ & a$ +a\$ & a +a\$ & a$ a$ +a\\$ & a +a\\$ & a$ +a\\$ & a\$ +a\\$ & a\ a\ + +# back references, ugh +a\(b\)\2c bC ESUBREG +a\(b\1\)c bC ESUBREG +a\(b*\)c\1d b abbcbbd abbcbbd bb +a\(b*\)c\1d b abbcbd +a\(b*\)c\1d b abbcbbbd +^\(.\)\1 b abc +a\([bc]\)\1d b abcdabbd abbd b +a\(\([bc]\)\2\)*d b abbccd abbccd +a\(\([bc]\)\2\)*d b abbcbd +# actually, this next one probably ought to fail, but the spec is unclear +a\(\(b\)*\2\)*d b abbbd abbbd +# here is a case that no NFA implementation does right +\(ab*\)[ab]*\1 b ababaaa ababaaa a +# check out normal matching in the presence of back refs +\(a\)\1bcd b aabcd aabcd +\(a\)\1bc*d b aabcd aabcd +\(a\)\1bc*d b aabd aabd +\(a\)\1bc*d b aabcccd aabcccd +\(a\)\1bc*[ce]d b aabcccd aabcccd +^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd + +# ordinary repetitions +ab*c & abc abc +ab+c - abc abc +ab?c - abc abc +a\(*\)b b a*b a*b +a\(**\)b b ab ab +a\(***\)b bC BADRPT +*a b *a *a +**a b a a +***a bC BADRPT + +# the dreaded bounded repetitions +# The following two tests are not correct: +#{ & { { +#{abc & {abc {abc +# '{' is always a special char outside bracket expressions. 
So test only BRE: +{ b { { +{abc b {abc {abc +{1 C BADRPT +{1} C BADRPT +# Same reason as for the two tests above: +#a{b & a{b a{b +a{b b a{b a{b +a{1}b - ab ab +a\{1\}b b ab ab +a{1,}b - ab ab +a\{1,\}b b ab ab +a{1,2}b - aab aab +a\{1,2\}b b aab aab +a{1 C EBRACE +a\{1 bC EBRACE +a{1a C EBRACE +a\{1a bC EBRACE +a{1a} C BADBR +a\{1a\} bC BADBR +# These four tests check for undefined behavior. Our implementation does +# something different. +#a{,2} - a{,2} a{,2} +#a\{,2\} bC BADBR +#a{,} - a{,} a{,} +#a\{,\} bC BADBR +a{1,x} C BADBR +a\{1,x\} bC BADBR +a{1,x C EBRACE +a\{1,x bC EBRACE +# These two tests probably fail due to an arbitrary limit on the number of +# repetitions in the other implementation. +#a{300} C BADBR +#a\{300\} bC BADBR +a{1,0} C BADBR +a\{1,0\} bC BADBR +ab{0,0}c - abcac ac +ab\{0,0\}c b abcac ac +ab{0,1}c - abcac abc +ab\{0,1\}c b abcac abc +ab{0,3}c - abbcac abbc +ab\{0,3\}c b abbcac abbc +ab{1,1}c - acabc abc +ab\{1,1\}c b acabc abc +ab{1,3}c - acabc abc +ab\{1,3\}c b acabc abc +ab{2,2}c - abcabbc abbc +ab\{2,2\}c b abcabbc abbc +ab{2,4}c - abcabbc abbc +ab\{2,4\}c b abcabbc abbc +((a{1,10}){1,10}){1,10} - a a a,a + +# multiple repetitions +# Wow, there is serious disconnect here. The ERE grammar is like this: +# ERE_expression : one_char_or_coll_elem_ERE +# | '^' +# | '$' +# | '(' extended_reg_exp ')' +# | ERE_expression ERE_dupl_symbol +# ; +# where ERE_dupl_symbol is any of the repetition methods. It is clear from +# this that consecutive repetition is OK. On top of this, the one test not +# marked as failing must fail. For BREs the situation is different, so we +# use the four tests. +#a** &C BADRPT +a** bC BADRPT +#a++ C BADRPT +#a?? C BADRPT +#a*+ C BADRPT +#a*? C BADRPT +#a+* C BADRPT +#a+? C BADRPT +#a?* C BADRPT +#a?+ C BADRPT +#a{1}{1} C BADRPT +#a*{1} C BADRPT +#a+{1} C BADRPT +#a?{1} C BADRPT +#a{1}* C BADRPT +#a{1}+ C BADRPT +#a{1}? 
C BADRPT +#a*{b} - a{b} a{b} +a\{1\}\{1\} bC BADRPT +a*\{1\} bC BADRPT +a\{1\}* bC BADRPT + +# brackets, and numerous perversions thereof +a[b]c & abc abc +a[ab]c & abc abc +a[^ab]c & adc adc +a[]b]c & a]c a]c +a[[b]c & a[c a[c +a[-b]c & a-c a-c +a[^]b]c & adc adc +a[^-b]c & adc adc +a[b-]c & a-c a-c +a[b &C EBRACK +a[] &C EBRACK +a[1-3]c & a2c a2c +a[3-1]c &C ERANGE +a[1-3-5]c &C ERANGE +a[[.-.]--]c & a-c a-c +# I don't think the error value should be ERANGE since a[1-] would be +# valid, too. Expect EBRACK. +#a[1- &C ERANGE +a[1- &C EBRACK +a[[. &C EBRACK +a[[.x &C EBRACK +a[[.x. &C EBRACK +a[[.x.] &C EBRACK +a[[.x.]] & ax ax +a[[.x,.]] &C ECOLLATE +# XXX Doesn't work yet. +# a[[.one.]]b & a1b a1b +a[[.notdef.]]b &C ECOLLATE +a[[.].]]b & a]b a]b +a[[:alpha:]]c & abc abc +a[[:notdef:]]c &C ECTYPE +a[[: &C EBRACK +a[[:alpha &C EBRACK +a[[:alpha:] &C EBRACK +a[[:alpha,:] &C ECTYPE +a[[:]:]]b &C ECTYPE +a[[:-:]]b &C ECTYPE +a[[:alph:]] &C ECTYPE +a[[:alphabet:]] &C ECTYPE +[[:alnum:]]+ - -%@a0X- a0X +[[:alpha:]]+ - -%@aX0- aX +[[:blank:]]+ - aSSTb SST +[[:cntrl:]]+ - aNTb NT +[[:digit:]]+ - a019b 019 +[[:graph:]]+ - Sa%bS a%b +[[:lower:]]+ - AabC ab +[[:print:]]+ - NaSbN aSb +[[:punct:]]+ - S%-&T %-& +[[:space:]]+ - aSNTb SNT +[[:upper:]]+ - aBCd BC +[[:xdigit:]]+ - p0f3Cq 0f3C +a[[=b=]]c & abc abc +a[[= &C EBRACK +a[[=b &C EBRACK +a[[=b= &C EBRACK +a[[=b=] &C EBRACK +a[[=b,=]] &C ECOLLATE +# XXX Doesn't work yet. +#a[[=one=]]b & a1b a1b + +# complexities +a(((b)))c - abc abc +a(b|(c))d - abd abd +a(b*|c)d - abbd abbd +# just gotta have one DFA-buster, of course +a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab +# and an inline expansion in case somebody gets tricky +a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab +# and in case somebody just slips in an NFA... 
+a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights +# fish for anomalies as the number of states passes 32 +12345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789 +123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890 +1234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901 +12345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012 +123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123 +# and one really big one, beyond any plausible word width +1234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890 +# fish for problems as brackets go past 8 +[ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm +[ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo +[ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq +[ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq + +# subtleties of matching +abc & xabcy abc +a\(b\)?c\1d b acd +aBc i Abc Abc +a[Bc]*d i abBCcd abBCcd +0[[:upper:]]1 &i 0a1 0a1 +0[[:lower:]]1 &i 0A1 0A1 +a[^b]c &i abc +a[^b]c &i aBc +a[^b]c &i adc adc +[a]b[c] - abc abc +[a]b[a] - aba aba +[abc]b[abc] - abc abc +[abc]b[abd] - abd abd +a(b?c)+d - accd accd +(wee|week)(knights|night) - weeknights weeknights +(we|wee|week|frob)(knights|night|day) - weeknights weeknights +a[bc]d - xyzaaabcaababdacd abd +a[ab]c - aaabc abc +abc s abc abc +a* & b @b + +# Let's have some fun -- try to match a C comment. +# first the obvious, which looks okay at first glance... +/\*.*\*/ - /*x*/ /*x*/ +# but... 
+/\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/ +# okay, we must not match */ inside; try to do that... +/\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/ +/\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/ +# but... +/\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/ +# and a still fancier version, which does it right (I think)... +/\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/ +/\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/ +/\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/ +/\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/ +/\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/ +/\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/ + +# subexpressions +.* - abc abc - +a(b)(c)d - abcd abcd b,c +a(((b)))c - abc abc b,b,b +a(b|(c))d - abd abd b,- +a(b*|c|e)d - abbd abbd bb +a(b*|c|e)d - acd acd c +a(b*|c|e)d - ad ad @d +a(b?)c - abc abc b +a(b?)c - ac ac @c +a(b+)c - abc abc b +a(b+)c - abbbc abbbc bbb +a(b*)c - ac ac @c +(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de +# the regression tester only asks for 9 subexpressions +a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j +a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k +a([bc]?)c - abc abc b +a([bc]?)c - ac ac @c +a([bc]+)c - abc abc b +a([bc]+)c - abcc abcc bc +a([bc]+)bc - abcbc abcbc bc +a(bb+|b)b - abb abb b +a(bbb+|bb+|b)b - abb abb b +a(bbb+|bb+|b)b - abbb abbb bb +a(bbb+|bb+|b)bb - abbb abbb b +(.*).* - abcdef abcdef abcdef +(a*)* - bc @b @b + +# do we get the right subexpression when it is used more than once? 
+a(b|c)*d - ad ad - +a(b|c)*d - abcd abcd c +a(b|c)+d - abd abd b +a(b|c)+d - abcd abcd c +a(b|c?)+d - ad ad @d +a(b|c?)+d - abcd abcd @d +a(b|c){0,0}d - ad ad - +a(b|c){0,1}d - ad ad - +a(b|c){0,1}d - abd abd b +a(b|c){0,2}d - ad ad - +a(b|c){0,2}d - abcd abcd c +a(b|c){0,}d - ad ad - +a(b|c){0,}d - abcd abcd c +a(b|c){1,1}d - abd abd b +a(b|c){1,1}d - acd acd c +a(b|c){1,2}d - abd abd b +a(b|c){1,2}d - abcd abcd c +a(b|c){1,}d - abd abd b +a(b|c){1,}d - abcd abcd c +a(b|c){2,2}d - acbd acbd b +a(b|c){2,2}d - abcd abcd c +a(b|c){2,4}d - abcd abcd c +a(b|c){2,4}d - abcbd abcbd b +a(b|c){2,4}d - abcbcd abcbcd c +a(b|c){2,}d - abcd abcd c +a(b|c){2,}d - abcbd abcbd b +a(b+|((c)*))+d - abd abd @d,@d,- +# XXX Needs to be checked. +#a(b+|((c)*))+d - abcd abcd @d,@d,- + +# check out the STARTEND option +[abc] &# a(b)c b +[abc] &# a(d)c +[abc] &# a(bc)d b +[abc] &# a(dc)d c +. &# a()c +b.*c &# b(bc)c bc +b.* &# b(bc)c bc +.*c &# b(bc)c bc + +# plain strings, with the NOSPEC flag +abc m abc abc +abc m xabcy abc +abc m xyz +a*b m aba*b a*b +a*b m ab +"" mC EMPTY + +# cases involving NULs +aZb & a a +aZb &p a +aZb &p# (aZb) aZb +aZ*b &p# (ab) ab +a.b &# (aZb) aZb +a.* &# (aZb)c aZb + +# word boundaries (ick) +[[:<:]]a & a a +[[:<:]]a & ba +[[:<:]]a & -a a +a[[:>:]] & a a +a[[:>:]] & ab +a[[:>:]] & a- a +[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc +[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc +[[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc +[[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc +[[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_ +[[:<:]]a_b[[:>:]] & x_a_b + +# past problems, and suspected problems +(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1 +abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop +abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv +(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11 
+CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11
+Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz
+a?b - ab ab
+-\{0,1\}[0-9]*$ b -5 -5
+a*a*a*a*a*a*a* & aaaaaa aaaaaa
diff --git a/posix/tst-rxspencer.c b/posix/tst-rxspencer.c
new file mode 100644
index 0000000000..eed3e1820b
--- /dev/null
+++ b/posix/tst-rxspencer.c
@@ -0,0 +1,566 @@
+/* Regular expression tests.
+   Copyright (C) 2003 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Jakub Jelinek , 2003.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sys/types.h>
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <getopt.h>
+
+/* Expand the placeholder letters used by the test file, in place:
+   `N' is newline, `T' is tab, `S' is space and `Z' is NUL.  */
+static void
+replace_special_chars (char *str)
+{
+  for (; (str = strpbrk (str, "NTSZ")) != NULL; ++str)
+    switch (*str)
+      {
+      case 'N': *str = '\n'; break;
+      case 'T': *str = '\t'; break;
+      case 'S': *str = ' '; break;
+      case 'Z': *str = '\0'; break;
+      }
+}
+
+/* Rewrite the Spencer word-boundary syntax in STR, in place, to the
+   GNU operators: [[:<:]] becomes \< and [[:>:]] becomes \>.  The
+   string only ever shrinks, so no extra space is needed.  */
+static void
+glibc_re_syntax (char *str)
+{
+  char *p, *end = strchr (str, '\0') + 1;
+
+  /* Replace [[:<:]] with \< and [[:>:]] with \>.  */
+  for (p = str; (p = strstr (p, "[[:")) != NULL; )
+    if ((p[3] == '<' || p[3] == '>') && strncmp (p + 4, ":]]", 3) == 0)
+      {
+        p[0] = '\\';
+        p[1] = p[3];
+        /* END points one past the terminating NUL, so the NUL is
+           moved along with the tail.  */
+        memmove (p + 2, p + 7, end - p - 7);
+        end -= 5;
+        p += 2;
+      }
+    else
+      p += 3;
+}
+
+/* Store at DST the two-byte UTF-8 sequence that stands in for the
+   ASCII letter C and return the position just past it.  Letters
+   other than a-d/A-D store nothing and return DST unchanged.  */
+static char *
+mb_replace (char *dst, const char c)
+{
+  switch (c)
+    {
+    /* Replace a with \'a and A with \'A.  */
+    case 'a':
+      *dst++ = '\xc3';
+      *dst++ = '\xa1';
+      break;
+    case 'A':
+      *dst++ = '\xc3';
+      *dst++ = '\x81';
+      break;
+    /* Replace b with \v{c} and B with \v{C}.  */
+    case 'b':
+      *dst++ = '\xc4';
+      *dst++ = '\x8d';
+      break;
+    case 'B':
+      *dst++ = '\xc4';
+      *dst++ = '\x8c';
+      break;
+    /* Replace c with \v{d} and C with \v{D}.  */
+    case 'c':
+      *dst++ = '\xc4';
+      *dst++ = '\x8f';
+      break;
+    case 'C':
+      *dst++ = '\xc4';
+      *dst++ = '\x8e';
+      break;
+    /* Replace d with \'e and D with \'E.  */
+    case 'd':
+      *dst++ = '\xc3';
+      *dst++ = '\xa9';
+      break;
+    case 'D':
+      *dst++ = '\xc3';
+      *dst++ = '\x89';
+      break;
+    }
+  return dst;
+}
+
+/* Return a newly allocated copy of STR in which every character that
+   occurs in LETTERS has been replaced by its two-byte stand-in (see
+   mb_replace).  Returns NULL if STR is NULL or if memory runs out;
+   the caller frees the result.  */
+static char *
+mb_frob_string (const char *str, const char *letters)
+{
+  char *ret, *dst;
+  const char *src;
+
+  if (str == NULL)
+    return NULL;
+
+  /* Every input byte expands to at most two output bytes.  */
+  ret = malloc (2 * strlen (str) + 1);
+  if (ret == NULL)
+    return NULL;
+
+  for (src = str, dst = ret; *src; ++src)
+    if (strchr (letters, *src))
+      dst = mb_replace (dst, *src);
+    else
+      *dst++ = *src;
+  *dst = '\0';
+  return ret;
+}
+
+/* Like mb_frob_string, but don't replace anything between
+   [: and :], [. and .] or [= and =].  */
+
+static char *
+mb_frob_pattern (const char *str, const char *letters)
+{
+  char *ret, *dst;
+  const char *src;
+  int in_class = 0;
+
+  if (str == NULL)
+    return NULL;
+
+  ret = malloc (2 * strlen (str) + 1);
+  if (ret == NULL)
+    return NULL;
+
+  for (src = str, dst = ret; *src; ++src)
+    if (!in_class && strchr (letters, *src))
+      dst = mb_replace (dst, *src);
+    else
+      {
+        /* strchr also matches the terminating NUL, so test for the
+           end of the pattern before looking at the next byte.  */
+        if (!in_class && *src == '[' && src[1] != '\0'
+            && strchr (":.=", src[1]))
+          in_class = 1;
+        else if (in_class && *src == ']' && strchr (":.=", src[-1]))
+          in_class = 0;
+        *dst++ = *src;
+      }
+  *dst = '\0';
+  return ret;
+}
+
+/* Compare the match recorded in RM[IDX] for STRING against the
+   expectation MATCH from the test file: "-" means the
+   (sub)expression must not have matched, "@rest" means it must have
+   matched the empty string at a position where REST follows, and
+   anything else is the exact text that must have been matched.
+   Prints a message prefixed with FAIL and returns 1 on a mismatch,
+   returns 0 otherwise.  */
+static int
+check_match (regmatch_t *rm, int idx, const char *string,
+             const char *match, const char *fail)
+{
+  if (match[0] == '-' && match[1] == '\0')
+    {
+      if (rm[idx].rm_so == -1 && rm[idx].rm_eo == -1)
+        return 0;
+      printf ("%s rm[%d] unexpectedly matched\n", fail, idx);
+      return 1;
+    }
+
+  if (rm[idx].rm_so == -1 || rm[idx].rm_eo == -1)
+    {
+      printf ("%s rm[%d] unexpectedly did not match\n", fail, idx);
+      return 1;
+    }
+
+  if (match[0] == '@')
+    {
+      if (rm[idx].rm_so != rm[idx].rm_eo)
+        {
+          printf ("%s rm[%d] not empty\n", fail, idx);
+          return 1;
+        }
+
+      if (strncmp (string + rm[idx].rm_so, match + 1, strlen (match + 1)))
+        {
+          printf ("%s rm[%d] not matching %s\n", fail, idx, match);
+          return 1;
+        }
+      return 0;
+    }
+
+  if (rm[idx].rm_eo - rm[idx].rm_so != strlen (match)
+      || strncmp (string + rm[idx].rm_so, match,
+                  rm[idx].rm_eo - rm[idx].rm_so))
+    {
+      printf ("%s rm[%d] not matching %s\n", fail, idx, match);
+      return 1;
+    }
+
+  return 0;
+}
+
+/* Run one test case.  PATTERN is compiled with CFLAGS.  If EFLAGS is
+   -1 regcomp is expected to fail and STRING is the name of the
+   expected error code without the REG_ prefix.  Otherwise STRING is
+   matched with EFLAGS; EXPECT is the text the whole match must cover
+   (NULL if no match is expected) and MATCHES is the comma-separated
+   list of expected subexpression matches (NULL to skip checking
+   them).  MATCHES is modified temporarily while it is being split.
+   Diagnostics are prefixed with FAIL.  Returns 0 on success, 1 on
+   failure.  */
+static int
+test (const char *pattern, int cflags, const char *string, int eflags,
+      char *expect, char *matches, const char *fail)
+{
+  regex_t re;
+  regmatch_t rm[10];
+  int n, ret = 0;
+
+  n = regcomp (&re, pattern, cflags);
+  if (n != 0)
+    {
+      if (eflags == -1)
+        {
+          static struct { reg_errcode_t code; const char *name; } codes []
+#define C(x) { REG_##x, #x }
+            = { C(NOERROR), C(NOMATCH), C(BADPAT), C(ECOLLATE),
+                C(ECTYPE), C(EESCAPE), C(ESUBREG), C(EBRACK),
+                C(EPAREN), C(EBRACE), C(BADBR), C(ERANGE),
+                C(ESPACE), C(BADRPT) };
+
+          for (size_t i = 0; i < sizeof (codes) / sizeof (codes[0]); ++i)
+            if (n == codes[i].code)
+              {
+                if (strcmp (string, codes[i].name))
+                  {
+                    printf ("%s regcomp returned REG_%s (expected REG_%s)\n",
+                            fail, codes[i].name, string);
+                    return 1;
+                  }
+                return 0;
+              }
+
+          printf ("%s regcomp return value REG_%d\n", fail, n);
+          return 1;
+        }
+
+      char buf[500];
+      regerror (n, &re, buf, sizeof (buf));
+      printf ("%s regcomp failed: %s\n", fail, buf);
+      return 1;
+    }
+
+  if (eflags == -1)
+    {
+      regfree (&re);
+
+      /* The test case file assumes something only guaranteed by the
+         rxspencer regex implementation.  Namely that for empty
+         expressions regcomp() return REG_EMPTY.  This is not the case
+         for us and so we ignore this error.  */
+      if (strcmp (string, "EMPTY") == 0)
+        return 0;
+
+      printf ("%s regcomp unexpectedly succeeded\n", fail);
+      return 1;
+    }
+
+  if (regexec (&re, string, 10, rm, eflags))
+    {
+      regfree (&re);
+      if (expect == NULL)
+        return 0;
+      printf ("%s regexec failed\n", fail);
+      return 1;
+    }
+
+  regfree (&re);
+
+  if (expect == NULL)
+    {
+      printf ("%s regexec unexpectedly succeeded\n", fail);
+      return 1;
+    }
+
+  if (cflags & REG_NOSUB)
+    return 0;
+
+  ret = check_match (rm, 0, string, expect, fail);
+  if (matches == NULL)
+    return ret;
+
+  /* Check the subexpressions.  Once the list in MATCHES runs out the
+     remaining ones are expected not to have matched.  */
+  for (n = 1; ret == 0 && n < 10; ++n)
+    {
+      char *p = NULL;
+
+      if (matches)
+        {
+          p = strchr (matches, ',');
+          if (p != NULL)
+            *p = '\0';
+        }
+      ret = check_match (rm, n, string, matches ?: "-", fail);
+      if (p)
+        {
+          *p = ',';
+          matches = p + 1;
+        }
+      else
+        matches = NULL;
+    }
+
+  return ret;
+}
+
+/* Run the test case with every letter of LETTERS replaced by its
+   two-byte UTF-8 stand-in in the pattern, the subject string and the
+   expectations.  When EFLAGS is -1 STRING is an error name and is
+   passed through unchanged.  Returns 0 on success, 1 on failure.  */
+static int
+mb_test (const char *pattern, int cflags, const char *string, int eflags,
+         char *expect, const char *matches, const char *letters,
+         const char *fail)
+{
+  char *pattern_mb = mb_frob_pattern (pattern, letters);
+  const char *string_mb
+    = eflags == -1 ? string : mb_frob_string (string, letters);
+  char *expect_mb = mb_frob_string (expect, letters);
+  char *matches_mb = mb_frob_string (matches, letters);
+  int ret = 0;
+
+  if (!pattern_mb || !string_mb
+      || (expect && !expect_mb) || (matches && !matches_mb))
+    {
+      printf ("%s %m\n", fail);
+      ret = 1;
+    }
+  else
+    ret = test (pattern_mb, cflags, string_mb, eflags, expect_mb,
+                matches_mb, fail);
+
+  free (matches_mb);
+  free (expect_mb);
+  if (string_mb != string)
+    free ((char *) string_mb);
+  free (pattern_mb);
+  return ret;
+}
+
+/* Run mb_test for each of the 15 non-empty subsets of the letter
+   pairs a/A, b/B, c/C and d/D.  Returns nonzero if any run failed.  */
+static int
+mb_tests (const char *pattern, int cflags, const char *string, int eflags,
+          char *expect, const char *matches)
+{
+  int ret = 0;
+  int i;
+  char letters[9], fail[20];
+
+  /* The tests aren't supposed to work with xdigit, since a-dA-D are
+     hex digits while \'a \'A \v{c}\v{C}\v{d}\v{D}\'e \'E are not.  */
+  if (strstr (pattern, "[:xdigit:]"))
+    return 0;
+
+  for (i = 1; i < 16; ++i)
+    {
+      char *p = letters;
+      if (i & 1)
+        *p++ = 'a', *p++ = 'A';
+      if (i & 2)
+        *p++ = 'b', *p++ = 'B';
+      if (i & 4)
+        *p++ = 'c', *p++ = 'C';
+      if (i & 8)
+        *p++ = 'd', *p++ = 'D';
+      *p++ = '\0';
+      snprintf (fail, sizeof (fail), "UTF-8 %s FAIL", letters);
+      ret |= mb_test (pattern, cflags, string, eflags, expect, matches,
+                      letters, fail);
+    }
+  return ret;
+}
+
+/* Usage: tst-rxspencer [-u | --utf8] TESTFILE.  Runs every supported
+   test case in TESTFILE in the C locale and, if requested, again in
+   cs_CZ.UTF-8 both unchanged and with letters replaced by multibyte
+   characters.  Exits with status 0 only if every test passed.  */
+int
+main (int argc, char **argv)
+{
+  int ret = 0;
+  char *line = NULL;
+  size_t line_len = 0;
+  ssize_t len;
+  FILE *f;
+  int opt;
+  static int test_utf8 = 0;
+  static const struct option options[] =
+    {
+      {"utf8", no_argument, &test_utf8, 1},
+      {NULL, 0, NULL, 0 }
+    };
+
+  /* --utf8 sets test_utf8 through the flag pointer; the short form has
+     to be handled here.  */
+  while ((opt = getopt_long (argc, argv, "u", options, NULL)) >= 0)
+    if (opt == 'u')
+      test_utf8 = 1;
+
+  if (optind + 1 != argc)
+    {
+      fprintf (stderr, "Missing test filename\n");
+      return 1;
+    }
+
+  f = fopen (argv[optind], "r");
+  if (f == NULL)
+    {
+      fprintf (stderr, "Couldn't open %s\n", argv[optind]);
+      return 1;
+    }
+
+  while ((len = getline (&line, &line_len, f)) > 0)
+    {
+      char *pattern, *flagstr, *string, *expect, *matches, *p;
+      int cflags = REG_EXTENDED, eflags = 0, try_bre_ere = 0;
+
+      if (line[len - 1] == '\n')
+        line[len - 1] = '\0';
+
+      /* Skip comments and empty lines.  */
+      if (*line == '#' || *line == '\0')
+        continue;
+
+      puts (line);
+      fflush (stdout);
+
+      pattern = strtok (line, "\t");
+      if (pattern == NULL)
+        continue;
+
+      if (strcmp (pattern, "\"\"") == 0)
+        pattern += 2;
+
+      flagstr = strtok (NULL, "\t");
+      if (flagstr == NULL)
+        continue;
+
+      string = strtok (NULL, "\t");
+      if (string == NULL)
+        continue;
+
+      if (strcmp (string, "\"\"") == 0)
+        string += 2;
+
+      for (p = flagstr; *p; ++p)
+        switch (*p)
+          {
+          case '-':
+            break;
+          case 'b':
+            cflags &= ~REG_EXTENDED;
+            break;
+          case '&':
+            try_bre_ere = 1;
+            break;
+          case 'C':
+            eflags = -1;
+            break;
+          case 'i':
+            cflags |= REG_ICASE;
+            break;
+          case 's':
+            cflags |= REG_NOSUB;
+            break;
+          case 'n':
+            cflags |= REG_NEWLINE;
+            break;
+          case '^':
+            eflags |= REG_NOTBOL;
+            break;
+          case '$':
+            eflags |= REG_NOTEOL;
+            break;
+          case 'm':
+          case 'p':
+          case '#':
+            /* Not supported.  */
+            flagstr = NULL;
+            break;
+          }
+
+      if (flagstr == NULL)
+        continue;
+
+      replace_special_chars (pattern);
+      glibc_re_syntax (pattern);
+      if (eflags != -1)
+        replace_special_chars (string);
+
+      expect = strtok (NULL, "\t");
+      matches = NULL;
+      if (expect != NULL)
+        {
+          replace_special_chars (expect);
+          matches = strtok (NULL, "\t");
+          if (matches != NULL)
+            replace_special_chars (matches);
+        }
+
+      setlocale (LC_ALL, "C");
+      if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
+          || (try_bre_ere
+              && test (pattern, cflags & ~REG_EXTENDED, string, eflags,
+                       expect, matches, "FAIL")))
+        ret = 1;
+      else if (test_utf8)
+        {
+          setlocale (LC_ALL, "cs_CZ.UTF-8");
+          if (test (pattern, cflags, string, eflags, expect, matches,
+                    "UTF-8 FAIL")
+              || (try_bre_ere
+                  && test (pattern, cflags & ~REG_EXTENDED, string, eflags,
+                           expect, matches, "UTF-8 FAIL")))
+            ret = 1;
+          else if (mb_tests (pattern, cflags, string, eflags, expect, matches)
+                   || (try_bre_ere
+                       && mb_tests (pattern, cflags & ~REG_EXTENDED, string,
+                                    eflags, expect, matches)))
+            ret = 1;
+        }
+    }
+
+  free (line);
+  fclose (f);
+  return ret;
+}