From c13c99fa92bff8320d2af23717c3e4d656923932 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 20 Nov 2003 23:36:40 +0000 Subject: [PATCH] Update. 2003-11-20 Ulrich Drepper * posix/PTESTS: Fix first test in GA143. 2003-11-20 Jakub Jelinek * posix/regex_internal.h (re_dfastate_t): Remove trtable_search. Add word_trtable. * posix/regex_internal.c (create_newstate_common, free_state): Don't free trtable_search. * posix/regexec.c (check_matching): Remove fl_search argument. (transit_state_sb): Likewise. #ifdef out as unused. (build_trtable): Remove fl_search argument. Set state->word_trtable and state->trtable. Build separate word and non-word tables if multi-byte and they differ for some character. (transit_state): Remove fl_search argument. Don't update state->trtable here. Handle state->word_trtable. #ifdef out unused call to transit_state_sb. (re_search_internal): Update check_matching caller. (group_nodes_into_DFAstates): Don't clear non-ascii chars in accepts bitmask for multi-byte locales. * posix/bug-regex19.c (tests): Enable some commented out tests, add 2 new tests. * posix/tst-rxspencer.c (mb_tests): Don't test [[=b=]] for now as multi-byte. Don't run identical multi-byte tests multiple times unnecessarily. (main): Check setlocale return value. * posix/Makefile (tst-rxspencer-ARGS): Add --utf8 argument. (tst-rxspencer-ENV): Remove MALLOC_TRACE, add LOCPATH. ($(objpfx)tst-rxspencer-mem): Run another tst-rxspencer test here, without --utf8 argument but with MALLOC_TRACE. --- ChangeLog | 32 +++++++ localedata/ChangeLog | 4 + localedata/Makefile | 2 +- posix/Makefile | 11 ++- posix/PTESTS | 2 +- posix/bug-regex19.c | 11 ++- posix/regex_internal.c | 2 - posix/regex_internal.h | 2 +- posix/regexec.c | 205 ++++++++++++++++++----------------------- posix/tst-rxspencer.c | 42 ++++++--- 10 files changed, 176 insertions(+), 137 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4fa706acc8..4eaa92c619 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,35 @@ +2003-11-20 Ulrich Drepper + + * posix/PTESTS: Fix first test in GA143. + +2003-11-20 Jakub Jelinek + + * posix/regex_internal.h (re_dfastate_t): Remove trtable_search. + Add word_trtable. + * posix/regex_internal.c (create_newstate_common, free_state): + Don't free trtable_search. + * posix/regexec.c (check_matching): Remove fl_search argument. + (transit_state_sb): Likewise. #ifdef out as unused. + (build_trtable): Remove fl_search argument. Set state->word_trtable + and state->trtable. Build separate word and non-word tables if + multi-byte and they differ for some character. + (transit_state): Remove fl_search argument. Don't update + state->trtable here. Handle state->word_trtable. + #ifdef out unused call to transit_state_sb. + (re_search_internal): Update check_matching caller. + (group_nodes_into_DFAstates): Don't clear non-ascii chars in accepts + bitmask for multi-byte locales. + * posix/bug-regex19.c (tests): Enable some commented out tests, add + 2 new tests. + * posix/tst-rxspencer.c (mb_tests): Don't test [[=b=]] for now as + multi-byte. Don't run identical multi-byte tests multiple times + unnecessarily. + (main): Check setlocale return value. + * posix/Makefile (tst-rxspencer-ARGS): Add --utf8 argument. + (tst-rxspencer-ENV): Remove MALLOC_TRACE, add LOCPATH. + ($(objpfx)tst-rxspencer-mem): Run another tst-rxspencer test + here, without --utf8 argument but with MALLOC_TRACE. + 2003-11-19 Jakub Jelinek * posix/regexec.c (extend_buffers): Don't allocate diff --git a/localedata/ChangeLog b/localedata/ChangeLog index 77ace9d0ce..fcce659c0c 100644 --- a/localedata/ChangeLog +++ b/localedata/ChangeLog @@ -1,3 +1,7 @@ +2003-11-20 Jakub Jelinek + + * Makefile (LOCALES): Add cs_CZ.UTF-8. + 2003-11-15 Ulrich Drepper * Makefile (tst-leaks-ENV): Add LOCPATH. diff --git a/localedata/Makefile b/localedata/Makefile index ebba83a42a..68dd9e8fef 100644 --- a/localedata/Makefile +++ b/localedata/Makefile @@ -132,7 +132,7 @@ LOCALES := de_DE.ISO-8859-1 de_DE.UTF-8 en_US.ANSI_X3.4-1968 \ en_US.ISO-8859-1 ja_JP.EUC-JP da_DK.ISO-8859-1 \ hr_HR.ISO-8859-2 sv_SE.ISO-8859-1 ja_JP.SJIS fr_FR.ISO-8859-1 \ vi_VN.TCVN5712-1 nb_NO.ISO-8859-1 nn_NO.ISO-8859-1 \ - tr_TR.UTF-8 + tr_TR.UTF-8 cs_CZ.UTF-8 LOCALE_SRCS := $(shell echo "$(LOCALES)"|sed 's/\([^ .]*\)[^ ]*/\1/g') CHARMAPS := $(shell echo "$(LOCALES)" | \ sed -e 's/[^ .]*[.]\([^ ]*\)/\1/g' -e s/SJIS/SHIFT_JIS/g) diff --git a/posix/Makefile b/posix/Makefile index 692b474764..c74b631241 100644 --- a/posix/Makefile +++ b/posix/Makefile @@ -148,7 +148,6 @@ tst-exec-ARGS = -- $(built-program-cmd) tst-spawn-ARGS = -- $(built-program-cmd) tst-dir-ARGS = `pwd` `cd $(common-objdir)/$(subdir); pwd` `cd $(common-objdir); pwd` $(objpfx)tst-dir tst-chmod-ARGS = `pwd` -tst-rxspencer-ARGS = rxspencer/tests tst-fnmatch-ENV = LOCPATH=$(common-objpfx)localedata tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata @@ -160,6 +159,8 @@ bug-regex17-ENV = LOCPATH=$(common-objpfx)localedata bug-regex18-ENV = LOCPATH=$(common-objpfx)localedata bug-regex19-ENV = LOCPATH=$(common-objpfx)localedata bug-regex20-ENV = LOCPATH=$(common-objpfx)localedata +tst-rxspencer-ARGS = --utf8 rxspencer/tests +tst-rxspencer-ENV = LOCPATH=$(common-objpfx)localedata testcases.h: TESTS TESTS2C.sed sed -f TESTS2C.sed < $< > $@T @@ -207,9 +208,13 @@ bug-regex21-ENV = MALLOC_TRACE=$(objpfx)bug-regex21.mtrace $(objpfx)bug-regex21-mem: $(objpfx)bug-regex21.out $(common-objpfx)malloc/mtrace $(objpfx)bug-regex21.mtrace > $@ -tst-rxspencer-ENV = MALLOC_TRACE=$(objpfx)tst-rxspencer.mtrace - +# tst-rxspencer.mtrace is generated only when run without --utf8 +# option, since otherwise the file has almost 100M and takes very long +# time to process. $(objpfx)tst-rxspencer-mem: $(objpfx)tst-rxspencer.out + MALLOC_TRACE=$(objpfx)tst-rxspencer.mtrace $(tst-rxspencer-ENV) \ + $(run-program-prefix) $(objpfx)tst-rxspencer rxspencer/tests \ + > /dev/null $(common-objpfx)malloc/mtrace $(objpfx)tst-rxspencer.mtrace > $@ $(objpfx)tst-getconf.out: tst-getconf.sh $(objpfx)getconf diff --git a/posix/PTESTS b/posix/PTESTS index 8235384fec..7d2676e20f 100644 --- a/posix/PTESTS +++ b/posix/PTESTS @@ -270,7 +270,7 @@ 1¦63¦a\{1,63\}¦aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa¦ # 2.8.3.4 BRE Precedence # GA143 -2¦20¦\^\[[[.].]]\\(\\1\\)\\*\\{1,2\\}\$¦a^[]\(1\)\*\{1,2\}$b¦ +2¦20¦\^\[[[.].]]\\(\\1\\)\*\\{1,2\\}\$¦a^[]\(1\)*\{1,2\}$b¦ 1¦6¦[[=*=]][[=\=]][[=]=]][[===]][[...]][[:punct:]]¦*\]=.;¦ 1¦6¦[$\(*\)^]*¦$\()*^¦ 1¦1¦[\1]¦1¦ diff --git a/posix/bug-regex19.c b/posix/bug-regex19.c index 837ab654bc..fb870338c3 100644 --- a/posix/bug-regex19.c +++ b/posix/bug-regex19.c @@ -37,17 +37,21 @@ static struct \xc3\x96 LATIN CAPITAL LETTER O WITH DIAERESIS \xe2\x80\x94 EM DASH */ /* Should not match. */ + {RE_SYNTAX_POSIX_BASIC, "\\", "aAAO", 1, -1}, + {RE_SYNTAX_POSIX_BASIC, "\\bA", "aOAA", 0, -1}, {RE_SYNTAX_POSIX_BASIC, "\\bA", "aOAA", 2, -1}, {RE_SYNTAX_POSIX_BASIC, "A\\b", "aAAO", 1, -1}, + {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 0, -1}, {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 3, -1}, {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\>", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1}, #if 0 - /* XXX Not used since they fail so far. */ + /* XXX these 2 tests still fail. */ + {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 0, -1}, {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 3, -1}, - {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1}, #endif + {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1}, /* Should match. */ {RE_SYNTAX_POSIX_BASIC, "\\", "a\xc3\x84\xc3\x84\xe2\x80\x94", 1, 3}, @@ -67,7 +69,6 @@ static struct {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xe2\x80\x94\xc3\x84\xc3\x84", 4, 4}, {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xe2\x80\x94", 1, 3}, {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84", 1, 3} -#endif }; int diff --git a/posix/regex_internal.c b/posix/regex_internal.c index 859fe16c61..71496ab906 100644 --- a/posix/regex_internal.c +++ b/posix/regex_internal.c @@ -1207,7 +1207,6 @@ create_newstate_common (dfa, nodes, hash) return NULL; } newstate->trtable = NULL; - newstate->trtable_search = NULL; newstate->hash = hash; return newstate; } @@ -1369,6 +1368,5 @@ free_state (state) } re_node_set_free (&state->nodes); re_free (state->trtable); - re_free (state->trtable_search); re_free (state); } diff --git a/posix/regex_internal.h b/posix/regex_internal.h index 628dc94066..0230b5d73e 100644 --- a/posix/regex_internal.h +++ b/posix/regex_internal.h @@ -456,7 +456,6 @@ struct re_dfastate_t re_node_set nodes; re_node_set *entrance_nodes; struct re_dfastate_t **trtable; - struct re_dfastate_t **trtable_search; /* If this state is a special state. A state is a special state if the state is the halt state, or a anchor. */ @@ -469,6 +468,7 @@ struct re_dfastate_t /* If this state has backreference node(s). */ unsigned int has_backref : 1; unsigned int has_constraint : 1; + unsigned int word_trtable : 1; }; typedef struct re_dfastate_t re_dfastate_t; diff --git a/posix/regexec.c b/posix/regexec.c index 4688c9babb..91c48b3c4e 100644 --- a/posix/regexec.c +++ b/posix/regexec.c @@ -57,7 +57,7 @@ static re_dfastate_t *acquire_init_state_context (reg_errcode_t *err, static reg_errcode_t prune_impossible_nodes (const regex_t *preg, re_match_context_t *mctx); static int check_matching (const regex_t *preg, re_match_context_t *mctx, - int fl_search, int fl_longest_match); + int fl_longest_match); static int check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context); static int check_halt_state_context (const regex_t *preg, @@ -123,15 +123,16 @@ static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, re_dfastate_t **src, int num); static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg, re_match_context_t *mctx, - re_dfastate_t *state, int fl_search); + re_dfastate_t *state); static reg_errcode_t check_subexp_matching_top (re_dfa_t *dfa, re_match_context_t *mctx, re_node_set *cur_nodes, int str_idx); +#if 0 static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg, re_dfastate_t *pstate, - int fl_search, re_match_context_t *mctx); +#endif #ifdef RE_ENABLE_I18N static reg_errcode_t transit_state_mb (const regex_t *preg, re_dfastate_t *pstate, @@ -173,8 +174,7 @@ static reg_errcode_t expand_bkref_cache (const regex_t *preg, int last_str, int subexp_num, int fl_open); static re_dfastate_t **build_trtable (const regex_t *dfa, - const re_dfastate_t *state, - int fl_search); + re_dfastate_t *state); #ifdef RE_ENABLE_I18N static int check_node_accept_bytes (const regex_t *preg, int node_idx, const re_string_t *input, int idx); @@ -741,7 +741,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, /* It seems to be appropriate one, then use the matcher. */ /* We assume that the matching starts from 0. */ mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0; - match_last = check_matching (preg, &mctx, 0, fl_longest_match); + match_last = check_matching (preg, &mctx, fl_longest_match); if (match_last != -1) { if (BE (match_last == -2, 0)) @@ -919,8 +919,8 @@ acquire_init_state_context (err, preg, mctx, idx) if (dfa->init_state->has_constraint) { unsigned int context; - context = re_string_context_at (mctx->input, idx - 1, mctx->eflags, - preg->newline_anchor); + context = re_string_context_at (mctx->input, idx - 1, mctx->eflags, + preg->newline_anchor); if (IS_WORD_CONTEXT (context)) return dfa->init_state_word; else if (IS_ORDINARY_CONTEXT (context)) @@ -947,16 +947,15 @@ acquire_init_state_context (err, preg, mctx, idx) /* Check whether the regular expression match input string INPUT or not, and return the index where the matching end, return -1 if not match, or return -2 in case of an error. - FL_SEARCH means we must search where the matching starts, FL_LONGEST_MATCH means we want the POSIX longest matching. Note that the matcher assume that the maching starts from the current index of the buffer. */ static int -check_matching (preg, mctx, fl_search, fl_longest_match) +check_matching (preg, mctx, fl_longest_match) const regex_t *preg; re_match_context_t *mctx; - int fl_search, fl_longest_match; + int fl_longest_match; { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; reg_errcode_t err; @@ -1006,31 +1005,15 @@ check_matching (preg, mctx, fl_search, fl_longest_match) while (!re_string_eoi (mctx->input)) { - cur_state = transit_state (&err, preg, mctx, cur_state, - fl_search && !match); + cur_state = transit_state (&err, preg, mctx, cur_state); if (cur_state == NULL) /* Reached at the invalid state or an error. */ { cur_str_idx = re_string_cur_idx (mctx->input); if (BE (err != REG_NOERROR, 0)) return -2; - if (fl_search && !match) - { - /* Restart from initial state, since we are searching - the point from where matching start. */ -#ifdef RE_ENABLE_I18N - if (dfa->mb_cur_max == 1 - || re_string_first_byte (mctx->input, cur_str_idx)) -#endif /* RE_ENABLE_I18N */ - cur_state = acquire_init_state_context (&err, preg, mctx, - cur_str_idx); - if (BE (cur_state == NULL && err != REG_NOERROR, 0)) - return -2; - if (mctx->state_log != NULL) - mctx->state_log[cur_str_idx] = cur_state; - } - else if (!fl_longest_match && match) + if (!fl_longest_match && match) break; - else /* (fl_longest_match && match) || (!fl_search && !match) */ + else { if (mctx->state_log == NULL) break; @@ -2069,12 +2052,11 @@ sift_states_iter_mb (preg, mctx, sctx, node_idx, str_idx, max_str_idx) update the destination of STATE_LOG. */ static re_dfastate_t * -transit_state (err, preg, mctx, state, fl_search) +transit_state (err, preg, mctx, state) reg_errcode_t *err; const regex_t *preg; re_match_context_t *mctx; re_dfastate_t *state; - int fl_search; { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; re_dfastate_t **trtable, *next_state; @@ -2113,24 +2095,40 @@ transit_state (err, preg, mctx, state, fl_search) { /* Use transition table */ ch = re_string_fetch_byte (mctx->input); - trtable = fl_search ? state->trtable_search : state->trtable; + trtable = state->trtable; if (trtable == NULL) { - trtable = build_trtable (preg, state, fl_search); - if (fl_search) - state->trtable_search = trtable; - else - state->trtable = trtable; + trtable = build_trtable (preg, state); + if (trtable == NULL) + { + *err = REG_ESPACE; + return NULL; + } } - next_state = trtable[ch]; + if (BE (state->word_trtable, 0)) + { + unsigned int context; + context + = re_string_context_at (mctx->input, + re_string_cur_idx (mctx->input) - 1, + mctx->eflags, preg->newline_anchor); + if (IS_WORD_CONTEXT (context)) + next_state = trtable[ch + SBC_MAX]; + else + next_state = trtable[ch]; + } + else + next_state = trtable[ch]; } +#if 0 else { /* don't use transition table */ - next_state = transit_state_sb (err, preg, state, fl_search, mctx); + next_state = transit_state_sb (err, preg, state, mctx); if (BE (next_state == NULL && err != REG_NOERROR, 0)) return NULL; } +#endif } cur_idx = re_string_cur_idx (mctx->input); @@ -2242,15 +2240,15 @@ check_subexp_matching_top (dfa, mctx, cur_nodes, str_idx) return REG_NOERROR; } +#if 0 /* Return the next state to which the current state STATE will transit by accepting the current input byte. */ static re_dfastate_t * -transit_state_sb (err, preg, state, fl_search, mctx) +transit_state_sb (err, preg, state, mctx) reg_errcode_t *err; const regex_t *preg; re_dfastate_t *state; - int fl_search; re_match_context_t *mctx; { re_dfa_t *dfa = (re_dfa_t *) preg->buffer; @@ -2276,29 +2274,6 @@ transit_state_sb (err, preg, state, fl_search, mctx) } } } - if (fl_search) - { -#ifdef RE_ENABLE_I18N - int not_initial = 0; - if (dfa->mb_cur_max > 1) - for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt) - if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER) - { - not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial; - break; - } - if (!not_initial) -#endif - { - *err = re_node_set_merge (&next_nodes, - dfa->init_state->entrance_nodes); - if (BE (*err != REG_NOERROR, 0)) - { - re_node_set_free (&next_nodes); - return NULL; - } - } - } context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags, preg->newline_anchor); next_state = re_acquire_state_context (err, dfa, &next_nodes, context); @@ -2309,6 +2284,7 @@ transit_state_sb (err, preg, state, fl_search, mctx) re_string_skip_bytes (mctx->input, 1); return next_state; } +#endif #ifdef RE_ENABLE_I18N static reg_errcode_t @@ -3117,10 +3093,9 @@ expand_bkref_cache (preg, mctx, cur_nodes, cur_str, last_str, subexp_num, Return the new table if succeeded, otherwise return NULL. */ static re_dfastate_t ** -build_trtable (preg, state, fl_search) +build_trtable (preg, state) const regex_t *preg; - const re_dfastate_t *state; - int fl_search; + re_dfastate_t *state; { reg_errcode_t err; re_dfa_t *dfa = (re_dfa_t *) preg->buffer; @@ -3154,6 +3129,7 @@ build_trtable (preg, state, fl_search) /* Initialize transiton table. */ trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); + state->word_trtable = 0; if (BE (trtable == NULL, 0)) { if (dests_node_malloced) @@ -3170,7 +3146,10 @@ build_trtable (preg, state, fl_search) free (dests_node); /* Return NULL in case of an error, trtable otherwise. */ if (ndests == 0) - return trtable; + { + state->trtable = trtable; + return trtable; + } free (trtable); return NULL; } @@ -3224,26 +3203,6 @@ out_free: goto out_free; } } - /* If search flag is set, merge the initial state. */ - if (fl_search) - { -#ifdef RE_ENABLE_I18N - int not_initial = 0; - for (j = 0; j < follows.nelem; ++j) - if (dfa->nodes[follows.elems[j]].type == CHARACTER) - { - not_initial = dfa->nodes[follows.elems[j]].mb_partial; - break; - } - if (!not_initial) -#endif - { - err = re_node_set_merge (&follows, - dfa->init_state->entrance_nodes); - if (BE (err != REG_NOERROR, 0)) - goto out_free; - } - } dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0); if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0)) goto out_free; @@ -3274,31 +3233,41 @@ out_free: for (j = 0; j < UINT_BITS; ++j, ++ch) if ((acceptable[i] >> j) & 1) { - /* The current state accepts the character ch. */ - if (IS_WORD_CHAR (ch)) - { - for (k = 0; k < ndests; ++k) - if ((dests_ch[k][i] >> j) & 1) + for (k = 0; k < ndests; ++k) + if ((dests_ch[k][i] >> j) & 1) + { + /* k-th destination accepts the word character ch. */ + if (state->word_trtable) { - /* k-th destination accepts the word character ch. */ - trtable[ch] = dest_states_word[k]; - /* There must be only one destination which accepts - character ch. See group_nodes_into_DFAstates. */ - break; - } - } - else /* not WORD_CHAR */ - { - for (k = 0; k < ndests; ++k) - if ((dests_ch[k][i] >> j) & 1) - { - /* k-th destination accepts the non-word character ch. */ trtable[ch] = dest_states[k]; - /* There must be only one destination which accepts - character ch. See group_nodes_into_DFAstates. */ - break; + trtable[ch + SBC_MAX] = dest_states_word[k]; } - } + else if (dfa->mb_cur_max > 1 + && dest_states[k] != dest_states_word[k]) + { + re_dfastate_t **new_trtable; + + new_trtable = (re_dfastate_t **) + realloc (trtable, + sizeof (re_dfastate_t *) + * 2 * SBC_MAX); + if (BE (new_trtable == NULL, 0)) + goto out_free; + memcpy (new_trtable + SBC_MAX, new_trtable, + sizeof (re_dfastate_t *) * SBC_MAX); + trtable = new_trtable; + state->word_trtable = 1; + trtable[ch] = dest_states[k]; + trtable[ch + SBC_MAX] = dest_states_word[k]; + } + else if (IS_WORD_CHAR (ch)) + trtable[ch] = dest_states_word[k]; + else + trtable[ch] = dest_states[k]; + /* There must be only one destination which accepts + character ch. See group_nodes_into_DFAstates. */ + break; + } } /* new line */ if (bitset_contain (acceptable, NEWLINE_CHAR)) @@ -3309,6 +3278,8 @@ out_free: { /* k-th destination accepts newline character. */ trtable[NEWLINE_CHAR] = dest_states_nl[k]; + if (state->word_trtable) + trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[k]; /* There must be only one destination which accepts newline. See group_nodes_into_DFAstates. */ break; @@ -3325,6 +3296,7 @@ out_free: if (dests_node_malloced) free (dests_node); + state->trtable = trtable; return trtable; } @@ -3386,6 +3358,8 @@ group_nodes_into_DFAstates (preg, state, dests_node, dests_ch) match it the context. */ if (constraint) { + int word_char_max; + if (constraint & NEXT_NEWLINE_CONSTRAINT) { int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); @@ -3400,11 +3374,16 @@ group_nodes_into_DFAstates (preg, state, dests_node, dests_ch) bitset_empty (accepts); continue; } + + /* This assumes ASCII compatible locale. We cannot say + anything about the non-ascii chars. */ + word_char_max + = dfa->mb_cur_max > 1 ? BITSET_UINTS / 2 : BITSET_UINTS; if (constraint & NEXT_WORD_CONSTRAINT) - for (j = 0; j < BITSET_UINTS; ++j) + for (j = 0; j < word_char_max; ++j) accepts[j] &= dfa->word_char[j]; if (constraint & NEXT_NOTWORD_CONSTRAINT) - for (j = 0; j < BITSET_UINTS; ++j) + for (j = 0; j < word_char_max; ++j) accepts[j] &= ~dfa->word_char[j]; } diff --git a/posix/tst-rxspencer.c b/posix/tst-rxspencer.c index 45bafda7a7..1b4b56f333 100644 --- a/posix/tst-rxspencer.c +++ b/posix/tst-rxspencer.c @@ -350,16 +350,28 @@ mb_tests (const char *pattern, int cflags, const char *string, int eflags, if (strstr (pattern, "[:xdigit:]")) return 0; + /* XXX: regex ATM handles only single byte equivalence classes. */ + if (strstr (pattern, "[[=b=]]")) + return 0; + for (i = 1; i < 16; ++i) { char *p = letters; - if (i & 1) + if ((i & 1) + && (strchr (pattern, 'a') || strchr (string, 'a') + || strchr (pattern, 'A') || strchr (string, 'A'))) *p++ = 'a', *p++ = 'A'; - if (i & 2) + if ((i & 2) + && (strchr (pattern, 'b') || strchr (string, 'b') + || strchr (pattern, 'B') || strchr (string, 'B'))) *p++ = 'b', *p++ = 'B'; - if (i & 4) + if ((i & 4) + && (strchr (pattern, 'c') || strchr (string, 'c') + || strchr (pattern, 'C') || strchr (string, 'C'))) *p++ = 'c', *p++ = 'C'; - if (i & 8) + if ((i & 8) + && (strchr (pattern, 'd') || strchr (string, 'd') + || strchr (pattern, 'D') || strchr (string, 'D'))) *p++ = 'd', *p++ = 'D'; *p++ = '\0'; sprintf (fail, "UTF-8 %s FAIL", letters); @@ -489,7 +501,11 @@ main (int argc, char **argv) replace_special_chars (matches); } - setlocale (LC_ALL, "C"); + if (setlocale (LC_ALL, "C") == NULL) + { + puts ("setlocale C failed"); + ret = 1; + } if (test (pattern, cflags, string, eflags, expect, matches, "FAIL") || (try_bre_ere && test (pattern, cflags & ~REG_EXTENDED, string, eflags, @@ -497,12 +513,16 @@ main (int argc, char **argv) ret = 1; else if (test_utf8) { - setlocale (LC_ALL, "cs_CZ.UTF-8"); - if (test (pattern, cflags, string, eflags, expect, matches, - "UTF-8 FAIL") - || (try_bre_ere - && test (pattern, cflags & ~REG_EXTENDED, string, eflags, - expect, matches, "UTF-8 FAIL"))) + if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL) + { + puts ("setlocale cs_CZ.UTF-8 failed"); + ret = 1; + } + else if (test (pattern, cflags, string, eflags, expect, matches, + "UTF-8 FAIL") + || (try_bre_ere + && test (pattern, cflags & ~REG_EXTENDED, string, + eflags, expect, matches, "UTF-8 FAIL"))) ret = 1; else if (mb_tests (pattern, cflags, string, eflags, expect, matches) || (try_bre_ere