2003-11-20  Ulrich Drepper  <drepper@redhat.com>

	* posix/PTESTS: Fix first test in GA143.

2003-11-20  Jakub Jelinek  <jakub@redhat.com>

	* posix/regex_internal.h (re_dfastate_t): Remove trtable_search.
	Add word_trtable.
	* posix/regex_internal.c (create_newstate_common, free_state):
	Don't free trtable_search.
	* posix/regexec.c (check_matching): Remove fl_search argument.
	(transit_state_sb): Likewise.  #ifdef out as unused.
	(build_trtable): Remove fl_search argument.  Set state->word_trtable
	and state->trtable.  Build separate word and non-word tables if
	multi-byte and they differ for some character.
	(transit_state): Remove fl_search argument.  Don't update
	state->trtable here.  Handle state->word_trtable.
	#ifdef out unused call to transit_state_sb.
	(re_search_internal): Update check_matching caller.
	(group_nodes_into_DFAstates): Don't clear non-ascii chars in accepts
	bitmask for multi-byte locales.
	* posix/bug-regex19.c (tests): Enable some commented out tests, add
	2 new tests.
	* posix/tst-rxspencer.c (mb_tests): Don't test [[=b=]] for now as
	multi-byte.  Don't run identical multi-byte tests multiple times
	unnecessarily.
	(main): Check setlocale return value.
	* posix/Makefile (tst-rxspencer-ARGS): Add --utf8 argument.
	(tst-rxspencer-ENV): Remove MALLOC_TRACE, add LOCPATH.
	($(objpfx)tst-rxspencer-mem): Run another tst-rxspencer test
	here, without --utf8 argument but with MALLOC_TRACE.
This commit is contained in:
Ulrich Drepper 2003-11-20 23:36:40 +00:00
parent beac34a2ad
commit c13c99fa92
10 changed files with 176 additions and 137 deletions

View File

@ -1,3 +1,35 @@
2003-11-20 Ulrich Drepper <drepper@redhat.com>
* posix/PTESTS: Fix first test in GA143.
2003-11-20 Jakub Jelinek <jakub@redhat.com>
* posix/regex_internal.h (re_dfastate_t): Remove trtable_search.
Add word_trtable.
* posix/regex_internal.c (create_newstate_common, free_state):
Don't free trtable_search.
* posix/regexec.c (check_matching): Remove fl_search argument.
(transit_state_sb): Likewise. #ifdef out as unused.
(build_trtable): Remove fl_search argument. Set state->word_trtable
and state->trtable. Build separate word and non-word tables if
multi-byte and they differ for some character.
(transit_state): Remove fl_search argument. Don't update
state->trtable here. Handle state->word_trtable.
#ifdef out unused call to transit_state_sb.
(re_search_internal): Update check_matching caller.
(group_nodes_into_DFAstates): Don't clear non-ascii chars in accepts
bitmask for multi-byte locales.
* posix/bug-regex19.c (tests): Enable some commented out tests, add
2 new tests.
* posix/tst-rxspencer.c (mb_tests): Don't test [[=b=]] for now as
multi-byte. Don't run identical multi-byte tests multiple times
unnecessarily.
(main): Check setlocale return value.
* posix/Makefile (tst-rxspencer-ARGS): Add --utf8 argument.
(tst-rxspencer-ENV): Remove MALLOC_TRACE, add LOCPATH.
($(objpfx)tst-rxspencer-mem): Run another tst-rxspencer test
here, without --utf8 argument but with MALLOC_TRACE.
2003-11-19 Jakub Jelinek <jakub@redhat.com> 2003-11-19 Jakub Jelinek <jakub@redhat.com>
* posix/regexec.c (extend_buffers): Don't allocate * posix/regexec.c (extend_buffers): Don't allocate

View File

@ -1,3 +1,7 @@
2003-11-20 Jakub Jelinek <jakub@redhat.com>
* Makefile (LOCALES): Add cs_CZ.UTF-8.
2003-11-15 Ulrich Drepper <drepper@redhat.com> 2003-11-15 Ulrich Drepper <drepper@redhat.com>
* Makefile (tst-leaks-ENV): Add LOCPATH. * Makefile (tst-leaks-ENV): Add LOCPATH.

View File

@ -132,7 +132,7 @@ LOCALES := de_DE.ISO-8859-1 de_DE.UTF-8 en_US.ANSI_X3.4-1968 \
en_US.ISO-8859-1 ja_JP.EUC-JP da_DK.ISO-8859-1 \ en_US.ISO-8859-1 ja_JP.EUC-JP da_DK.ISO-8859-1 \
hr_HR.ISO-8859-2 sv_SE.ISO-8859-1 ja_JP.SJIS fr_FR.ISO-8859-1 \ hr_HR.ISO-8859-2 sv_SE.ISO-8859-1 ja_JP.SJIS fr_FR.ISO-8859-1 \
vi_VN.TCVN5712-1 nb_NO.ISO-8859-1 nn_NO.ISO-8859-1 \ vi_VN.TCVN5712-1 nb_NO.ISO-8859-1 nn_NO.ISO-8859-1 \
tr_TR.UTF-8 tr_TR.UTF-8 cs_CZ.UTF-8
LOCALE_SRCS := $(shell echo "$(LOCALES)"|sed 's/\([^ .]*\)[^ ]*/\1/g') LOCALE_SRCS := $(shell echo "$(LOCALES)"|sed 's/\([^ .]*\)[^ ]*/\1/g')
CHARMAPS := $(shell echo "$(LOCALES)" | \ CHARMAPS := $(shell echo "$(LOCALES)" | \
sed -e 's/[^ .]*[.]\([^ ]*\)/\1/g' -e s/SJIS/SHIFT_JIS/g) sed -e 's/[^ .]*[.]\([^ ]*\)/\1/g' -e s/SJIS/SHIFT_JIS/g)

View File

@ -148,7 +148,6 @@ tst-exec-ARGS = -- $(built-program-cmd)
tst-spawn-ARGS = -- $(built-program-cmd) tst-spawn-ARGS = -- $(built-program-cmd)
tst-dir-ARGS = `pwd` `cd $(common-objdir)/$(subdir); pwd` `cd $(common-objdir); pwd` $(objpfx)tst-dir tst-dir-ARGS = `pwd` `cd $(common-objdir)/$(subdir); pwd` `cd $(common-objdir); pwd` $(objpfx)tst-dir
tst-chmod-ARGS = `pwd` tst-chmod-ARGS = `pwd`
tst-rxspencer-ARGS = rxspencer/tests
tst-fnmatch-ENV = LOCPATH=$(common-objpfx)localedata tst-fnmatch-ENV = LOCPATH=$(common-objpfx)localedata
tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata
@ -160,6 +159,8 @@ bug-regex17-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex18-ENV = LOCPATH=$(common-objpfx)localedata bug-regex18-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex19-ENV = LOCPATH=$(common-objpfx)localedata bug-regex19-ENV = LOCPATH=$(common-objpfx)localedata
bug-regex20-ENV = LOCPATH=$(common-objpfx)localedata bug-regex20-ENV = LOCPATH=$(common-objpfx)localedata
tst-rxspencer-ARGS = --utf8 rxspencer/tests
tst-rxspencer-ENV = LOCPATH=$(common-objpfx)localedata
testcases.h: TESTS TESTS2C.sed testcases.h: TESTS TESTS2C.sed
sed -f TESTS2C.sed < $< > $@T sed -f TESTS2C.sed < $< > $@T
@ -207,9 +208,13 @@ bug-regex21-ENV = MALLOC_TRACE=$(objpfx)bug-regex21.mtrace
$(objpfx)bug-regex21-mem: $(objpfx)bug-regex21.out $(objpfx)bug-regex21-mem: $(objpfx)bug-regex21.out
$(common-objpfx)malloc/mtrace $(objpfx)bug-regex21.mtrace > $@ $(common-objpfx)malloc/mtrace $(objpfx)bug-regex21.mtrace > $@
tst-rxspencer-ENV = MALLOC_TRACE=$(objpfx)tst-rxspencer.mtrace # tst-rxspencer.mtrace is generated only when run without --utf8
# option, since otherwise the file is almost 100M in size and takes a very
# long time to process.
$(objpfx)tst-rxspencer-mem: $(objpfx)tst-rxspencer.out $(objpfx)tst-rxspencer-mem: $(objpfx)tst-rxspencer.out
MALLOC_TRACE=$(objpfx)tst-rxspencer.mtrace $(tst-rxspencer-ENV) \
$(run-program-prefix) $(objpfx)tst-rxspencer rxspencer/tests \
> /dev/null
$(common-objpfx)malloc/mtrace $(objpfx)tst-rxspencer.mtrace > $@ $(common-objpfx)malloc/mtrace $(objpfx)tst-rxspencer.mtrace > $@
$(objpfx)tst-getconf.out: tst-getconf.sh $(objpfx)getconf $(objpfx)tst-getconf.out: tst-getconf.sh $(objpfx)getconf

View File

@ -270,7 +270,7 @@
1¦63¦a\{1,63\}¦aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa¦ 1¦63¦a\{1,63\}¦aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa¦
# 2.8.3.4 BRE Precedence # 2.8.3.4 BRE Precedence
# GA143 # GA143
2¦20¦\^\[[[.].]]\\(\\1\\)\\*\\{1,2\\}\$¦a^[]\(1\)\*\{1,2\}$b¦ 2¦20¦\^\[[[.].]]\\(\\1\\)\*\\{1,2\\}\$¦a^[]\(1\)*\{1,2\}$b¦
1¦6¦[[=*=]][[=\=]][[=]=]][[===]][[...]][[:punct:]]¦*\]=.;¦ 1¦6¦[[=*=]][[=\=]][[=]=]][[===]][[...]][[:punct:]]¦*\]=.;¦
1¦6¦[$\(*\)^]*¦$\()*^¦ 1¦6¦[$\(*\)^]*¦$\()*^¦
1¦1¦[\1]¦1¦ 1¦1¦[\1]¦1¦

View File

@ -37,17 +37,21 @@ static struct
\xc3\x96 LATIN CAPITAL LETTER O WITH DIAERESIS \xc3\x96 LATIN CAPITAL LETTER O WITH DIAERESIS
\xe2\x80\x94 EM DASH */ \xe2\x80\x94 EM DASH */
/* Should not match. */ /* Should not match. */
{RE_SYNTAX_POSIX_BASIC, "\\<A", "aOAA", 0, -1},
{RE_SYNTAX_POSIX_BASIC, "\\<A", "aOAA", 2, -1}, {RE_SYNTAX_POSIX_BASIC, "\\<A", "aOAA", 2, -1},
{RE_SYNTAX_POSIX_BASIC, "A\\>", "aAAO", 1, -1}, {RE_SYNTAX_POSIX_BASIC, "A\\>", "aAAO", 1, -1},
{RE_SYNTAX_POSIX_BASIC, "\\bA", "aOAA", 0, -1},
{RE_SYNTAX_POSIX_BASIC, "\\bA", "aOAA", 2, -1}, {RE_SYNTAX_POSIX_BASIC, "\\bA", "aOAA", 2, -1},
{RE_SYNTAX_POSIX_BASIC, "A\\b", "aAAO", 1, -1}, {RE_SYNTAX_POSIX_BASIC, "A\\b", "aAAO", 1, -1},
{RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 0, -1},
{RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 3, -1}, {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 3, -1},
{RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\>", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1}, {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\>", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1},
#if 0 #if 0
/* XXX Not used since they fail so far. */ /* XXX these 2 tests still fail. */
{RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 0, -1},
{RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 3, -1}, {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xc3\x96\xc3\x84\xc3\x84", 3, -1},
{RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1},
#endif #endif
{RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xc3\x96", 1, -1},
/* Should match. */ /* Should match. */
{RE_SYNTAX_POSIX_BASIC, "\\<A", "AA", 0, 0}, {RE_SYNTAX_POSIX_BASIC, "\\<A", "AA", 0, 0},
{RE_SYNTAX_POSIX_BASIC, "\\<A", "a-AA", 2, 2}, {RE_SYNTAX_POSIX_BASIC, "\\<A", "a-AA", 2, 2},
@ -57,8 +61,6 @@ static struct
{RE_SYNTAX_POSIX_BASIC, "\\bA", "a-AA", 2, 2}, {RE_SYNTAX_POSIX_BASIC, "\\bA", "a-AA", 2, 2},
{RE_SYNTAX_POSIX_BASIC, "A\\b", "aAA-", 1, 2}, {RE_SYNTAX_POSIX_BASIC, "A\\b", "aAA-", 1, 2},
{RE_SYNTAX_POSIX_BASIC, "A\\b", "aAA", 1, 2}, {RE_SYNTAX_POSIX_BASIC, "A\\b", "aAA", 1, 2},
#if 0
/* XXX Not used since they fail so far. */
{RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "\xc3\x84\xc3\x84", 0, 0}, {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "\xc3\x84\xc3\x84", 0, 0},
{RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xe2\x80\x94\xc3\x84\xc3\x84", 4, 4}, {RE_SYNTAX_POSIX_BASIC, "\\<\xc3\x84", "a\xe2\x80\x94\xc3\x84\xc3\x84", 4, 4},
{RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\>", "a\xc3\x84\xc3\x84\xe2\x80\x94", 1, 3}, {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\>", "a\xc3\x84\xc3\x84\xe2\x80\x94", 1, 3},
@ -67,7 +69,6 @@ static struct
{RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xe2\x80\x94\xc3\x84\xc3\x84", 4, 4}, {RE_SYNTAX_POSIX_BASIC, "\\b\xc3\x84", "a\xe2\x80\x94\xc3\x84\xc3\x84", 4, 4},
{RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xe2\x80\x94", 1, 3}, {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84\xe2\x80\x94", 1, 3},
{RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84", 1, 3} {RE_SYNTAX_POSIX_BASIC, "\xc3\x84\\b", "a\xc3\x84\xc3\x84", 1, 3}
#endif
}; };
int int

View File

@ -1207,7 +1207,6 @@ create_newstate_common (dfa, nodes, hash)
return NULL; return NULL;
} }
newstate->trtable = NULL; newstate->trtable = NULL;
newstate->trtable_search = NULL;
newstate->hash = hash; newstate->hash = hash;
return newstate; return newstate;
} }
@ -1369,6 +1368,5 @@ free_state (state)
} }
re_node_set_free (&state->nodes); re_node_set_free (&state->nodes);
re_free (state->trtable); re_free (state->trtable);
re_free (state->trtable_search);
re_free (state); re_free (state);
} }

View File

@ -456,7 +456,6 @@ struct re_dfastate_t
re_node_set nodes; re_node_set nodes;
re_node_set *entrance_nodes; re_node_set *entrance_nodes;
struct re_dfastate_t **trtable; struct re_dfastate_t **trtable;
struct re_dfastate_t **trtable_search;
/* If this state is a special state. /* If this state is a special state.
A state is a special state if the state is the halt state, or A state is a special state if the state is the halt state, or
an anchor. */ an anchor. */
@ -469,6 +468,7 @@ struct re_dfastate_t
/* If this state has backreference node(s). */ /* If this state has backreference node(s). */
unsigned int has_backref : 1; unsigned int has_backref : 1;
unsigned int has_constraint : 1; unsigned int has_constraint : 1;
unsigned int word_trtable : 1;
}; };
typedef struct re_dfastate_t re_dfastate_t; typedef struct re_dfastate_t re_dfastate_t;

View File

@ -57,7 +57,7 @@ static re_dfastate_t *acquire_init_state_context (reg_errcode_t *err,
static reg_errcode_t prune_impossible_nodes (const regex_t *preg, static reg_errcode_t prune_impossible_nodes (const regex_t *preg,
re_match_context_t *mctx); re_match_context_t *mctx);
static int check_matching (const regex_t *preg, re_match_context_t *mctx, static int check_matching (const regex_t *preg, re_match_context_t *mctx,
int fl_search, int fl_longest_match); int fl_longest_match);
static int check_halt_node_context (const re_dfa_t *dfa, int node, static int check_halt_node_context (const re_dfa_t *dfa, int node,
unsigned int context); unsigned int context);
static int check_halt_state_context (const regex_t *preg, static int check_halt_state_context (const regex_t *preg,
@ -123,15 +123,16 @@ static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
re_dfastate_t **src, int num); re_dfastate_t **src, int num);
static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg, static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg,
re_match_context_t *mctx, re_match_context_t *mctx,
re_dfastate_t *state, int fl_search); re_dfastate_t *state);
static reg_errcode_t check_subexp_matching_top (re_dfa_t *dfa, static reg_errcode_t check_subexp_matching_top (re_dfa_t *dfa,
re_match_context_t *mctx, re_match_context_t *mctx,
re_node_set *cur_nodes, re_node_set *cur_nodes,
int str_idx); int str_idx);
#if 0
static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg, static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg,
re_dfastate_t *pstate, re_dfastate_t *pstate,
int fl_search,
re_match_context_t *mctx); re_match_context_t *mctx);
#endif
#ifdef RE_ENABLE_I18N #ifdef RE_ENABLE_I18N
static reg_errcode_t transit_state_mb (const regex_t *preg, static reg_errcode_t transit_state_mb (const regex_t *preg,
re_dfastate_t *pstate, re_dfastate_t *pstate,
@ -173,8 +174,7 @@ static reg_errcode_t expand_bkref_cache (const regex_t *preg,
int last_str, int subexp_num, int last_str, int subexp_num,
int fl_open); int fl_open);
static re_dfastate_t **build_trtable (const regex_t *dfa, static re_dfastate_t **build_trtable (const regex_t *dfa,
const re_dfastate_t *state, re_dfastate_t *state);
int fl_search);
#ifdef RE_ENABLE_I18N #ifdef RE_ENABLE_I18N
static int check_node_accept_bytes (const regex_t *preg, int node_idx, static int check_node_accept_bytes (const regex_t *preg, int node_idx,
const re_string_t *input, int idx); const re_string_t *input, int idx);
@ -741,7 +741,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
/* It seems to be appropriate one, then use the matcher. */ /* It seems to be appropriate one, then use the matcher. */
/* We assume that the matching starts from 0. */ /* We assume that the matching starts from 0. */
mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0; mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
match_last = check_matching (preg, &mctx, 0, fl_longest_match); match_last = check_matching (preg, &mctx, fl_longest_match);
if (match_last != -1) if (match_last != -1)
{ {
if (BE (match_last == -2, 0)) if (BE (match_last == -2, 0))
@ -919,8 +919,8 @@ acquire_init_state_context (err, preg, mctx, idx)
if (dfa->init_state->has_constraint) if (dfa->init_state->has_constraint)
{ {
unsigned int context; unsigned int context;
context = re_string_context_at (mctx->input, idx - 1, mctx->eflags, context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
preg->newline_anchor); preg->newline_anchor);
if (IS_WORD_CONTEXT (context)) if (IS_WORD_CONTEXT (context))
return dfa->init_state_word; return dfa->init_state_word;
else if (IS_ORDINARY_CONTEXT (context)) else if (IS_ORDINARY_CONTEXT (context))
@ -947,16 +947,15 @@ acquire_init_state_context (err, preg, mctx, idx)
/* Check whether the regular expression matches the input string INPUT or not, /* Check whether the regular expression matches the input string INPUT or not,
and return the index where the match ends; return -1 if there is no match, and return the index where the match ends; return -1 if there is no match,
or return -2 in case of an error. or return -2 in case of an error.
FL_SEARCH means we must search where the matching starts,
FL_LONGEST_MATCH means we want the POSIX longest matching. FL_LONGEST_MATCH means we want the POSIX longest matching.
Note that the matcher assumes that the matching starts from the current Note that the matcher assumes that the matching starts from the current
index of the buffer. */ index of the buffer. */
static int static int
check_matching (preg, mctx, fl_search, fl_longest_match) check_matching (preg, mctx, fl_longest_match)
const regex_t *preg; const regex_t *preg;
re_match_context_t *mctx; re_match_context_t *mctx;
int fl_search, fl_longest_match; int fl_longest_match;
{ {
re_dfa_t *dfa = (re_dfa_t *) preg->buffer; re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
reg_errcode_t err; reg_errcode_t err;
@ -1006,31 +1005,15 @@ check_matching (preg, mctx, fl_search, fl_longest_match)
while (!re_string_eoi (mctx->input)) while (!re_string_eoi (mctx->input))
{ {
cur_state = transit_state (&err, preg, mctx, cur_state, cur_state = transit_state (&err, preg, mctx, cur_state);
fl_search && !match);
if (cur_state == NULL) /* Reached at the invalid state or an error. */ if (cur_state == NULL) /* Reached at the invalid state or an error. */
{ {
cur_str_idx = re_string_cur_idx (mctx->input); cur_str_idx = re_string_cur_idx (mctx->input);
if (BE (err != REG_NOERROR, 0)) if (BE (err != REG_NOERROR, 0))
return -2; return -2;
if (fl_search && !match) if (!fl_longest_match && match)
{
/* Restart from initial state, since we are searching
the point from where matching start. */
#ifdef RE_ENABLE_I18N
if (dfa->mb_cur_max == 1
|| re_string_first_byte (mctx->input, cur_str_idx))
#endif /* RE_ENABLE_I18N */
cur_state = acquire_init_state_context (&err, preg, mctx,
cur_str_idx);
if (BE (cur_state == NULL && err != REG_NOERROR, 0))
return -2;
if (mctx->state_log != NULL)
mctx->state_log[cur_str_idx] = cur_state;
}
else if (!fl_longest_match && match)
break; break;
else /* (fl_longest_match && match) || (!fl_search && !match) */ else
{ {
if (mctx->state_log == NULL) if (mctx->state_log == NULL)
break; break;
@ -2069,12 +2052,11 @@ sift_states_iter_mb (preg, mctx, sctx, node_idx, str_idx, max_str_idx)
update the destination of STATE_LOG. */ update the destination of STATE_LOG. */
static re_dfastate_t * static re_dfastate_t *
transit_state (err, preg, mctx, state, fl_search) transit_state (err, preg, mctx, state)
reg_errcode_t *err; reg_errcode_t *err;
const regex_t *preg; const regex_t *preg;
re_match_context_t *mctx; re_match_context_t *mctx;
re_dfastate_t *state; re_dfastate_t *state;
int fl_search;
{ {
re_dfa_t *dfa = (re_dfa_t *) preg->buffer; re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
re_dfastate_t **trtable, *next_state; re_dfastate_t **trtable, *next_state;
@ -2113,24 +2095,40 @@ transit_state (err, preg, mctx, state, fl_search)
{ {
/* Use transition table */ /* Use transition table */
ch = re_string_fetch_byte (mctx->input); ch = re_string_fetch_byte (mctx->input);
trtable = fl_search ? state->trtable_search : state->trtable; trtable = state->trtable;
if (trtable == NULL) if (trtable == NULL)
{ {
trtable = build_trtable (preg, state, fl_search); trtable = build_trtable (preg, state);
if (fl_search) if (trtable == NULL)
state->trtable_search = trtable; {
else *err = REG_ESPACE;
state->trtable = trtable; return NULL;
}
} }
next_state = trtable[ch]; if (BE (state->word_trtable, 0))
{
unsigned int context;
context
= re_string_context_at (mctx->input,
re_string_cur_idx (mctx->input) - 1,
mctx->eflags, preg->newline_anchor);
if (IS_WORD_CONTEXT (context))
next_state = trtable[ch + SBC_MAX];
else
next_state = trtable[ch];
}
else
next_state = trtable[ch];
} }
#if 0
else else
{ {
/* don't use transition table */ /* don't use transition table */
next_state = transit_state_sb (err, preg, state, fl_search, mctx); next_state = transit_state_sb (err, preg, state, mctx);
if (BE (next_state == NULL && err != REG_NOERROR, 0)) if (BE (next_state == NULL && err != REG_NOERROR, 0))
return NULL; return NULL;
} }
#endif
} }
cur_idx = re_string_cur_idx (mctx->input); cur_idx = re_string_cur_idx (mctx->input);
@ -2242,15 +2240,15 @@ check_subexp_matching_top (dfa, mctx, cur_nodes, str_idx)
return REG_NOERROR; return REG_NOERROR;
} }
#if 0
/* Return the next state to which the current state STATE will transit by /* Return the next state to which the current state STATE will transit by
accepting the current input byte. */ accepting the current input byte. */
static re_dfastate_t * static re_dfastate_t *
transit_state_sb (err, preg, state, fl_search, mctx) transit_state_sb (err, preg, state, mctx)
reg_errcode_t *err; reg_errcode_t *err;
const regex_t *preg; const regex_t *preg;
re_dfastate_t *state; re_dfastate_t *state;
int fl_search;
re_match_context_t *mctx; re_match_context_t *mctx;
{ {
re_dfa_t *dfa = (re_dfa_t *) preg->buffer; re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
@ -2276,29 +2274,6 @@ transit_state_sb (err, preg, state, fl_search, mctx)
} }
} }
} }
if (fl_search)
{
#ifdef RE_ENABLE_I18N
int not_initial = 0;
if (dfa->mb_cur_max > 1)
for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt)
if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER)
{
not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial;
break;
}
if (!not_initial)
#endif
{
*err = re_node_set_merge (&next_nodes,
dfa->init_state->entrance_nodes);
if (BE (*err != REG_NOERROR, 0))
{
re_node_set_free (&next_nodes);
return NULL;
}
}
}
context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags, context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
preg->newline_anchor); preg->newline_anchor);
next_state = re_acquire_state_context (err, dfa, &next_nodes, context); next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
@ -2309,6 +2284,7 @@ transit_state_sb (err, preg, state, fl_search, mctx)
re_string_skip_bytes (mctx->input, 1); re_string_skip_bytes (mctx->input, 1);
return next_state; return next_state;
} }
#endif
#ifdef RE_ENABLE_I18N #ifdef RE_ENABLE_I18N
static reg_errcode_t static reg_errcode_t
@ -3117,10 +3093,9 @@ expand_bkref_cache (preg, mctx, cur_nodes, cur_str, last_str, subexp_num,
Return the new table if succeeded, otherwise return NULL. */ Return the new table if succeeded, otherwise return NULL. */
static re_dfastate_t ** static re_dfastate_t **
build_trtable (preg, state, fl_search) build_trtable (preg, state)
const regex_t *preg; const regex_t *preg;
const re_dfastate_t *state; re_dfastate_t *state;
int fl_search;
{ {
reg_errcode_t err; reg_errcode_t err;
re_dfa_t *dfa = (re_dfa_t *) preg->buffer; re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
@ -3154,6 +3129,7 @@ build_trtable (preg, state, fl_search)
/* Initialize transition table. */ /* Initialize transition table. */
trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
state->word_trtable = 0;
if (BE (trtable == NULL, 0)) if (BE (trtable == NULL, 0))
{ {
if (dests_node_malloced) if (dests_node_malloced)
@ -3170,7 +3146,10 @@ build_trtable (preg, state, fl_search)
free (dests_node); free (dests_node);
/* Return NULL in case of an error, trtable otherwise. */ /* Return NULL in case of an error, trtable otherwise. */
if (ndests == 0) if (ndests == 0)
return trtable; {
state->trtable = trtable;
return trtable;
}
free (trtable); free (trtable);
return NULL; return NULL;
} }
@ -3224,26 +3203,6 @@ out_free:
goto out_free; goto out_free;
} }
} }
/* If search flag is set, merge the initial state. */
if (fl_search)
{
#ifdef RE_ENABLE_I18N
int not_initial = 0;
for (j = 0; j < follows.nelem; ++j)
if (dfa->nodes[follows.elems[j]].type == CHARACTER)
{
not_initial = dfa->nodes[follows.elems[j]].mb_partial;
break;
}
if (!not_initial)
#endif
{
err = re_node_set_merge (&follows,
dfa->init_state->entrance_nodes);
if (BE (err != REG_NOERROR, 0))
goto out_free;
}
}
dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0); dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0)) if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
goto out_free; goto out_free;
@ -3274,31 +3233,41 @@ out_free:
for (j = 0; j < UINT_BITS; ++j, ++ch) for (j = 0; j < UINT_BITS; ++j, ++ch)
if ((acceptable[i] >> j) & 1) if ((acceptable[i] >> j) & 1)
{ {
/* The current state accepts the character ch. */ for (k = 0; k < ndests; ++k)
if (IS_WORD_CHAR (ch)) if ((dests_ch[k][i] >> j) & 1)
{ {
for (k = 0; k < ndests; ++k) /* k-th destination accepts the word character ch. */
if ((dests_ch[k][i] >> j) & 1) if (state->word_trtable)
{ {
/* k-th destination accepts the word character ch. */
trtable[ch] = dest_states_word[k];
/* There must be only one destination which accepts
character ch. See group_nodes_into_DFAstates. */
break;
}
}
else /* not WORD_CHAR */
{
for (k = 0; k < ndests; ++k)
if ((dests_ch[k][i] >> j) & 1)
{
/* k-th destination accepts the non-word character ch. */
trtable[ch] = dest_states[k]; trtable[ch] = dest_states[k];
/* There must be only one destination which accepts trtable[ch + SBC_MAX] = dest_states_word[k];
character ch. See group_nodes_into_DFAstates. */
break;
} }
} else if (dfa->mb_cur_max > 1
&& dest_states[k] != dest_states_word[k])
{
re_dfastate_t **new_trtable;
new_trtable = (re_dfastate_t **)
realloc (trtable,
sizeof (re_dfastate_t *)
* 2 * SBC_MAX);
if (BE (new_trtable == NULL, 0))
goto out_free;
memcpy (new_trtable + SBC_MAX, new_trtable,
sizeof (re_dfastate_t *) * SBC_MAX);
trtable = new_trtable;
state->word_trtable = 1;
trtable[ch] = dest_states[k];
trtable[ch + SBC_MAX] = dest_states_word[k];
}
else if (IS_WORD_CHAR (ch))
trtable[ch] = dest_states_word[k];
else
trtable[ch] = dest_states[k];
/* There must be only one destination which accepts
character ch. See group_nodes_into_DFAstates. */
break;
}
} }
/* new line */ /* new line */
if (bitset_contain (acceptable, NEWLINE_CHAR)) if (bitset_contain (acceptable, NEWLINE_CHAR))
@ -3309,6 +3278,8 @@ out_free:
{ {
/* k-th destination accepts newline character. */ /* k-th destination accepts newline character. */
trtable[NEWLINE_CHAR] = dest_states_nl[k]; trtable[NEWLINE_CHAR] = dest_states_nl[k];
if (state->word_trtable)
trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[k];
/* There must be only one destination which accepts /* There must be only one destination which accepts
newline. See group_nodes_into_DFAstates. */ newline. See group_nodes_into_DFAstates. */
break; break;
@ -3325,6 +3296,7 @@ out_free:
if (dests_node_malloced) if (dests_node_malloced)
free (dests_node); free (dests_node);
state->trtable = trtable;
return trtable; return trtable;
} }
@ -3386,6 +3358,8 @@ group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
match it the context. */ match it the context. */
if (constraint) if (constraint)
{ {
int word_char_max;
if (constraint & NEXT_NEWLINE_CONSTRAINT) if (constraint & NEXT_NEWLINE_CONSTRAINT)
{ {
int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
@ -3400,11 +3374,16 @@ group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
bitset_empty (accepts); bitset_empty (accepts);
continue; continue;
} }
/* This assumes an ASCII-compatible locale. We cannot say
anything about the non-ASCII chars. */
word_char_max
= dfa->mb_cur_max > 1 ? BITSET_UINTS / 2 : BITSET_UINTS;
if (constraint & NEXT_WORD_CONSTRAINT) if (constraint & NEXT_WORD_CONSTRAINT)
for (j = 0; j < BITSET_UINTS; ++j) for (j = 0; j < word_char_max; ++j)
accepts[j] &= dfa->word_char[j]; accepts[j] &= dfa->word_char[j];
if (constraint & NEXT_NOTWORD_CONSTRAINT) if (constraint & NEXT_NOTWORD_CONSTRAINT)
for (j = 0; j < BITSET_UINTS; ++j) for (j = 0; j < word_char_max; ++j)
accepts[j] &= ~dfa->word_char[j]; accepts[j] &= ~dfa->word_char[j];
} }

View File

@ -350,16 +350,28 @@ mb_tests (const char *pattern, int cflags, const char *string, int eflags,
if (strstr (pattern, "[:xdigit:]")) if (strstr (pattern, "[:xdigit:]"))
return 0; return 0;
/* XXX: regex ATM handles only single byte equivalence classes. */
if (strstr (pattern, "[[=b=]]"))
return 0;
for (i = 1; i < 16; ++i) for (i = 1; i < 16; ++i)
{ {
char *p = letters; char *p = letters;
if (i & 1) if ((i & 1)
&& (strchr (pattern, 'a') || strchr (string, 'a')
|| strchr (pattern, 'A') || strchr (string, 'A')))
*p++ = 'a', *p++ = 'A'; *p++ = 'a', *p++ = 'A';
if (i & 2) if ((i & 2)
&& (strchr (pattern, 'b') || strchr (string, 'b')
|| strchr (pattern, 'B') || strchr (string, 'B')))
*p++ = 'b', *p++ = 'B'; *p++ = 'b', *p++ = 'B';
if (i & 4) if ((i & 4)
&& (strchr (pattern, 'c') || strchr (string, 'c')
|| strchr (pattern, 'C') || strchr (string, 'C')))
*p++ = 'c', *p++ = 'C'; *p++ = 'c', *p++ = 'C';
if (i & 8) if ((i & 8)
&& (strchr (pattern, 'd') || strchr (string, 'd')
|| strchr (pattern, 'D') || strchr (string, 'D')))
*p++ = 'd', *p++ = 'D'; *p++ = 'd', *p++ = 'D';
*p++ = '\0'; *p++ = '\0';
sprintf (fail, "UTF-8 %s FAIL", letters); sprintf (fail, "UTF-8 %s FAIL", letters);
@ -489,7 +501,11 @@ main (int argc, char **argv)
replace_special_chars (matches); replace_special_chars (matches);
} }
setlocale (LC_ALL, "C"); if (setlocale (LC_ALL, "C") == NULL)
{
puts ("setlocale C failed");
ret = 1;
}
if (test (pattern, cflags, string, eflags, expect, matches, "FAIL") if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
|| (try_bre_ere || (try_bre_ere
&& test (pattern, cflags & ~REG_EXTENDED, string, eflags, && test (pattern, cflags & ~REG_EXTENDED, string, eflags,
@ -497,12 +513,16 @@ main (int argc, char **argv)
ret = 1; ret = 1;
else if (test_utf8) else if (test_utf8)
{ {
setlocale (LC_ALL, "cs_CZ.UTF-8"); if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL)
if (test (pattern, cflags, string, eflags, expect, matches, {
"UTF-8 FAIL") puts ("setlocale cs_CZ.UTF-8 failed");
|| (try_bre_ere ret = 1;
&& test (pattern, cflags & ~REG_EXTENDED, string, eflags, }
expect, matches, "UTF-8 FAIL"))) else if (test (pattern, cflags, string, eflags, expect, matches,
"UTF-8 FAIL")
|| (try_bre_ere
&& test (pattern, cflags & ~REG_EXTENDED, string,
eflags, expect, matches, "UTF-8 FAIL")))
ret = 1; ret = 1;
else if (mb_tests (pattern, cflags, string, eflags, expect, matches) else if (mb_tests (pattern, cflags, string, eflags, expect, matches)
|| (try_bre_ere || (try_bre_ere