2004-12-07  Paolo Bonzini  <bonzini@gnu.org>

	* posix/regexec.c (proceed_next_node): Simplify treatment of epsilon
	nodes.  Pass the pushed node to push_fail_stack.
	(push_fail_stack): Accept a single node rather than an array
	of two epsilon destinations.
	(build_sifted_states): Only walk non-epsilon nodes.
	(check_arrival): Don't pass epsilon nodes to
	check_arrival_add_next_nodes.
	(check_arrival_add_next_nodes) [DEBUG]: Abort if an epsilon node is
	found.
	(check_node_accept): Do expensive checks later.
	(add_epsilon_src_nodes): Cache result of merging the inveclosures.
	* posix/regex_internal.h (re_dfastate_t): Add non_eps_nodes and
	inveclosure.
	(re_string_elem_size_at, re_string_char_size_at, re_string_wchar_at,
	re_string_context_at, re_string_peek_byte_case,
	re_string_fetch_byte_case, re_node_set_compare, re_node_set_contains):
	Declare as pure.
	* posix/regex_internal.c (create_newstate_common): Remove.
	(register_state): Move part of it here.  Initialize non_eps_nodes.
	(free_state): Free inveclosure and non_eps_nodes.
	(create_cd_newstate, create_ci_newstate): Allocate the new
	re_dfastate_t here.
This commit is contained in:
Ulrich Drepper 2004-12-10 04:37:58 +00:00
parent dc165f7b0b
commit 5cf1ec5256
4 changed files with 163 additions and 106 deletions

View File

@ -1,3 +1,28 @@
2004-12-07 Paolo Bonzini <bonzini@gnu.org>
* posix/regexec.c (proceed_next_node): Simplify treatment of epsilon
nodes. Pass the pushed node to push_fail_stack.
(push_fail_stack): Accept a single node rather than an array
of two epsilon destinations.
(build_sifted_states): Only walk non-epsilon nodes.
(check_arrival): Don't pass epsilon nodes to
check_arrival_add_next_nodes.
(check_arrival_add_next_nodes) [DEBUG]: Abort if an epsilon node is
found.
(check_node_accept): Do expensive checks later.
(add_epsilon_src_nodes): Cache result of merging the inveclosures.
* posix/regex_internal.h (re_dfastate_t): Add non_eps_nodes and
inveclosure.
(re_string_elem_size_at, re_string_char_size_at, re_string_wchar_at,
re_string_context_at, re_string_peek_byte_case,
re_string_fetch_byte_case, re_node_set_compare, re_node_set_contains):
Declare as pure.
* posix/regex_internal.c (create_newstate_common): Remove.
(register_state): Move part of it here. Initialize non_eps_nodes.
(free_state): Free inveclosure and non_eps_nodes.
(create_cd_newstate, create_ci_newstate): Allocate the new
re_dfastate_t here.
2004-12-09 Ulrich Drepper <drepper@redhat.com>
* malloc/malloc.c (public_rEALLOc): Add parameter checks.

View File

@ -26,9 +26,6 @@ static void re_string_construct_common (const char *str, int len,
static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
wint_t *last_wc) internal_function;
#endif /* RE_ENABLE_I18N */
static re_dfastate_t *create_newstate_common (re_dfa_t *dfa,
const re_node_set *nodes,
unsigned int hash) internal_function;
static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
unsigned int hash) internal_function;
static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
@ -1298,7 +1295,7 @@ re_node_set_contains (set, elem)
const re_node_set *set;
int elem;
{
int idx, right, mid;
unsigned int idx, right, mid;
if (set->nelem <= 0)
return 0;
@ -1484,33 +1481,9 @@ re_acquire_state_context (err, dfa, nodes, context)
}
}
/* Allocate memory for DFA state and initialize common properties.
Return the new state if succeeded, otherwise return NULL. */
static re_dfastate_t *
create_newstate_common (dfa, nodes, hash)
re_dfa_t *dfa;
const re_node_set *nodes;
unsigned int hash;
{
re_dfastate_t *newstate;
reg_errcode_t err;
newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
if (BE (newstate == NULL, 0))
return NULL;
err = re_node_set_init_copy (&newstate->nodes, nodes);
if (BE (err != REG_NOERROR, 0))
{
re_free (newstate);
return NULL;
}
newstate->trtable = NULL;
newstate->hash = hash;
return newstate;
}
/* Store the new state NEWSTATE whose hash value is HASH in appropriate
position. Return value indicate the error code if failed. */
/* Finish initialization of the new state NEWSTATE, and using its hash value
HASH put in the appropriate bucket of DFA's state table. Return value
indicates the error code if failed. */
static reg_errcode_t
register_state (dfa, newstate, hash)
@ -1519,8 +1492,21 @@ register_state (dfa, newstate, hash)
unsigned int hash;
{
struct re_state_table_entry *spot;
spot = dfa->state_table + (hash & dfa->state_hash_mask);
reg_errcode_t err;
int i;
newstate->hash = hash;
err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
if (BE (err != REG_NOERROR, 0))
return REG_ESPACE;
for (i = 0; i < newstate->nodes.nelem; i++)
{
int elem = newstate->nodes.elems[i];
if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
re_node_set_insert_last (&newstate->non_eps_nodes, elem);
}
spot = dfa->state_table + (hash & dfa->state_hash_mask);
if (BE (spot->alloc <= spot->num, 0))
{
int new_alloc = 2 * spot->num + 2;
@ -1547,11 +1533,18 @@ create_ci_newstate (dfa, nodes, hash)
int i;
reg_errcode_t err;
re_dfastate_t *newstate;
newstate = create_newstate_common (dfa, nodes, hash);
newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
if (BE (newstate == NULL, 0))
return NULL;
newstate->entrance_nodes = &newstate->nodes;
err = re_node_set_init_copy (&newstate->nodes, nodes);
if (BE (err != REG_NOERROR, 0))
{
re_free (newstate);
return NULL;
}
newstate->entrance_nodes = &newstate->nodes;
for (i = 0 ; i < nodes->nelem ; i++)
{
re_token_t *node = dfa->nodes + nodes->elems[i];
@ -1595,9 +1588,16 @@ create_cd_newstate (dfa, nodes, context, hash)
reg_errcode_t err;
re_dfastate_t *newstate;
newstate = create_newstate_common (dfa, nodes, hash);
newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
if (BE (newstate == NULL, 0))
return NULL;
err = re_node_set_init_copy (&newstate->nodes, nodes);
if (BE (err != REG_NOERROR, 0))
{
re_free (newstate);
return NULL;
}
newstate->context = context;
newstate->entrance_nodes = &newstate->nodes;
@ -1660,6 +1660,8 @@ static void
free_state (state)
re_dfastate_t *state;
{
re_node_set_free (&state->non_eps_nodes);
re_node_set_free (&state->inveclosure);
if (state->entrance_nodes != &state->nodes)
{
re_node_set_free (state->entrance_nodes);

View File

@ -388,18 +388,20 @@ static void re_string_translate_buffer (re_string_t *pstr) internal_function;
static void re_string_destruct (re_string_t *pstr) internal_function;
# ifdef RE_ENABLE_I18N
static int re_string_elem_size_at (const re_string_t *pstr, int idx)
internal_function;
internal_function __attribute ((pure));
static inline int re_string_char_size_at (const re_string_t *pstr, int idx)
internal_function;
internal_function __attribute ((pure));
static inline wint_t re_string_wchar_at (const re_string_t *pstr, int idx)
internal_function;
internal_function __attribute ((pure));
# endif /* RE_ENABLE_I18N */
static unsigned int re_string_context_at (const re_string_t *input, int idx,
int eflags) internal_function;
int eflags)
internal_function __attribute ((pure));
static unsigned char re_string_peek_byte_case (const re_string_t *pstr,
int idx) internal_function;
int idx)
internal_function __attribute ((pure));
static unsigned char re_string_fetch_byte_case (re_string_t *pstr)
internal_function;
internal_function __attribute ((pure));
#endif
#define re_string_peek_byte(pstr, offset) \
((pstr)->mbs[(pstr)->cur_idx + offset])
@ -481,6 +483,8 @@ struct re_dfastate_t
{
unsigned int hash;
re_node_set nodes;
re_node_set non_eps_nodes;
re_node_set inveclosure;
re_node_set *entrance_nodes;
struct re_dfastate_t **trtable;
unsigned int context : 4;
@ -665,8 +669,10 @@ static int re_node_set_insert (re_node_set *set, int elem) internal_function;
static int re_node_set_insert_last (re_node_set *set,
int elem) internal_function;
static int re_node_set_compare (const re_node_set *set1,
const re_node_set *set2) internal_function;
static int re_node_set_contains (const re_node_set *set, int elem) internal_function;
const re_node_set *set2)
internal_function __attribute ((pure));
static int re_node_set_contains (const re_node_set *set, int elem)
internal_function __attribute ((pure));
static void re_node_set_remove_at (re_node_set *set, int idx) internal_function;
#define re_node_set_remove(set,id) \
(re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))

View File

@ -73,7 +73,7 @@ static int proceed_next_node (const re_match_context_t *mctx,
int *pidx, int node, re_node_set *eps_via_nodes,
struct re_fail_stack_t *fs) internal_function;
static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
int str_idx, int *dests, int nregs,
int str_idx, int dest_node, int nregs,
regmatch_t *regs,
re_node_set *eps_via_nodes) internal_function;
static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
@ -1223,30 +1223,38 @@ proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
if (IS_EPSILON_NODE (dfa->nodes[node].type))
{
re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
int ndest, dest_nodes[2];
re_node_set *edests = &dfa->edests[node];
int dest_node;
err = re_node_set_insert (eps_via_nodes, node);
if (BE (err < 0, 0))
return -2;
/* Pick up valid destinations. */
for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
/* Pick up a valid destination, or return -1 if none is found. */
for (dest_node = -1, i = 0; i < edests->nelem; ++i)
{
int candidate = dfa->edests[node].elems[i];
int candidate = edests->elems[i];
if (!re_node_set_contains (cur_nodes, candidate))
continue;
dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
++ndest;
if (dest_node == -1)
dest_node = candidate;
else
{
/* In order to avoid infinite loop like "(a*)*", return the second
epsilon-transition if the first was already considered. */
if (re_node_set_contains (eps_via_nodes, dest_node))
return candidate;
/* Otherwise, push the second epsilon-transition on the fail stack. */
else if (fs != NULL
&& push_fail_stack (fs, *pidx, candidate, nregs, regs,
eps_via_nodes))
return -2;
/* We know we are going to exit. */
break;
}
}
if (ndest <= 1)
return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
/* In order to avoid infinite loop like "(a*)*". */
if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
return dest_nodes[1];
if (fs != NULL
&& push_fail_stack (fs, *pidx, dest_nodes, nregs, regs,
eps_via_nodes))
return -2;
return dest_nodes[0];
return dest_node;
}
else
{
@ -1304,9 +1312,9 @@ proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
}
static reg_errcode_t
push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
struct re_fail_stack_t *fs;
int str_idx, *dests, nregs;
int str_idx, dest_node, nregs;
regmatch_t *regs;
re_node_set *eps_via_nodes;
{
@ -1323,7 +1331,7 @@ push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
fs->stack = new_array;
}
fs->stack[num].idx = str_idx;
fs->stack[num].node = dests[1];
fs->stack[num].node = dest_node;
fs->stack[num].regs = re_malloc (regmatch_t, nregs);
if (fs->stack[num].regs == NULL)
return REG_ESPACE;
@ -1600,7 +1608,7 @@ build_sifted_states (mctx, sctx, str_idx, cur_dest)
re_node_set *cur_dest;
{
re_dfa_t *const dfa = mctx->dfa;
re_node_set *cur_src = &mctx->state_log[str_idx]->nodes;
re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
int i;
/* Then build the next sifted state.
@ -1608,16 +1616,20 @@ build_sifted_states (mctx, sctx, str_idx, cur_dest)
`sifted_states[str_idx]' with `cur_dest'.
Note:
`cur_dest' is the sifted state from `state_log[str_idx + 1]'.
`cur_src' points the node_set of the old `state_log[str_idx]'. */
`cur_src' points the node_set of the old `state_log[str_idx]'
(with the epsilon nodes pre-filtered out). */
for (i = 0; i < cur_src->nelem; i++)
{
int prev_node = cur_src->elems[i];
int naccepted = 0;
re_token_type_t type = dfa->nodes[prev_node].type;
int ret;
if (IS_EPSILON_NODE (type))
continue;
#if defined DEBUG || defined RE_ENABLE_I18N
re_token_type_t type = dfa->nodes[prev_node].type;
#endif
#ifdef DEBUG
assert (!IS_EPSILON_NODE (type));
#endif
#ifdef RE_ENABLE_I18N
/* If the node may accept `multi byte'. */
if (ACCEPT_MB_NODE (type))
@ -1764,26 +1776,24 @@ add_epsilon_src_nodes (dfa, dest_nodes, candidates)
re_node_set *dest_nodes;
const re_node_set *candidates;
{
reg_errcode_t err;
int src_idx;
re_node_set src_copy;
reg_errcode_t err = REG_NOERROR;
int i;
err = re_node_set_init_copy (&src_copy, dest_nodes);
re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
if (BE (err != REG_NOERROR, 0))
return err;
for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
if (!state->inveclosure.alloc)
{
err = re_node_set_add_intersect (dest_nodes, candidates,
dfa->inveclosures
+ src_copy.elems[src_idx]);
err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
if (BE (err != REG_NOERROR, 0))
{
re_node_set_free (&src_copy);
return err;
}
return REG_ESPACE;
for (i = 0; i < dest_nodes->nelem; i++)
re_node_set_merge (&state->inveclosure,
dfa->inveclosures + dest_nodes->elems[i]);
}
re_node_set_free (&src_copy);
return REG_NOERROR;
return re_node_set_add_intersect (dest_nodes, candidates,
&state->inveclosure);
}
static reg_errcode_t
@ -2935,7 +2945,7 @@ check_arrival (mctx, path, top_node, top_str, last_node, last_str,
if (cur_state)
{
err = check_arrival_add_next_nodes (mctx, str_idx,
&cur_state->nodes, &next_nodes);
&cur_state->non_eps_nodes, &next_nodes);
if (BE (err != REG_NOERROR, 0))
{
re_node_set_free (&next_nodes);
@ -3009,9 +3019,12 @@ check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
{
int naccepted = 0;
int cur_node = cur_nodes->elems[cur_idx];
#if defined DEBUG || defined RE_ENABLE_I18N
re_token_type_t type = dfa->nodes[cur_node].type;
if (IS_EPSILON_NODE (type))
continue;
#endif
#ifdef DEBUG
assert (!IS_EPSILON_NODE (type));
#endif
#ifdef RE_ENABLE_I18N
/* If the node may accept `multi byte'. */
if (ACCEPT_MB_NODE (type))
@ -3988,8 +4001,36 @@ check_node_accept (mctx, node, idx)
const re_token_t *node;
int idx;
{
re_dfa_t *const dfa = mctx->dfa;
unsigned char ch;
ch = re_string_byte_at (&mctx->input, idx);
switch (node->type)
{
case CHARACTER:
if (node->opr.c != ch)
return 0;
break;
case SIMPLE_BRACKET:
if (!bitset_contain (node->opr.sbcset, ch))
return 0;
break;
#ifdef RE_ENABLE_I18N
case OP_UTF8_PERIOD:
if (ch >= 0x80)
return 0;
/* FALLTHROUGH */
#endif
case OP_PERIOD:
if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
|| (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
return 0;
break;
default:
return 0;
}
if (node->constraint)
{
/* The node has constraints. Check whether the current context
@ -3999,25 +4040,8 @@ check_node_accept (mctx, node, idx)
if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
return 0;
}
ch = re_string_byte_at (&mctx->input, idx);
switch (node->type)
{
case CHARACTER:
return node->opr.c == ch;
case SIMPLE_BRACKET:
return bitset_contain (node->opr.sbcset, ch);
#ifdef RE_ENABLE_I18N
case OP_UTF8_PERIOD:
if (ch >= 0x80)
return 0;
/* FALLTHROUGH */
#endif
case OP_PERIOD:
return !((ch == '\n' && !(dfa->syntax & RE_DOT_NEWLINE))
|| (ch == '\0' && (dfa->syntax & RE_DOT_NOT_NULL)));
default:
return 0;
}
return 1;
}
/* Extend the buffers, if the buffers have run out. */