re PR libstdc++/61424 (std::regex matches right to left, not leftmost longest)
PR libstdc++/61424 * include/bits/regex.tcc (__regex_algo_impl<>): Use DFS for ECMAScript, not just regex containing back-references. * include/bits/regex_compiler.tcc (_Compiler<>::_M_disjunction): exchange _M_next and _M_alt for alternative operator, making matching from left to right. * include/bits/regex_executor.h (_State_info<>::_M_get_sol_pos): Add position tracking from DFS. * include/bits/regex_executor.tcc (_Executor<>::_M_main_dispatch, _Executor<>::_M_dfs): Likewise. * include/bits/regex_scanner.h: Remove unused enum entry. * testsuite/28_regex/algorithms/regex_search/61424.cc: New testcase from PR. From-SVN: r212184
This commit is contained in:
parent
25f28ff0ea
commit
ad9ec7b3c5
@ -1,3 +1,19 @@
|
||||
2014-07-01 Tim Shen <timshen@google.com>
|
||||
|
||||
PR libstdc++/61424
|
||||
* include/bits/regex.tcc (__regex_algo_impl<>): Use DFS for ECMAScript,
|
||||
not just regex containing back-references.
|
||||
* include/bits/regex_compiler.tcc (_Compiler<>::_M_disjunction):
|
||||
exchange _M_next and _M_alt for alternative operator,
|
||||
making matching from left to right.
|
||||
* include/bits/regex_executor.h (_State_info<>::_M_get_sol_pos):
|
||||
Add position tracking from DFS.
|
||||
* include/bits/regex_executor.tcc (_Executor<>::_M_main_dispatch,
|
||||
_Executor<>::_M_dfs): Likewise.
|
||||
* include/bits/regex_scanner.h: Remove unused enum entry.
|
||||
* testsuite/28_regex/algorithms/regex_search/61424.cc: New
|
||||
testcase from PR.
|
||||
|
||||
2014-06-30 Jason Merrill <jason@redhat.com>
|
||||
|
||||
* libsupc++/cxxabi.h (class __pbase_type_info): __pointer_catch
|
||||
|
@ -71,6 +71,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
// _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach.
|
||||
bool __ret;
|
||||
if (!__re._M_automaton->_M_has_backref
|
||||
&& !(__re._M_flags & regex_constants::ECMAScript)
|
||||
#ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA
|
||||
&& __policy == _RegexExecutorPolicy::_S_alternate
|
||||
#endif
|
||||
|
@ -103,9 +103,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
auto __end = _M_nfa._M_insert_dummy();
|
||||
__alt1._M_append(__end);
|
||||
__alt2._M_append(__end);
|
||||
// __alt2 is state._M_next, __alt1 is state._M_alt. The executor
|
||||
// executes _M_alt before _M_next, as well as executing left
|
||||
// alternative before right one.
|
||||
_M_stack.push(_StateSeqT(_M_nfa,
|
||||
_M_nfa._M_insert_alt(__alt1._M_start,
|
||||
__alt2._M_start, false),
|
||||
_M_nfa._M_insert_alt(__alt2._M_start,
|
||||
__alt1._M_start, false),
|
||||
__end));
|
||||
}
|
||||
}
|
||||
|
@ -173,6 +173,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
void _M_queue(_StateIdT __i, const _ResultsVec& __res)
|
||||
{ _M_match_queue.emplace_back(__i, __res); }
|
||||
|
||||
// Dummy implementations for BFS mode.
|
||||
_BiIter* _M_get_sol_pos() { return nullptr; }
|
||||
|
||||
// Saves states that need to be considered for the next character.
|
||||
vector<pair<_StateIdT, _ResultsVec>> _M_match_queue;
|
||||
// Indicates which states are already visited.
|
||||
@ -192,11 +195,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
bool _M_visited(_StateIdT) const { return false; }
|
||||
void _M_queue(_StateIdT, const _ResultsVec&) { }
|
||||
|
||||
_BiIter* _M_get_sol_pos() { return &_M_sol_pos; }
|
||||
|
||||
// To record current solution.
|
||||
_StateIdT _M_start;
|
||||
_BiIter _M_sol_pos;
|
||||
};
|
||||
|
||||
|
||||
public:
|
||||
_ResultsVec _M_cur_results;
|
||||
_BiIter _M_current;
|
||||
|
@ -82,6 +82,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
_M_main_dispatch(_Match_mode __match_mode, __dfs)
|
||||
{
|
||||
_M_has_sol = false;
|
||||
*_M_states._M_get_sol_pos() = _BiIter();
|
||||
_M_cur_results = _M_results;
|
||||
_M_dfs(__match_mode, _M_states._M_start);
|
||||
return _M_has_sol;
|
||||
@ -338,7 +339,29 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
&& (_M_flags & regex_constants::match_not_null))
|
||||
_M_has_sol = false;
|
||||
if (_M_has_sol)
|
||||
_M_results = _M_cur_results;
|
||||
{
|
||||
if (_M_nfa._M_flags & regex_constants::ECMAScript)
|
||||
_M_results = _M_cur_results;
|
||||
else // POSIX
|
||||
{
|
||||
_GLIBCXX_DEBUG_ASSERT(_M_states._M_get_sol_pos());
|
||||
// Here's POSIX's logic: match the longest one. However
|
||||
// we never know which one (lhs or rhs of "|") is longer
|
||||
// unless we try both of them and compare the results.
|
||||
// The member variable _M_sol_pos records the end
|
||||
// position of the last successful match. It's better
|
||||
// to be larger, because POSIX regex is always greedy.
|
||||
// TODO: This could be slow.
|
||||
if (*_M_states._M_get_sol_pos() == _BiIter()
|
||||
|| std::distance(_M_begin,
|
||||
*_M_states._M_get_sol_pos())
|
||||
< std::distance(_M_begin, _M_current))
|
||||
{
|
||||
*_M_states._M_get_sol_pos() = _M_current;
|
||||
_M_results = _M_cur_results;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -354,9 +377,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
}
|
||||
break;
|
||||
case _S_opcode_alternative:
|
||||
_M_dfs(__match_mode, __state._M_alt);
|
||||
if (!__dfs_mode || !_M_has_sol)
|
||||
_M_dfs(__match_mode, __state._M_next);
|
||||
if (_M_nfa._M_flags & regex_constants::ECMAScript)
|
||||
{
|
||||
// TODO: Let BFS support ECMAScript's alternative operation.
|
||||
_GLIBCXX_DEBUG_ASSERT(__dfs_mode);
|
||||
_M_dfs(__match_mode, __state._M_alt);
|
||||
// Pick lhs if it matches. Only try rhs if it doesn't.
|
||||
if (!_M_has_sol)
|
||||
_M_dfs(__match_mode, __state._M_next);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Try both and compare the result.
|
||||
// See "case _S_opcode_accept:" handling above.
|
||||
_M_dfs(__match_mode, __state._M_alt);
|
||||
auto __has_sol = _M_has_sol;
|
||||
_M_has_sol = false;
|
||||
_M_dfs(__match_mode, __state._M_next);
|
||||
_M_has_sol |= __has_sol;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
_GLIBCXX_DEBUG_ASSERT(false);
|
||||
|
@ -67,7 +67,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
|
||||
_S_token_or,
|
||||
_S_token_closure0,
|
||||
_S_token_closure1,
|
||||
_S_token_ungreedy,
|
||||
_S_token_line_begin,
|
||||
_S_token_line_end,
|
||||
_S_token_word_bound, // neg if _M_value[0] == 'n'
|
||||
|
@ -0,0 +1,52 @@
|
||||
// { dg-options "-std=gnu++11" }
|
||||
|
||||
// Copyright (C) 2014 Free Software Foundation, Inc.
|
||||
//
|
||||
// This file is part of the GNU ISO C++ Library. This library is free
|
||||
// software; you can redistribute it and/or modify it under the
|
||||
// terms of the GNU General Public License as published by the
|
||||
// Free Software Foundation; either version 3, or (at your option)
|
||||
// any later version.
|
||||
|
||||
// This library is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
|
||||
// You should have received a copy of the GNU General Public License along
|
||||
// with this library; see the file COPYING3. If not see
|
||||
// <http://www.gnu.org/licenses/>.
|
||||
|
||||
// PR libstdc++/61424
|
||||
|
||||
#include <regex>
|
||||
#include <testsuite_hooks.h>
|
||||
#include <testsuite_regex.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace __gnu_test;
|
||||
|
||||
int main()
|
||||
{
|
||||
regex_constants::syntax_option_type grammar[] = {
|
||||
regex_constants::ECMAScript, regex_constants::extended,
|
||||
regex_constants::awk, regex_constants::egrep
|
||||
};
|
||||
|
||||
string sol[] = {
|
||||
"tour",
|
||||
"tournament",
|
||||
"tournament",
|
||||
"tournament",
|
||||
};
|
||||
int i = 0;
|
||||
for (auto g : grammar)
|
||||
{
|
||||
regex re("tour|tournament|tourn", g);
|
||||
const char str[] = "tournament";
|
||||
cmatch m;
|
||||
VERIFY(regex_search_debug(str, m, re));
|
||||
VERIFY(sol[i] == m[0]);
|
||||
i++;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user