diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 1d6cf7a5bd8..75e4841626c 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,19 @@ +2014-07-01 Tim Shen + + PR libstdc++/61424 + * include/bits/regex.tcc (__regex_algo_impl<>): Use DFS for ECMAScript, + not just regex containing back-references. + * include/bits/regex_compiler.tcc (_Compiler<>::_M_disjunction): + exchange _M_next and _M_alt for alternative operator, + making matching from left to right. + * include/bits/regex_executor.h (_State_info<>::_M_get_sol_pos): + Add position tracking fom DFS. + * include/bits/regex_executor.tcc (_Executor<>::_M_main_dispatch, + _Executor<>::_M_dfs): Likewise. + * include/bits/regex_scanner.h: Remove unused enum entry. + * testsuite/28_regex/algorithms/regex_search/61424.cc: New + testcase from PR. + 2014-06-30 Jason Merrill * libsupc++/cxxabi.h (class __pbase_type_info): __pointer_catch diff --git a/libstdc++-v3/include/bits/regex.tcc b/libstdc++-v3/include/bits/regex.tcc index a81f51750dc..3322379a5b8 100644 --- a/libstdc++-v3/include/bits/regex.tcc +++ b/libstdc++-v3/include/bits/regex.tcc @@ -71,6 +71,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // _GLIBCXX_REGEX_USE_THOMPSON_NFA if they need to use this approach. bool __ret; if (!__re._M_automaton->_M_has_backref + && !(__re._M_flags & regex_constants::ECMAScript) #ifndef _GLIBCXX_REGEX_USE_THOMPSON_NFA && __policy == _RegexExecutorPolicy::_S_alternate #endif diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 0df10cc1a8b..f15f7dd0f7b 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -103,9 +103,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __end = _M_nfa._M_insert_dummy(); __alt1._M_append(__end); __alt2._M_append(__end); + // __alt2 is state._M_next, __alt1 is state._M_alt. The executor + // executes _M_alt before _M_next, as well as executing left + // alternative before right one. _M_stack.push(_StateSeqT(_M_nfa, - _M_nfa._M_insert_alt(__alt1._M_start, - __alt2._M_start, false), + _M_nfa._M_insert_alt(__alt2._M_start, + __alt1._M_start, false), __end)); } } diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index 1991c0007a1..40d3443d985 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -173,6 +173,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void _M_queue(_StateIdT __i, const _ResultsVec& __res) { _M_match_queue.emplace_back(__i, __res); } + // Dummy implementations for BFS mode. + _BiIter* _M_get_sol_pos() { return nullptr; } + // Saves states that need to be considered for the next character. vector> _M_match_queue; // Indicates which states are already visited. @@ -192,11 +195,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_visited(_StateIdT) const { return false; } void _M_queue(_StateIdT, const _ResultsVec&) { } + _BiIter* _M_get_sol_pos() { return &_M_sol_pos; } + // To record current solution. _StateIdT _M_start; + _BiIter _M_sol_pos; }; - public: _ResultsVec _M_cur_results; _BiIter _M_current; diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index aefa8f47137..38b8ff27013 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -82,6 +82,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_main_dispatch(_Match_mode __match_mode, __dfs) { _M_has_sol = false; + *_M_states._M_get_sol_pos() = _BiIter(); _M_cur_results = _M_results; _M_dfs(__match_mode, _M_states._M_start); return _M_has_sol; @@ -338,7 +339,29 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION && (_M_flags & regex_constants::match_not_null)) _M_has_sol = false; if (_M_has_sol) - _M_results = _M_cur_results; + { + if (_M_nfa._M_flags & regex_constants::ECMAScript) + _M_results = _M_cur_results; + else // POSIX + { + _GLIBCXX_DEBUG_ASSERT(_M_states._M_get_sol_pos()); + // Here's POSIX's logic: match the longest one. However + // we never know which one (lhs or rhs of "|") is longer + // unless we try both of them and compare the results. + // The member variable _M_sol_pos records the end + // position of the last successful match. It's better + // to be larger, because POSIX regex is always greedy. + // TODO: This could be slow. + if (*_M_states._M_get_sol_pos() == _BiIter() + || std::distance(_M_begin, + *_M_states._M_get_sol_pos()) + < std::distance(_M_begin, _M_current)) + { + *_M_states._M_get_sol_pos() = _M_current; + _M_results = _M_cur_results; + } + } + } } else { @@ -354,9 +377,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } break; case _S_opcode_alternative: - _M_dfs(__match_mode, __state._M_alt); - if (!__dfs_mode || !_M_has_sol) - _M_dfs(__match_mode, __state._M_next); + if (_M_nfa._M_flags & regex_constants::ECMAScript) + { + // TODO: Let DFS support ECMAScript's alternative operation. + _GLIBCXX_DEBUG_ASSERT(!__dfs_mode); + _M_dfs(__match_mode, __state._M_alt); + // Pick lhs if it matches. Only try rhs if it doesn't. + if (!_M_has_sol) + _M_dfs(__match_mode, __state._M_next); + } + else + { + // Try both and compare the result. + // See "case _S_opcode_accept:" handling above. + _M_dfs(__match_mode, __state._M_alt); + auto __has_sol = _M_has_sol; + _M_has_sol = false; + _M_dfs(__match_mode, __state._M_next); + _M_has_sol |= __has_sol; + } break; default: _GLIBCXX_DEBUG_ASSERT(false); diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h index 6627db97ce0..5552226bdfb 100644 --- a/libstdc++-v3/include/bits/regex_scanner.h +++ b/libstdc++-v3/include/bits/regex_scanner.h @@ -67,7 +67,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _S_token_or, _S_token_closure0, _S_token_closure1, - _S_token_ungreedy, _S_token_line_begin, _S_token_line_end, _S_token_word_bound, // neg if _M_value[0] == 'n' diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc new file mode 100644 index 00000000000..bdccb4a454e --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_search/61424.cc @@ -0,0 +1,52 @@ +// { dg-options "-std=gnu++11" } + +// Copyright (C) 2014 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + +// PR libstdc++/61424 + +#include +#include +#include + +using namespace std; +using namespace __gnu_test; + +int main() +{ + regex_constants::syntax_option_type grammar[] = { + regex_constants::ECMAScript, regex_constants::extended, + regex_constants::awk, regex_constants::egrep + }; + + string sol[] = { + "tour", + "tournament", + "tournament", + "tournament", + }; + int i = 0; + for (auto g : grammar) + { + regex re("tour|tournament|tourn", g); + const char str[] = "tournament"; + cmatch m; + VERIFY(regex_search_debug(str, m, re)); + VERIFY(sol[i] == m[0]); + i++; + } +}