diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 0c1cb434f91..6715e531f0b 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,12 @@ +2013-08-18 Tim Shen + + * include/bits/regex_automaton.h: _S_opcode_backref. + * include/bits/regex_automaton.tcc: Backref automaton support. + * include/bits/regex_compiler.tcc: Parsing support. + * include/bits/regex_executor.h: Add _M_traits for _DFSExecutor. + * include/bits/regex_executor.tcc: Add _S_opcode_backref support. + * testsuite/28_regex/algorithms/regex_match/ecma/string_backref.cc: New. + 2013-08-16 Tim Shen * include/bits/regex.h (regex_traits<>::transform_primary): diff --git a/libstdc++-v3/include/bits/regex_automaton.h b/libstdc++-v3/include/bits/regex_automaton.h index 5817156aadf..b58071e7aea 100644 --- a/libstdc++-v3/include/bits/regex_automaton.h +++ b/libstdc++-v3/include/bits/regex_automaton.h @@ -53,6 +53,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { _S_opcode_unknown = 0, _S_opcode_alternative = 1, + _S_opcode_backref = 2, _S_opcode_subexpr_begin = 4, _S_opcode_subexpr_end = 5, _S_opcode_match = 100, @@ -66,11 +67,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION typedef int _OpcodeT; typedef _Matcher<_CharT> _MatcherT; - _OpcodeT _M_opcode; // type of outgoing transition - _StateIdT _M_next; // outgoing transition - _StateIdT _M_alt; // for _S_opcode_alternative - unsigned int _M_subexpr; // for _S_opcode_subexpr_* - _MatcherT _M_matches; // for _S_opcode_match + _OpcodeT _M_opcode; // type of outgoing transition + _StateIdT _M_next; // outgoing transition + union // Since they are mutually exclusive.
+ { + _StateIdT _M_alt; // for _S_opcode_alternative + unsigned int _M_subexpr; // for _S_opcode_subexpr_* + unsigned int _M_backref_index; // for _S_opcode_backref + }; + _MatcherT _M_matches; // for _S_opcode_match explicit _State(_OpcodeT __opcode) : _M_opcode(__opcode), _M_next(_S_invalid_state_id) @@ -82,8 +87,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { } _State(_OpcodeT __opcode, unsigned __index) - : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__index) - { } + : _M_opcode(__opcode), _M_next(_S_invalid_state_id) + { + if (__opcode == _S_opcode_subexpr_begin + || __opcode == _S_opcode_subexpr_end) + _M_subexpr = __index; + else if (__opcode == _S_opcode_backref) + _M_backref_index = __index; + } _State(_StateIdT __next, _StateIdT __alt) : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt) @@ -174,7 +185,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_insert_subexpr_begin() { auto __id = _M_subexpr_count++; - _M_paren_stack.push(__id); + _M_paren_stack.push_back(__id); this->push_back(_StateT(_S_opcode_subexpr_begin, __id)); return this->size()-1; } @@ -182,26 +193,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _StateIdT _M_insert_subexpr_end() { - this->push_back(_StateT(_S_opcode_subexpr_end, _M_paren_stack.top())); - _M_paren_stack.pop(); + this->push_back(_StateT(_S_opcode_subexpr_end, _M_paren_stack.back())); + _M_paren_stack.pop_back(); return this->size()-1; } - void - _M_set_backref(bool __b) - { _M_has_backref = __b; } + _StateIdT + _M_insert_backref(unsigned int __index); #ifdef _GLIBCXX_DEBUG std::ostream& _M_dot(std::ostream& __ostr) const; #endif - _FlagT _M_flags; - _StateIdT _M_start_state; - _StateSet _M_accepting_states; - _SizeT _M_subexpr_count; - bool _M_has_backref; - std::stack _M_paren_stack; + _FlagT _M_flags; + _StateIdT _M_start_state; + _StateSet _M_accepting_states; + _SizeT _M_subexpr_count; + bool _M_has_backref; + std::vector _M_paren_stack; }; /// Describes a sequence of one or more %_State, its current start diff 
--git a/libstdc++-v3/include/bits/regex_automaton.tcc b/libstdc++-v3/include/bits/regex_automaton.tcc index cf9c8eb3147..40a154790d2 100644 --- a/libstdc++-v3/include/bits/regex_automaton.tcc +++ b/libstdc++-v3/include/bits/regex_automaton.tcc @@ -50,6 +50,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION case _S_opcode_subexpr_end: ostr << "subexpr end next=" << _M_next << " index=" << _M_subexpr; break; + case _S_opcode_backref: + ostr << "backref next=" << _M_next << " index=" << _M_backref_index; + break; case _S_opcode_match: ostr << "match next=" << _M_next; break; @@ -87,6 +90,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION << _M_subexpr << "\"];\n" << __id << " -> " << _M_next << " [label=\"epsilon\"];\n"; break; + case _S_opcode_backref: + __ostr << __id << " [label=\"" << __id << "\\nBACKREF " + << _M_subexpr << "\"];\n" + << __id << " -> " << _M_next << " [label=\"\"];\n"; + break; case _S_opcode_match: __ostr << __id << " [label=\"" << __id << "\\nMATCH\"];\n" << __id << " -> " << _M_next << " [label=\"\"];\n"; @@ -115,6 +123,27 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } #endif + template + _StateIdT _NFA<_CharT, _TraitsT>:: + _M_insert_backref(unsigned int __index) + { + // To figure out whether a backref is valid, a stack is used to store + // unfinished sub-expressions. For example, when parsing + // "(a(b)(c\\1(d)))" at '\\1', _M_subexpr_count is 3, indicating that 3 + // sub expressions are parsed or partially parsed(in the stack), aka, + // "(a..", "(b)" and "(c.."). + // _M_paren_stack is {1, 3}, for incomplete "(a.." and "(c..". At this + // time, "\\2" is valid, but "\\1" and "\\3" are not. 
+ if (__index >= _M_subexpr_count) + __throw_regex_error(regex_constants::error_backref); + for (auto __it : _M_paren_stack) + if (__index == __it) + __throw_regex_error(regex_constants::error_backref); + _M_has_backref = true; + this->push_back(_StateT(_S_opcode_backref, __index)); + return this->size()-1; + } + template _StateSeq<_CharT, _TraitsT>& _StateSeq<_CharT, _TraitsT>:: operator=(const _StateSeq& __rhs) diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 04301e49346..2a5e2c68655 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -745,8 +745,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (_M_match_token(_ScannerT::_S_token_backref)) { // __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value); - _M_state_store._M_set_backref(true); - //return true; + _M_stack.push(_StateSeqT(_M_state_store, _M_state_store. + _M_insert_backref(_M_cur_int_value(10)))); + return true; } if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) { diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index afac8d03816..0006a29b614 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -82,10 +82,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __it.matched = false; } - _BiIter _M_current; - _BiIter _M_end; + _BiIter _M_current; + _BiIter _M_end; _ResultsT& _M_results; - _FlagT _M_flags; + _FlagT _M_flags; }; template _BaseT; typedef _NFA<_CharT, _TraitsT> _RegexT; - typedef typename _BaseT::_ResultsT _ResultsT; + typedef typename _BaseT::_ResultsT _ResultsT; typedef regex_constants::match_flag_type _FlagT; _DFSExecutor(_BiIter __begin, _BiIter __end, - _ResultsT& __results, + _ResultsT& __results, const _RegexT& __nfa, _FlagT __flags) : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()), - _M_nfa(__nfa) + _M_traits(_TraitsT()), _M_nfa(__nfa) { } bool @@ -121,6 +121,7 @@ 
_GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_dfs(_StateIdT __i); + _TraitsT _M_traits; const _RegexT& _M_nfa; }; diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index 32d153762e4..08b4915a3e3 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -63,8 +63,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __ret = _M_dfs<__match_mode>(__state._M_next); break; case _S_opcode_subexpr_end: - __ret = _M_dfs<__match_mode>(__state._M_next); __results.at(__state._M_subexpr).second = __current; + __results.at(__state._M_subexpr).matched = true; + __ret = _M_dfs<__match_mode>(__state._M_next); __results.at(__state._M_subexpr).matched = __ret; break; case _S_opcode_match: @@ -75,6 +76,30 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION --__current; } break; + // First fetch the matched result from __results as __submatch; + // then compare it with + // (__current, __current + (__submatch.second - __submatch.first)) + // If matched, keep going; else just return to try another state. 
+ case _S_opcode_backref: + { + auto& __submatch = __results.at(__state._M_backref_index); + if (!__submatch.matched) + break; + auto __last = __current; + for (auto __tmp = __submatch.first; + __last != __end && __tmp != __submatch.second; + ++__tmp) + ++__last; + if (_M_traits.transform(__submatch.first, __submatch.second) + == _M_traits.transform(__current, __last)) + { + auto __backup = __current; + __current = __last; + __ret = _M_dfs<__match_mode>(__state._M_next); + __current = __backup; + } + } + break; case _S_opcode_accept: if (__match_mode) __ret = __current == __end; diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/string_backref.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/string_backref.cc new file mode 100644 index 00000000000..a828fea93c6 --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/string_backref.cc @@ -0,0 +1,78 @@ +// { dg-options "-std=gnu++11" } + +// +// 2013-08-10 Tim Shen +// +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// 28.11.2 regex_match +// Tests ECMAScript back-reference against a std::string.
+ +#include +#include + +using namespace std; + +void +test01() +{ + bool test __attribute__((unused)) = true; + + regex re("([A-Z])\\1*"); + smatch m; + { + string s = "AAAA"; + regex_match(s, m, re); + VERIFY( m[0].matched ); + VERIFY( m[1].matched ); + VERIFY( std::string(m[0].first, m[0].second) == "AAAA" ); + VERIFY( std::string(m[1].first, m[1].second) == "A" ); + } + { + string s = "BBBB"; + regex_match(s, m, re); + VERIFY( m[0].matched ); + VERIFY( m[1].matched ); + VERIFY( std::string(m[0].first, m[0].second) == "BBBB" ); + VERIFY( std::string(m[1].first, m[1].second) == "B" ); + } + { + string s = "BBBA"; + regex_match(s, m, re); + VERIFY( !m[0].matched ); + VERIFY( !m[1].matched ); + } + { + try + { + regex re("(a(b)(c\\1(d)))"); + VERIFY( false ); + } + catch (...) + { + VERIFY( true ); + } + } +} + +int +main() +{ + test01(); + return 0; +}