regex_automaton.h: _S_opcode_backref.

2013-08-18  Tim Shen  <timshen91@gmail.com>

	* include/bits/regex_automaton.h: _S_opcode_backref.
	* include/bits/regex_automaton.tcc: Backref automaton support.
	* include/bits/regex_compiler.tcc: Parsing support.
	* include/bits/regex_executor.h: Add _M_traits for _DFSExecutor.
	* include/bits/regex_executor.tcc: Add _S_opcode_backref support.
	* testsuite/28_regex/algorithms/regex_match/ecma/string_backref.cc: New.

From-SVN: r201825
This commit is contained in:
Tim Shen 2013-08-18 13:55:48 +00:00 committed by Tim Shen
parent 1d5755efee
commit ce645eb091
7 changed files with 181 additions and 28 deletions

View File

@ -1,3 +1,12 @@
2013-08-18 Tim Shen <timshen91@gmail.com>
* include/bits/regex_automaton.h: _S_opcode_backref.
* include/bits/regex_automaton.tcc: Backref automaton support.
* include/bits/regex_compiler.tcc: Parsing support.
* include/bits/regex_executor.h: Add _M_traits for _DFSExecutor.
* include/bits/regex_executor.tcc: Add _S_opcode_backref support.
* testsuite/28_regex/algorithms/regex_match/ecma/string_backref.cc: New.
2013-08-16 Tim Shen <timshen91@gmail.com>
* include/bits/regex.h (regex_traits<>::transform_primary):

View File

@ -53,6 +53,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
_S_opcode_unknown = 0,
_S_opcode_alternative = 1,
_S_opcode_backref = 2,
_S_opcode_subexpr_begin = 4,
_S_opcode_subexpr_end = 5,
_S_opcode_match = 100,
@ -66,11 +67,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
typedef int _OpcodeT;
typedef _Matcher<_CharT> _MatcherT;
_OpcodeT _M_opcode; // type of outgoing transition
_StateIdT _M_next; // outgoing transition
_StateIdT _M_alt; // for _S_opcode_alternative
unsigned int _M_subexpr; // for _S_opcode_subexpr_*
_MatcherT _M_matches; // for _S_opcode_match
_OpcodeT _M_opcode; // type of outgoing transition
_StateIdT _M_next; // outgoing transition
union // Since they are mutually exclusive.
{
_StateIdT _M_alt; // for _S_opcode_alternative
unsigned int _M_subexpr; // for _S_opcode_subexpr_*
unsigned int _M_backref_index; // for _S_opcode_backref
};
_MatcherT _M_matches; // for _S_opcode_match
explicit _State(_OpcodeT __opcode)
: _M_opcode(__opcode), _M_next(_S_invalid_state_id)
@ -82,8 +87,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{ }
_State(_OpcodeT __opcode, unsigned __index)
: _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__index)
{ }
: _M_opcode(__opcode), _M_next(_S_invalid_state_id)
{
if (__opcode == _S_opcode_subexpr_begin
|| __opcode == _S_opcode_subexpr_end)
_M_subexpr = __index;
else if (__opcode == _S_opcode_backref)
_M_backref_index = __index;
}
_State(_StateIdT __next, _StateIdT __alt)
: _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
@ -174,7 +185,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_M_insert_subexpr_begin()
{
auto __id = _M_subexpr_count++;
_M_paren_stack.push(__id);
_M_paren_stack.push_back(__id);
this->push_back(_StateT(_S_opcode_subexpr_begin, __id));
return this->size()-1;
}
@ -182,26 +193,25 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_StateIdT
_M_insert_subexpr_end()
{
this->push_back(_StateT(_S_opcode_subexpr_end, _M_paren_stack.top()));
_M_paren_stack.pop();
this->push_back(_StateT(_S_opcode_subexpr_end, _M_paren_stack.back()));
_M_paren_stack.pop_back();
return this->size()-1;
}
void
_M_set_backref(bool __b)
{ _M_has_backref = __b; }
_StateIdT
_M_insert_backref(unsigned int __index);
#ifdef _GLIBCXX_DEBUG
std::ostream&
_M_dot(std::ostream& __ostr) const;
#endif
_FlagT _M_flags;
_StateIdT _M_start_state;
_StateSet _M_accepting_states;
_SizeT _M_subexpr_count;
bool _M_has_backref;
std::stack<unsigned int> _M_paren_stack;
_FlagT _M_flags;
_StateIdT _M_start_state;
_StateSet _M_accepting_states;
_SizeT _M_subexpr_count;
bool _M_has_backref;
std::vector<unsigned int> _M_paren_stack;
};
/// Describes a sequence of one or more %_State, its current start

View File

@ -50,6 +50,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
case _S_opcode_subexpr_end:
ostr << "subexpr end next=" << _M_next << " index=" << _M_subexpr;
break;
case _S_opcode_backref:
ostr << "backref next=" << _M_next << " index=" << _M_backref_index;
break;
case _S_opcode_match:
ostr << "match next=" << _M_next;
break;
@ -87,6 +90,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
<< _M_subexpr << "\"];\n"
<< __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
break;
case _S_opcode_backref:
  // A back-reference state keeps its group number in _M_backref_index
  // (the union member the _State constructor assigns for this opcode),
  // so label the node with that member, matching the textual dump.
  __ostr << __id << " [label=\"" << __id << "\\nBACKREF "
         << _M_backref_index << "\"];\n"
         << __id << " -> " << _M_next << " [label=\"<match>\"];\n";
  break;
case _S_opcode_match:
__ostr << __id << " [label=\"" << __id << "\\nMATCH\"];\n"
<< __id << " -> " << _M_next << " [label=\"<match>\"];\n";
@ -115,6 +123,27 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
}
#endif
template<typename _CharT, typename _TraitsT>
  _StateIdT _NFA<_CharT, _TraitsT>::
  _M_insert_backref(unsigned int __index)
  {
    // Appends a back-reference state for sub-expression __index and
    // returns the id of the new state.
    //
    // A back-reference is rejected with error_backref in two situations:
    //   * __index is not below _M_subexpr_count, i.e. no sub-expression
    //     with that number has been opened so far;
    //   * __index is still on _M_paren_stack, i.e. the sub-expression it
    //     names has been opened but not yet closed.
    // For example, in "(a(b)(c\\1(d)))" the "\\1" sits inside the still
    // unfinished "(a.." group and is therefore invalid, whereas "\\2"
    // (the completed "(b)") would be accepted at the same position.
    bool __invalid = __index >= _M_subexpr_count;
    for (auto __open = _M_paren_stack.begin();
         !__invalid && __open != _M_paren_stack.end();
         ++__open)
      __invalid = (*__open == __index);
    if (__invalid)
      __throw_regex_error(regex_constants::error_backref);

    // Record that this automaton contains at least one back-reference.
    _M_has_backref = true;

    // The new state lands at the current end of the state vector.
    const auto __new_id = this->size();
    this->push_back(_StateT(_S_opcode_backref, __index));
    return __new_id;
  }
template<typename _CharT, typename _TraitsT>
_StateSeq<_CharT, _TraitsT>& _StateSeq<_CharT, _TraitsT>::
operator=(const _StateSeq& __rhs)

View File

@ -745,8 +745,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
if (_M_match_token(_ScannerT::_S_token_backref))
{
// __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value);
_M_state_store._M_set_backref(true);
//return true;
_M_stack.push(_StateSeqT(_M_state_store, _M_state_store.
_M_insert_backref(_M_cur_int_value(10))));
return true;
}
if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
{

View File

@ -82,10 +82,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__it.matched = false;
}
_BiIter _M_current;
_BiIter _M_end;
_BiIter _M_current;
_BiIter _M_end;
_ResultsT& _M_results;
_FlagT _M_flags;
_FlagT _M_flags;
};
template<typename _BiIter, typename _Alloc,
@ -96,16 +96,16 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
public:
typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
typedef _NFA<_CharT, _TraitsT> _RegexT;
typedef typename _BaseT::_ResultsT _ResultsT;
typedef typename _BaseT::_ResultsT _ResultsT;
typedef regex_constants::match_flag_type _FlagT;
_DFSExecutor(_BiIter __begin,
_BiIter __end,
_ResultsT& __results,
_ResultsT& __results,
const _RegexT& __nfa,
_FlagT __flags)
: _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
_M_nfa(__nfa)
_M_traits(_TraitsT()), _M_nfa(__nfa)
{ }
bool
@ -121,6 +121,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
bool
_M_dfs(_StateIdT __i);
_TraitsT _M_traits;
const _RegexT& _M_nfa;
};

View File

@ -63,8 +63,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__ret = _M_dfs<__match_mode>(__state._M_next);
break;
case _S_opcode_subexpr_end:
__ret = _M_dfs<__match_mode>(__state._M_next);
__results.at(__state._M_subexpr).second = __current;
__results.at(__state._M_subexpr).matched = true;
__ret = _M_dfs<__match_mode>(__state._M_next);
__results.at(__state._M_subexpr).matched = __ret;
break;
case _S_opcode_match:
@ -75,6 +76,30 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
--__current;
}
break;
// Back-reference: the input starting at __current must repeat the text
// captured by sub-expression __state._M_backref_index.
// First fetch the matched result from __results as __submatch;
// then compare it with
// (__current, __current + (__submatch.second - __submatch.first))
// If matched, keep going; else just return to try another state.
case _S_opcode_backref:
{
auto& __submatch = __results.at(__state._M_backref_index);
// The referenced group holds no capture on this path: leave __ret
// untouched and fall out of the switch.
// NOTE(review): ECMAScript treats a back-reference to a group that has
// not participated in the match as matching the empty string; here the
// path is abandoned instead -- confirm this is intended.
if (!__submatch.matched)
break;
// Advance __last from __current by the length of the capture, stopping
// early if the end of the input is reached first.
auto __last = __current;
for (auto __tmp = __submatch.first;
__last != __end && __tmp != __submatch.second;
++__tmp)
++__last;
// Compare the captured text with the candidate range using the values
// _M_traits.transform() produces for each of them.
if (_M_traits.transform(__submatch.first, __submatch.second)
== _M_traits.transform(__current, __last))
{
// Continue the search from the end of the repeated text; __current is
// put back afterwards regardless of the outcome of the recursion.
auto __backup = __current;
__current = __last;
__ret = _M_dfs<__match_mode>(__state._M_next);
__current = __backup;
}
}
break;
case _S_opcode_accept:
if (__match_mode)
__ret = __current == __end;

View File

@ -0,0 +1,78 @@
// { dg-options "-std=gnu++11" }
//
// 2013-08-10 Tim Shen <timshen91@gmail.com>
//
// Copyright (C) 2013 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// 28.11.2 regex_match
// Tests ECMAScript back-reference against a std::string.
#include <regex>
#include <testsuite_hooks.h>
using namespace std;
// Exercises ECMAScript back-references through regex_match on std::string.
//
//  * "([A-Z])\\1*" must match a run of one repeated upper-case letter and
//    capture that letter as sub-match 1.
//  * The same pattern must reject a string whose letters differ.
//  * A back-reference to a group that is still open must be rejected at
//    construction time with regex_constants::error_backref.
void
test01()
{
  bool test __attribute__((unused)) = true;

  std::regex re("([A-Z])\\1*");
  std::smatch m;

  {
    std::string s = "AAAA";
    // Check the return value as well as the sub-match flags.
    VERIFY( std::regex_match(s, m, re) );
    VERIFY( m[0].matched );
    VERIFY( m[1].matched );
    VERIFY( std::string(m[0].first, m[0].second) == "AAAA" );
    VERIFY( std::string(m[1].first, m[1].second) == "A" );
  }
  {
    std::string s = "BBBB";
    VERIFY( std::regex_match(s, m, re) );
    VERIFY( m[0].matched );
    VERIFY( m[1].matched );
    VERIFY( std::string(m[0].first, m[0].second) == "BBBB" );
    VERIFY( std::string(m[1].first, m[1].second) == "B" );
  }
  {
    std::string s = "BBBA";
    // The trailing 'A' does not repeat the captured 'B'.
    VERIFY( !std::regex_match(s, m, re) );
    VERIFY( !m[0].matched );
    VERIFY( !m[1].matched );
  }
  {
    // "\\1" appears while group 1 ("(a..") is still unfinished.  Only a
    // regex_error carrying error_backref counts as success; any other
    // exception propagates and fails the test instead of being swallowed.
    try
      {
        std::regex bad("(a(b)(c\\1(d)))");
        VERIFY( false );
      }
    catch (const std::regex_error& e)
      {
        VERIFY( e.code() == std::regex_constants::error_backref );
      }
  }
}
// Test driver: runs the back-reference checks in test01().
// Falling off the end of main is equivalent to returning 0.
int
main()
{
  test01();
}