Makefile.am: Add regex_scanner.{h,tcc}.

2013-08-26  Tim Shen  <timshen91@gmail.com>

	* include/Makefile.am: Add regex_scanner.{h,tcc}.
	* include/Makefile.in: Regenerate.
	* include/bits/regex.h (match_search): Handle the `__first == __last`
	  situation correctly.
	* include/bits/regex_compiler.h: Move _Scanner...
	* include/bits/regex_scanner.h: ...to here. New.
	* include/bits/regex_compiler.tcc: Move _Scanner...
	* include/bits/regex_scanner.tcc: ...to here, too. New.
	* include/bits/regex_executor.tcc: Use value instead of reference for
	  submatch.
	* include/std/regex: Add regex_scanner.h
	* testsuite/28_regex/algorithms/regex_match/awk/cstring_01.cc: New.
	* testsuite/28_regex/algorithms/regex_match/basic/empty_range.cc: New.
	* testsuite/28_regex/algorithms/regex_match/ecma/cstring_hex.cc: New.
	* testsuite/28_regex/algorithms/regex_match/ecma/empty_range.cc: New.
	* testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc: New.

From-SVN: r202015
This commit is contained in:
Tim Shen 2013-08-27 02:49:22 +00:00 committed by Tim Shen
parent fd91cfe3e0
commit 33fbbb766c
15 changed files with 1289 additions and 790 deletions

View File

@ -1,3 +1,22 @@
2013-08-26 Tim Shen <timshen91@gmail.com>
* include/Makefile.am: Add regex_scanner.{h,tcc}.
* include/Makefile.in: Regenerate.
* include/bits/regex.h (match_search): Handle the `__first == __last`
situation correctly.
* include/bits/regex_compiler.h: Move _Scanner...
* include/bits/regex_scanner.h: ...to here. New.
* include/bits/regex_compiler.tcc: Move _Scanner...
* include/bits/regex_scanner.tcc: ...to here, too. New.
* include/bits/regex_executor.tcc: Use value instead of reference for
submatch.
* include/std/regex: Add regex_scanner.h
* testsuite/28_regex/algorithms/regex_match/awk/cstring_01.cc: New.
* testsuite/28_regex/algorithms/regex_match/basic/empty_range.cc: New.
* testsuite/28_regex/algorithms/regex_match/ecma/cstring_hex.cc: New.
* testsuite/28_regex/algorithms/regex_match/ecma/empty_range.cc: New.
* testsuite/28_regex/algorithms/regex_search/ecma/string_01.cc: New.
2013-08-22 Tim Shen <timshen91@gmail.com>
* include/bits/regex.h: Replace 8 spaces in indentation with a tab.

View File

@ -128,6 +128,8 @@ bits_headers = \
${bits_srcdir}/regex.h \
${bits_srcdir}/regex_constants.h \
${bits_srcdir}/regex_error.h \
${bits_srcdir}/regex_scanner.h \
${bits_srcdir}/regex_scanner.tcc \
${bits_srcdir}/regex_automaton.h \
${bits_srcdir}/regex_automaton.tcc \
${bits_srcdir}/regex_compiler.h \

View File

@ -395,6 +395,8 @@ bits_headers = \
${bits_srcdir}/regex.h \
${bits_srcdir}/regex_constants.h \
${bits_srcdir}/regex_error.h \
${bits_srcdir}/regex_scanner.h \
${bits_srcdir}/regex_scanner.tcc \
${bits_srcdir}/regex_automaton.h \
${bits_srcdir}/regex_automaton.tcc \
${bits_srcdir}/regex_compiler.h \

View File

@ -740,11 +740,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
* @throws regex_error if @p [__first, __last) is not a valid regular
* expression.
*/
template<typename _InputIterator>
basic_regex(_InputIterator __first, _InputIterator __last,
template<typename _FwdIter>
basic_regex(_FwdIter __first, _FwdIter __last,
flag_type __f = ECMAScript)
: _M_flags(__f),
_M_automaton(__detail::_Compiler<_InputIterator, _Ch_type, _Rx_traits>
_M_automaton(__detail::_Compiler<_FwdIter, _Ch_type, _Rx_traits>
(__first, __last, _M_traits, _M_flags)._M_get_nfa())
{ }
@ -2371,7 +2371,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{
if (__re._M_automaton == nullptr)
return false;
for (auto __cur = __first; __cur != __last; ++__cur) // Any KMP-like algo?
auto __cur = __first;
// Continue when __cur == __last
do
{
__detail::__get_executor(__cur, __last, __m, __re, __flags)
->_M_search_from_first();
@ -2391,10 +2393,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return true;
}
}
while (__cur++ != __last);
return false;
}
/**
* Searches for a regular expression within a range.
* @param __first [IN] The start of the string to search.

View File

@ -39,197 +39,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
* @{
*/
/// Matches a character range (bracket expression)
///
/// Collects the individual characters, ranges and character classes that
/// make up a bracket expression.  The call operator, which tests a
/// character against what has been collected, is defined out of line.
template<typename _CharT, typename _TraitsT>
  struct _BracketMatcher
  {
    typedef typename _TraitsT::char_class_type  _CharClassT;
    typedef typename _TraitsT::string_type      _StringT;
    typedef regex_constants::syntax_option_type _FlagT;

    /// Stores a copy of the traits object together with the negation flag
    /// and the syntax flags; the class set starts out empty (0).
    // Initialisers are listed in member declaration order, which is the
    // order in which they actually run.
    explicit
    _BracketMatcher(bool __is_non_matching,
                    const _TraitsT& __t,
                    _FlagT __flags)
    : _M_traits(__t), _M_flags(__flags),
      _M_is_non_matching(__is_non_matching), _M_class_set(0)
    { }

    /// Tests a character against the collected sets (defined elsewhere).
    bool
    operator()(_CharT) const;

    /// Adds a single character, translated through the traits object
    /// first when the collate flag is set.
    void
    _M_add_char(_CharT __c)
    {
      if (_M_flags & regex_constants::collate)
        {
          if (_M_is_icase())
            _M_char_set.push_back(_M_traits.translate_nocase(__c));
          else
            _M_char_set.push_back(_M_traits.translate(__c));
        }
      else
        _M_char_set.push_back(__c);
    }

    /// Adds the collating element named by @p __s.
    /// Throws regex_error(error_collate) when the name is not recognised.
    void
    _M_add_collating_element(const _StringT& __s)
    {
      // Pointer arithmetic on data() instead of &*end(): dereferencing
      // the end iterator is not allowed.
      auto __st = _M_traits.lookup_collatename(__s.data(),
                                               __s.data() + __s.size());
      if (__st.empty())
        __throw_regex_error(regex_constants::error_collate);
      // TODO: digraph
      _M_char_set.push_back(__st[0]);
    }

    /// Adds the equivalence class named by @p __s by forwarding its
    /// primary sort key to _M_add_character_class.
    void
    _M_add_equivalence_class(const _StringT& __s)
    {
      _M_add_character_class(
        _M_traits.transform_primary(__s.data(),
                                    __s.data() + __s.size()));
    }

    /// Adds the character class named by @p __s to the class mask.
    /// Throws regex_error(error_ctype) when the lookup yields 0.
    void
    _M_add_character_class(const _StringT& __s)
    {
      auto __st = _M_traits.
        lookup_classname(__s.data(), __s.data() + __s.size(),
                         _M_is_icase());
      if (__st == 0)
        __throw_regex_error(regex_constants::error_ctype);
      _M_class_set |= __st;
    }

    /// Records the range [__l, __r] as a pair of transformed strings.
    void
    _M_make_range(_CharT __l, _CharT __r)
    { _M_range_set.push_back(make_pair(_M_get_str(__l), _M_get_str(__r))); }

    /// True when the icase syntax flag is set.
    bool
    _M_is_icase() const
    { return _M_flags & regex_constants::icase; }

    /// Returns the traits transform() of the one-character string holding
    /// the (case-)translated @p __c.
    _StringT
    _M_get_str(_CharT __c) const
    {
      auto __s = _StringT(1,
                          _M_is_icase()
                          ? _M_traits.translate_nocase(__c)
                          : _M_traits.translate(__c));
      return _M_traits.transform(__s.begin(), __s.end());
    }

    _TraitsT                              _M_traits;          // private copy
    _FlagT                                _M_flags;
    bool                                  _M_is_non_matching;
    std::vector<_CharT>                   _M_char_set;        // single chars
    std::vector<pair<_StringT, _StringT>> _M_range_set;       // sort-key ranges
    _CharClassT                           _M_class_set;       // class mask
  };
/**
 * @brief struct _Scanner. Scans an input range for regex tokens.
 *
 * The %_Scanner class interprets the regular expression pattern in
 * the input range passed to its constructor as a sequence of parse
 * tokens passed to the regular expression compiler. The sequence
 * of tokens provided depends on the flag settings passed to the
 * constructor: different regular expression grammars will interpret
 * the same input pattern in syntactically different ways.
 */
template<typename _InputIter>
  class _Scanner
  {
  public:
    typedef unsigned int                                          _StateT;
    typedef typename std::iterator_traits<_InputIter>::value_type _CharT;
    typedef std::basic_string<_CharT>                             _StringT;
    typedef regex_constants::syntax_option_type                   _FlagT;
    typedef const std::ctype<_CharT>                              _CtypeT;

    /// Token types returned from the scanner.
    enum _TokenT
    {
      _S_token_anychar,
      _S_token_backref,
      _S_token_bracket_begin,
      _S_token_bracket_inverse_begin,
      _S_token_bracket_end,
      _S_token_char_class_name,
      _S_token_closure0,
      _S_token_closure1,
      _S_token_collelem_multi,
      _S_token_collelem_single,
      _S_token_collsymbol,
      _S_token_comma,
      _S_token_dash,
      _S_token_dup_count,
      _S_token_eof,
      _S_token_equiv_class_name,
      _S_token_interval_begin,
      _S_token_interval_end,
      _S_token_line_begin,
      _S_token_line_end,
      _S_token_opt,
      _S_token_or,
      _S_token_ord_char,
      _S_token_subexpr_begin,
      _S_token_subexpr_end,
      _S_token_word_begin,
      _S_token_word_end,
      _S_token_unknown
    };

    /// Binds the scanner to [__begin, __end) and immediately scans the
    /// first token.
    // _M_curToken gets a definite starting value: not every scanning
    // path assigns a token, and _M_token() must never return an
    // indeterminate value.
    _Scanner(_InputIter __begin, _InputIter __end,
             _FlagT __flags, std::locale __loc)
    : _M_current(__begin) , _M_end(__end) , _M_flags(__flags),
      _M_ctype(std::use_facet<_CtypeT>(__loc)),
      _M_curToken(_S_token_unknown), _M_state(0)
    { _M_advance(); }

    /// Scans the next token (defined out of line).
    void
    _M_advance();

    /// The token produced by the most recent scan.
    _TokenT
    _M_token() const
    { return _M_curToken; }

    /// The text attached to the most recent token, for tokens that
    /// carry one.
    const _StringT&
    _M_value() const
    { return _M_curValue; }

#ifdef _GLIBCXX_DEBUG
    /// Debug helper (defined out of line).
    std::ostream&
    _M_print(std::ostream&);
#endif

  private:
    void
    _M_eat_escape();

    void
    _M_scan_in_brace();

    void
    _M_scan_in_bracket();

    void
    _M_eat_charclass();

    void
    _M_eat_equivclass();

    void
    _M_eat_collsymbol();

    // Bits of _M_state.
    static constexpr _StateT _S_state_in_brace    = 1 << 0;
    static constexpr _StateT _S_state_in_bracket  = 1 << 1;

    _InputIter  _M_current;   // next character to scan
    _InputIter  _M_end;       // end of the pattern
    _FlagT      _M_flags;     // syntax flags given to the constructor
    _CtypeT&    _M_ctype;     // ctype facet of the constructor's locale
    _TokenT     _M_curToken;
    _StringT    _M_curValue;
    _StateT     _M_state;     // combination of the _S_state_* bits
  };
struct _BracketMatcher;
/// Builds an NFA from an input iterator interval.
template<typename _InputIter, typename _CharT, typename _TraitsT>
template<typename _FwdIter, typename _CharT, typename _TraitsT>
class _Compiler
{
public:
@ -237,7 +51,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
typedef _NFA<_CharT, _TraitsT> _RegexT;
typedef regex_constants::syntax_option_type _FlagT;
_Compiler(_InputIter __b, _InputIter __e,
_Compiler(_FwdIter __b, _FwdIter __e,
const _TraitsT& __traits, _FlagT __flags);
std::shared_ptr<_RegexT>
@ -245,7 +59,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
{ return std::shared_ptr<_RegexT>(new _RegexT(_M_state_store)); }
private:
typedef _Scanner<_InputIter> _ScannerT;
typedef _Scanner<_FwdIter> _ScannerT;
typedef typename _ScannerT::_TokenT _TokenT;
typedef _StateSeq<_CharT, _TraitsT> _StateSeqT;
typedef std::stack<_StateSeqT, std::vector<_StateSeqT>> _StackT;
@ -276,7 +90,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
bool
_M_bracket_expression();
bool
void
_M_bracket_list(_BMatcherT& __matcher);
bool
@ -303,14 +117,111 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
int
_M_cur_int_value(int __radix);
bool
_M_try_char();
_CharT
_M_get_char();
const _TraitsT& _M_traits;
_ScannerT _M_scanner;
_StringT _M_cur_value;
_StringT _M_value;
_RegexT _M_state_store;
_StackT _M_stack;
_FlagT _M_flags;
};
/// Matches a character range (bracket expression)
///
/// Collects the individual characters, ranges and character classes that
/// make up a bracket expression.  The call operator, which tests a
/// character against what has been collected, is defined out of line.
template<typename _CharT, typename _TraitsT>
  struct _BracketMatcher
  {
    typedef typename _TraitsT::char_class_type  _CharClassT;
    typedef typename _TraitsT::string_type      _StringT;
    typedef regex_constants::syntax_option_type _FlagT;

    /// Stores a copy of the traits object together with the negation flag
    /// and the syntax flags; the class set starts out empty (0).
    // Initialisers are listed in member declaration order, which is the
    // order in which they actually run.
    explicit
    _BracketMatcher(bool __is_non_matching,
                    const _TraitsT& __t,
                    _FlagT __flags)
    : _M_traits(__t), _M_flags(__flags),
      _M_is_non_matching(__is_non_matching), _M_class_set(0)
    { }

    /// Tests a character against the collected sets (defined elsewhere).
    bool
    operator()(_CharT) const;

    /// Adds a single character after passing it through _M_translate.
    void
    _M_add_char(_CharT __c)
    { _M_char_set.push_back(_M_translate(__c)); }

    /// Adds the collating element named by @p __s.
    /// Throws regex_error(error_collate) when the name is not recognised.
    void
    _M_add_collating_element(const _StringT& __s)
    {
      auto __st = _M_traits.lookup_collatename(__s.data(),
                                               __s.data() + __s.size());
      if (__st.empty())
        __throw_regex_error(regex_constants::error_collate);
      // TODO: digraph
      _M_char_set.push_back(__st[0]);
    }

    /// Adds the equivalence class named by @p __s by forwarding its
    /// primary sort key to _M_add_character_class.
    void
    _M_add_equivalence_class(const _StringT& __s)
    {
      _M_add_character_class(
        _M_traits.transform_primary(__s.data(),
                                    __s.data() + __s.size()));
    }

    /// Adds the character class named by @p __s to the class mask.
    /// Throws regex_error(error_ctype) when the lookup yields 0.
    void
    _M_add_character_class(const _StringT& __s)
    {
      auto __st = _M_traits.
        lookup_classname(__s.data(), __s.data() + __s.size(), _M_is_icase());
      if (__st == 0)
        __throw_regex_error(regex_constants::error_ctype);
      _M_class_set |= __st;
    }

    /// Records the range [__l, __r]; both ends are translated and then
    /// turned into transformed strings.
    void
    _M_make_range(_CharT __l, _CharT __r)
    {
      _M_range_set.push_back(
        make_pair(_M_get_str(_M_translate(__l)),
                  _M_get_str(_M_translate(__r))));
    }

    /// Returns @p __c unchanged unless the collate flag is set, in which
    /// case it is translated (case-insensitively under icase).
    _CharT
    _M_translate(_CharT __c) const
    {
      if (_M_flags & regex_constants::collate)
        {
          if (_M_is_icase())
            return _M_traits.translate_nocase(__c);
          return _M_traits.translate(__c);
        }
      return __c;
    }

    /// True when the icase syntax flag is set.
    bool
    _M_is_icase() const
    { return _M_flags & regex_constants::icase; }

    /// Returns the traits transform() of the one-character string @p __c.
    _StringT
    _M_get_str(_CharT __c) const
    {
      _StringT __s(1, __c);
      return _M_traits.transform(__s.begin(), __s.end());
    }

    _TraitsT                              _M_traits;          // private copy
    _FlagT                                _M_flags;
    bool                                  _M_is_non_matching;
    std::vector<_CharT>                   _M_char_set;        // single chars
    std::vector<pair<_StringT, _StringT>> _M_range_set;       // sort-key ranges
    _CharClassT                           _M_class_set;       // class mask
  };
//@} regex-detail
_GLIBCXX_END_NAMESPACE_VERSION
} // namespace __detail

View File

@ -34,506 +34,15 @@ namespace __detail
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
// Scans the next token into _M_curToken (and _M_curValue for tokens that
// carry text).  At the end of the pattern the token is _S_token_eof.
// While inside a bracket or brace construct the work is delegated to
// _M_scan_in_bracket() / _M_scan_in_brace().
template<typename _BiIter>
  void
  _Scanner<_BiIter>::
  _M_advance()
  {
    if (_M_current == _M_end)
      {
        _M_curToken = _S_token_eof;
        return;
      }

    _CharT __c = *_M_current;
    if (_M_state & _S_state_in_bracket)
      {
        _M_scan_in_bracket();
        return;
      }
    if (_M_state & _S_state_in_brace)
      {
        _M_scan_in_brace();
        return;
      }
#if 0
    // TODO: re-enable line anchors when _M_assertion is implemented.
    // See PR libstdc++/47724
    else if (_M_state & _S_state_at_start && __c == _M_ctype.widen('^'))
      {
        _M_curToken = _S_token_line_begin;
        ++_M_current;
        return;
      }
    else if (__c == _M_ctype.widen('$'))
      {
        _M_curToken = _S_token_line_end;
        ++_M_current;
        return;
      }
#endif
    else if (__c == _M_ctype.widen('.'))
      {
        _M_curToken = _S_token_anychar;
        ++_M_current;
        return;
      }
    else if (__c == _M_ctype.widen('*'))
      {
        _M_curToken = _S_token_closure0;
        ++_M_current;
        return;
      }
    else if (__c == _M_ctype.widen('+'))
      {
        _M_curToken = _S_token_closure1;
        ++_M_current;
        return;
      }
    else if (__c == _M_ctype.widen('|'))
      {
        _M_curToken = _S_token_or;
        ++_M_current;
        return;
      }
    else if (__c == _M_ctype.widen('['))
      {
        ++_M_current;
        // The pattern may end right after '['; only peek for '^' when a
        // character is actually there.  In that case a plain
        // bracket-begin is reported and the next scan yields eof.
        if (_M_current != _M_end && *_M_current == _M_ctype.widen('^'))
          {
            _M_curToken = _S_token_bracket_inverse_begin;
            ++_M_current;
          }
        else
          _M_curToken = _S_token_bracket_begin;
        _M_state |= _S_state_in_bracket;
        return;
      }
    else if (__c == _M_ctype.widen('\\'))
      {
        _M_eat_escape();
        return;
      }
    else if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
      {
        // Outside basic/grep syntax these characters are special
        // without a preceding backslash.
        if (__c == _M_ctype.widen('('))
          {
            _M_curToken = _S_token_subexpr_begin;
            ++_M_current;
            return;
          }
        else if (__c == _M_ctype.widen(')'))
          {
            _M_curToken = _S_token_subexpr_end;
            ++_M_current;
            return;
          }
        else if (__c == _M_ctype.widen('{'))
          {
            _M_curToken = _S_token_interval_begin;
            _M_state |= _S_state_in_brace;
            ++_M_current;
            return;
          }
      }

    // Everything else is an ordinary character.
    _M_curToken = _S_token_ord_char;
    _M_curValue.assign(1, __c);
    ++_M_current;
  }
// Scans one token while inside an interval ("brace") expression:
// a run of digits, a comma, or the construct that closes the interval.
template<typename _BiIter>
  void
  _Scanner<_BiIter>::
  _M_scan_in_brace()
  {
    // A maximal run of digits forms one dup-count token.
    if (_M_ctype.is(_CtypeT::digit, *_M_current))
      {
        _M_curToken = _S_token_dup_count;
        _M_curValue.assign(1, *_M_current);
        for (++_M_current;
             _M_current != _M_end
               && _M_ctype.is(_CtypeT::digit, *_M_current);
             ++_M_current)
          _M_curValue += *_M_current;
        return;
      }

    if (*_M_current == _M_ctype.widen(','))
      {
        _M_curToken = _S_token_comma;
        ++_M_current;
        return;
      }

    // In basic/grep syntax only a backslash is acted upon here; the
    // escape handler takes over from there.
    if (_M_flags & (regex_constants::basic | regex_constants::grep))
      {
        if (*_M_current == _M_ctype.widen('\\'))
          _M_eat_escape();
        return;
      }

    // Otherwise a bare '}' ends the interval.
    if (*_M_current == _M_ctype.widen('}'))
      {
        _M_curToken = _S_token_interval_end;
        _M_state &= ~_S_state_in_brace;
        ++_M_current;
      }
  }
// Scans one token while inside a bracket expression.
template<typename _BiIter>
  void
  _Scanner<_BiIter>::
  _M_scan_in_bracket()
  {
    if (*_M_current == _M_ctype.widen('['))
      {
        ++_M_current;
        if (_M_current == _M_end)
          {
            _M_curToken = _S_token_eof;
            return;
          }

        // "[.", "[:" and "[=" open a named construct; the matching
        // helper consumes the rest of it.
        if (*_M_current == _M_ctype.widen('.'))
          {
            _M_curToken = _S_token_collsymbol;
            _M_eat_collsymbol();
            return;
          }
        if (*_M_current == _M_ctype.widen(':'))
          {
            _M_curToken = _S_token_char_class_name;
            _M_eat_charclass();
            return;
          }
        if (*_M_current == _M_ctype.widen('='))
          {
            _M_curToken = _S_token_equiv_class_name;
            _M_eat_equivclass();
            return;
          }
        // Anything else after '[' is handled as a single element below.
      }
    else
      {
        const _CharT __ch = *_M_current;
        if (__ch == _M_ctype.widen('-'))
          {
            _M_curToken = _S_token_dash;
            ++_M_current;
            return;
          }
        if (__ch == _M_ctype.widen(']'))
          {
            _M_curToken = _S_token_bracket_end;
            _M_state &= ~_S_state_in_bracket;
            ++_M_current;
            return;
          }
        if (__ch == _M_ctype.widen('\\'))
          {
            _M_eat_escape();
            return;
          }
      }

    // The current character is a single collating element.
    _M_curToken = _S_token_collelem_single;
    _M_curValue.assign(1, *_M_current);
    ++_M_current;
  }
// TODO Complete it.
//
// Consumes a backslash escape.  On entry _M_current is at the backslash.
// A backslash at the very end of the pattern yields _S_token_eof; an
// escape that is not recognised raises regex_error(error_escape).
template<typename _BiIter>
  void
  _Scanner<_BiIter>::
  _M_eat_escape()
  {
    ++_M_current;
    if (_M_current == _M_end)
      {
        _M_curToken = _S_token_eof;
        return;
      }
    _CharT __c = *_M_current;
    ++_M_current;

    // For ( ) { } the escaped form is the special one in basic/grep
    // syntax and the literal one in every other syntax.
    if (__c == _M_ctype.widen('('))
      {
        if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
          {
            _M_curToken = _S_token_ord_char;
            _M_curValue.assign(1, __c);
          }
        else
          _M_curToken = _S_token_subexpr_begin;
      }
    else if (__c == _M_ctype.widen(')'))
      {
        if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
          {
            _M_curToken = _S_token_ord_char;
            _M_curValue.assign(1, __c);
          }
        else
          _M_curToken = _S_token_subexpr_end;
      }
    else if (__c == _M_ctype.widen('{'))
      {
        if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
          {
            _M_curToken = _S_token_ord_char;
            _M_curValue.assign(1, __c);
          }
        else
          {
            _M_curToken = _S_token_interval_begin;
            _M_state |= _S_state_in_brace;
          }
      }
    else if (__c == _M_ctype.widen('}'))
      {
        if (!(_M_flags & (regex_constants::basic | regex_constants::grep)))
          {
            _M_curToken = _S_token_ord_char;
            _M_curValue.assign(1, __c);
          }
        else
          {
            // Bit test, not a logical AND: the closing brace is only
            // valid while the in-brace bit is set.
            if (!(_M_state & _S_state_in_brace))
              __throw_regex_error(regex_constants::error_badbrace);
            _M_state &= ~_S_state_in_brace;
            _M_curToken = _S_token_interval_end;
          }
      }
    else if (__c == _M_ctype.widen('x'))
      {
        // NOTE(review): 'x' was already consumed above, so the increment
        // below steps over one more character, and nothing in this
        // branch assigns _M_curToken unless the end is reached.  This
        // looks unfinished (see the TODO); confirm the intended
        // behaviour.  The increment is at least kept inside the range.
        if (_M_current == _M_end)
          {
            _M_curToken = _S_token_eof;
            return;
          }
        ++_M_current;
        if (_M_current == _M_end)
          {
            _M_curToken = _S_token_eof;
            return;
          }
        if (_M_ctype.is(_CtypeT::digit, *_M_current))
          {
            _M_curValue.assign(1, *_M_current);
            ++_M_current;
            if (_M_current == _M_end)
              {
                _M_curToken = _S_token_eof;
                return;
              }
            if (_M_ctype.is(_CtypeT::digit, *_M_current))
              {
                _M_curValue += *_M_current;
                ++_M_current;
                return;
              }
          }
      }
    else if (__c == _M_ctype.widen('^')
             || __c == _M_ctype.widen('.')
             || __c == _M_ctype.widen('*')
             || __c == _M_ctype.widen('$')
             || __c == _M_ctype.widen('\\'))
      {
        // Escaped metacharacter: taken literally.
        _M_curToken = _S_token_ord_char;
        _M_curValue.assign(1, __c);
      }
    else if (_M_ctype.is(_CtypeT::digit, __c))
      {
        _M_curToken = _S_token_backref;
        _M_curValue.assign(1, __c);
      }
    else if (_M_state & _S_state_in_bracket)
      {
        if (__c == _M_ctype.widen('-')
            || __c == _M_ctype.widen('[')
            || __c == _M_ctype.widen(']'))
          {
            _M_curToken = _S_token_ord_char;
            _M_curValue.assign(1, __c);
          }
        else if ((_M_flags & regex_constants::ECMAScript)
                 && __c == _M_ctype.widen('b'))
          {
            _M_curToken = _S_token_ord_char;
            _M_curValue.assign(1, _M_ctype.widen(' '));
          }
        else
          __throw_regex_error(regex_constants::error_escape);
      }
    else
      __throw_regex_error(regex_constants::error_escape);
  }
// Eats a character class or throws an exception.
// current points to ':' delimiter on entry, char after ']' on return.
// The class name is left in _M_curValue.  Any malformed or truncated
// input raises regex_error(error_ctype).
template<typename _BiIter>
  void
  _Scanner<_BiIter>::
  _M_eat_charclass()
  {
    ++_M_current; // skip ':'
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_ctype);
    for (_M_curValue.clear();
         _M_current != _M_end && *_M_current != _M_ctype.widen(':');
         ++_M_current)
      _M_curValue += *_M_current;
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_ctype);
    ++_M_current; // skip ':'
    // The pattern may stop right after the closing ':'; check for the end
    // before looking at the character.
    if (_M_current == _M_end || *_M_current != _M_ctype.widen(']'))
      __throw_regex_error(regex_constants::error_ctype);
    ++_M_current; // skip ']'
  }
// Eats an equivalence class or throws an exception.
// current points to the opening '=' on entry, char after ']' on return.
// The class name is left in _M_curValue.  Any malformed or truncated
// input raises regex_error(error_collate).
template<typename _BiIter>
  void
  _Scanner<_BiIter>::
  _M_eat_equivclass()
  {
    ++_M_current; // skip '='
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_collate);
    for (_M_curValue.clear();
         _M_current != _M_end && *_M_current != _M_ctype.widen('=');
         ++_M_current)
      _M_curValue += *_M_current;
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_collate);
    ++_M_current; // skip '='
    // The pattern may stop right after the closing '='; check for the end
    // before looking at the character.
    if (_M_current == _M_end || *_M_current != _M_ctype.widen(']'))
      __throw_regex_error(regex_constants::error_collate);
    ++_M_current; // skip ']'
  }
// Eats a collating symbol or throws an exception.
// current points to the opening '.' on entry, char after ']' on return.
// The symbol name is left in _M_curValue.  Any malformed or truncated
// input raises regex_error(error_collate).
template<typename _BiIter>
  void
  _Scanner<_BiIter>::
  _M_eat_collsymbol()
  {
    ++_M_current; // skip '.'
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_collate);
    for (_M_curValue.clear();
         _M_current != _M_end && *_M_current != _M_ctype.widen('.');
         ++_M_current)
      _M_curValue += *_M_current;
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_collate);
    ++_M_current; // skip '.'
    // The pattern may stop right after the closing '.'; check for the end
    // before looking at the character.
    if (_M_current == _M_end || *_M_current != _M_ctype.widen(']'))
      __throw_regex_error(regex_constants::error_collate);
    ++_M_current; // skip ']'
  }
#ifdef _GLIBCXX_DEBUG
// Debug aid: writes a one-line description of the current token to the
// given stream (with the token's text for tokens that carry one) and
// returns the stream.
template<typename _BiIter>
  std::ostream&
  _Scanner<_BiIter>::
  _M_print(std::ostream& ostr)
  {
    switch (_M_curToken)
      {
      case _S_token_anychar:
        return ostr << "any-character\n";
      case _S_token_backref:
        return ostr << "backref\n";
      case _S_token_bracket_begin:
        return ostr << "bracket-begin\n";
      case _S_token_bracket_inverse_begin:
        return ostr << "bracket-inverse-begin\n";
      case _S_token_bracket_end:
        return ostr << "bracket-end\n";
      case _S_token_char_class_name:
        return ostr << "char-class-name \"" << _M_curValue << "\"\n";
      case _S_token_closure0:
        return ostr << "closure0\n";
      case _S_token_closure1:
        return ostr << "closure1\n";
      case _S_token_collelem_multi:
        return ostr << "coll-elem-multi \"" << _M_curValue << "\"\n";
      case _S_token_collelem_single:
        return ostr << "coll-elem-single \"" << _M_curValue << "\"\n";
      case _S_token_collsymbol:
        return ostr << "collsymbol \"" << _M_curValue << "\"\n";
      case _S_token_comma:
        return ostr << "comma\n";
      case _S_token_dash:
        return ostr << "dash\n";
      case _S_token_dup_count:
        return ostr << "dup count: " << _M_curValue << "\n";
      case _S_token_eof:
        return ostr << "EOF\n";
      case _S_token_equiv_class_name:
        return ostr << "equiv-class-name \"" << _M_curValue << "\"\n";
      case _S_token_interval_begin:
        return ostr << "interval begin\n";
      case _S_token_interval_end:
        return ostr << "interval end\n";
      case _S_token_line_begin:
        return ostr << "line begin\n";
      case _S_token_line_end:
        return ostr << "line end\n";
      case _S_token_opt:
        return ostr << "opt\n";
      case _S_token_or:
        return ostr << "or\n";
      case _S_token_ord_char:
        return ostr << "ordinary character: \"" << _M_value() << "\"\n";
      case _S_token_subexpr_begin:
        return ostr << "subexpr begin\n";
      case _S_token_subexpr_end:
        return ostr << "subexpr end\n";
      case _S_token_word_begin:
        return ostr << "word begin\n";
      case _S_token_word_end:
        return ostr << "word end\n";
      case _S_token_unknown:
        return ostr << "-- unknown token --\n";
      default:
        // Not a known token value.
        _GLIBCXX_DEBUG_ASSERT(false);
      }
    return ostr;
  }
#endif
template<typename _InputIter, typename _CharT, typename _TraitsT>
_Compiler<_InputIter, _CharT, _TraitsT>::
_Compiler(_InputIter __b, _InputIter __e,
template<typename _FwdIter, typename _CharT, typename _TraitsT>
_Compiler<_FwdIter, _CharT, _TraitsT>::
_Compiler(_FwdIter __b, _FwdIter __e,
const _TraitsT& __traits, _FlagT __flags)
: _M_traits(__traits), _M_scanner(__b, __e, __flags, _M_traits.getloc()),
_M_state_store(__flags), _M_flags(__flags)
{
_StateSeqT __r(_M_state_store,
_M_state_store._M_insert_subexpr_begin());
_M_state_store._M_insert_subexpr_begin());
_M_disjunction();
if (!_M_stack.empty())
{
@ -544,23 +53,23 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__r._M_append(_M_state_store._M_insert_accept());
}
template<typename _InputIter, typename _CharT, typename _TraitsT>
template<typename _FwdIter, typename _CharT, typename _TraitsT>
bool
_Compiler<_InputIter, _CharT, _TraitsT>::
_M_match_token(_Compiler<_InputIter, _CharT, _TraitsT>::_TokenT token)
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_match_token(_TokenT token)
{
if (token == _M_scanner._M_token())
if (token == _M_scanner._M_get_token())
{
_M_cur_value = _M_scanner._M_value();
_M_value = _M_scanner._M_get_value();
_M_scanner._M_advance();
return true;
}
return false;
}
template<typename _InputIter, typename _CharT, typename _TraitsT>
template<typename _FwdIter, typename _CharT, typename _TraitsT>
void
_Compiler<_InputIter, _CharT, _TraitsT>::
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_disjunction()
{
this->_M_alternative();
@ -573,9 +82,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
}
}
template<typename _InputIter, typename _CharT, typename _TraitsT>
template<typename _FwdIter, typename _CharT, typename _TraitsT>
void
_Compiler<_InputIter, _CharT, _TraitsT>::
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_alternative()
{
if (this->_M_term())
@ -591,9 +100,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
}
}
template<typename _InputIter, typename _CharT, typename _TraitsT>
template<typename _FwdIter, typename _CharT, typename _TraitsT>
bool
_Compiler<_InputIter, _CharT, _TraitsT>::
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_term()
{
if (this->_M_assertion())
@ -606,37 +115,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return false;
}
template<typename _InputIter, typename _CharT, typename _TraitsT>
// TODO Implement it.
template<typename _FwdIter, typename _CharT, typename _TraitsT>
bool
_Compiler<_InputIter, _CharT, _TraitsT>::
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_assertion()
{
if (_M_match_token(_ScannerT::_S_token_line_begin))
{
// __m.push(_Matcher::_S_opcode_line_begin);
return true;
}
if (_M_match_token(_ScannerT::_S_token_line_end))
{
// __m.push(_Matcher::_S_opcode_line_end);
return true;
}
if (_M_match_token(_ScannerT::_S_token_word_begin))
{
// __m.push(_Matcher::_S_opcode_word_begin);
return true;
}
if (_M_match_token(_ScannerT::_S_token_word_end))
{
// __m.push(_Matcher::_S_opcode_word_end);
return true;
}
return false;
}
template<typename _InputIter, typename _CharT, typename _TraitsT>
template<typename _FwdIter, typename _CharT, typename _TraitsT>
void
_Compiler<_InputIter, _CharT, _TraitsT>::
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_quantifier()
{
if (_M_match_token(_ScannerT::_S_token_closure0))
@ -707,15 +197,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
}
}
template<typename _InputIter, typename _CharT, typename _TraitsT>
template<typename _FwdIter, typename _CharT, typename _TraitsT>
bool
_Compiler<_InputIter, _CharT, _TraitsT>::
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_atom()
{
if (_M_match_token(_ScannerT::_S_token_anychar))
{
const static auto&
__any_matcher = [](_CharT) -> bool
__any_matcher = [](_CharT __ch) -> bool
{ return true; };
_M_stack.push(_StateSeqT(_M_state_store,
@ -723,9 +213,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
(__any_matcher)));
return true;
}
if (_M_match_token(_ScannerT::_S_token_ord_char))
if (_M_try_char())
{
auto __c = _M_cur_value[0];
_CharT __c = _M_value[0];
__detail::_Matcher<_CharT> f;
if (_M_flags & regex_constants::icase)
{
@ -744,7 +234,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
}
if (_M_match_token(_ScannerT::_S_token_backref))
{
// __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value);
_M_stack.push(_StateSeqT(_M_state_store, _M_state_store.
_M_insert_backref(_M_cur_int_value(10))));
return true;
@ -770,90 +259,111 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return _M_bracket_expression();
}
template<typename _InputIter, typename _CharT, typename _TraitsT>
template<typename _FwdIter, typename _CharT, typename _TraitsT>
bool
_Compiler<_InputIter, _CharT, _TraitsT>::
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_bracket_expression()
{
bool __inverse =
_M_match_token(_ScannerT::_S_token_bracket_inverse_begin);
if (!(__inverse || _M_match_token(_ScannerT::_S_token_bracket_begin)))
bool __neg =
_M_match_token(_ScannerT::_S_token_bracket_neg_begin);
if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin)))
return false;
_BMatcherT __matcher( __inverse, _M_traits, _M_flags);
// special case: only if _not_ chr first after
// '[' or '[^' or if ECMAscript
if (!_M_bracket_list(__matcher) // list is empty
&& !(_M_flags & regex_constants::ECMAScript))
__throw_regex_error(regex_constants::error_brack);
_BMatcherT __matcher(__neg, _M_traits, _M_flags);
_M_bracket_list(__matcher);
_M_stack.push(_StateSeqT(_M_state_store,
_M_state_store._M_insert_matcher(__matcher)));
return true;
}
template<typename _InputIter, typename _CharT, typename _TraitsT>
bool // list is non-empty
_Compiler<_InputIter, _CharT, _TraitsT>::
template<typename _FwdIter, typename _CharT, typename _TraitsT>
void
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_bracket_list(_BMatcherT& __matcher)
{
if (_M_match_token(_ScannerT::_S_token_bracket_end))
return false;
return;
_M_expression_term(__matcher);
_M_bracket_list(__matcher);
return true;
return;
}
template<typename _InputIter, typename _CharT, typename _TraitsT>
template<typename _FwdIter, typename _CharT, typename _TraitsT>
void
_Compiler<_InputIter, _CharT, _TraitsT>::
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_expression_term(_BMatcherT& __matcher)
{
if (_M_match_token(_ScannerT::_S_token_collsymbol))
{
__matcher._M_add_collating_element(_M_cur_value);
__matcher._M_add_collating_element(_M_value);
return;
}
if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
{
__matcher._M_add_equivalence_class(_M_cur_value);
__matcher._M_add_equivalence_class(_M_value);
return;
}
if (_M_match_token(_ScannerT::_S_token_char_class_name))
{
__matcher._M_add_character_class(_M_cur_value);
__matcher._M_add_character_class(_M_value);
return;
}
if (_M_match_token(_ScannerT::_S_token_collelem_single)) // [a
if (_M_try_char()) // [a
{
auto __ch = _M_cur_value[0];
if (_M_match_token(_ScannerT::_S_token_dash)) // [a-
auto __ch = _M_value[0];
if (_M_try_char())
{
// If the dash is the last character in the bracket expression,
// it is not special.
if (_M_scanner._M_token() == _ScannerT::_S_token_bracket_end)
__matcher._M_add_char(_M_cur_value[0]); // [a-] <=> [a\-]
else // [a-z]
if (_M_value[0] == std::use_facet<std::ctype<_CharT>>
(_M_traits.getloc()).widen('-')) // [a-
{
if (!_M_match_token(_ScannerT::_S_token_collelem_single))
if (_M_try_char()) // [a-z]
{
__matcher._M_make_range(__ch, _M_value[0]);
return;
}
// If the dash is the last character in the bracket
// expression, it is not special.
if (_M_scanner._M_get_token()
!= _ScannerT::_S_token_bracket_end)
__throw_regex_error(regex_constants::error_range);
__matcher._M_make_range(__ch, _M_cur_value[0]);
}
__matcher._M_add_char(_M_value[0]);
}
else // [a]
__matcher._M_add_char(__ch);
__matcher._M_add_char(__ch);
return;
}
__throw_regex_error(regex_constants::error_brack);
}
template<typename _InputIter, typename _CharT, typename _TraitsT>
// Tries to consume one token that denotes a single character: an octal
// number, a hexadecimal number or an ordinary character.  On success the
// character is left in _M_value[0] and true is returned; otherwise
// nothing is consumed and false is returned.
template<typename _FwdIter, typename _CharT, typename _TraitsT>
  bool
  _Compiler<_FwdIter, _CharT, _TraitsT>::
  _M_try_char()
  {
    if (_M_match_token(_ScannerT::_S_token_oct_num))
      {
        // _M_value holds the digits; replace them by the character
        // they encode.
        _M_value.assign(1, _M_cur_int_value(8));
        return true;
      }
    if (_M_match_token(_ScannerT::_S_token_hex_num))
      {
        _M_value.assign(1, _M_cur_int_value(16));
        return true;
      }
    // An ordinary character is already stored as-is by _M_match_token.
    return _M_match_token(_ScannerT::_S_token_ord_char);
  }
template<typename _FwdIter, typename _CharT, typename _TraitsT>
int
_Compiler<_InputIter, _CharT, _TraitsT>::
_Compiler<_FwdIter, _CharT, _TraitsT>::
_M_cur_int_value(int __radix)
{
int __v = 0;
for (typename _StringT::size_type __i = 0;
__i < _M_cur_value.length(); ++__i)
__v =__v * __radix + _M_traits.value(_M_cur_value[__i], __radix);
__i < _M_value.length(); ++__i)
__v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
return __v;
}
@ -861,35 +371,34 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
bool _BracketMatcher<_CharT, _TraitsT>::
operator()(_CharT __ch) const
{
auto __oldch = __ch;
if (_M_flags & regex_constants::collate)
if (_M_is_icase())
__ch = _M_traits.translate_nocase(__ch);
else
__ch = _M_traits.translate(__ch);
bool __ret = false;
for (auto __c : _M_char_set)
if (__c == __ch)
{
__ret = true;
break;
}
if (!__ret && _M_traits.isctype(__oldch, _M_class_set))
if (_M_traits.isctype(__ch, _M_class_set))
__ret = true;
else
{
_StringT __s = _M_get_str(__ch);
for (auto& __it : _M_range_set)
if (__it.first <= __s && __s <= __it.second)
__ch = _M_translate(__ch);
for (auto __c : _M_char_set)
if (__c == __ch)
{
__ret = true;
break;
}
if (!__ret)
{
_StringT __s = _M_get_str(__ch);
for (auto& __it : _M_range_set)
if (__it.first <= __s && __s <= __it.second)
{
__ret = true;
break;
}
}
}
if (_M_is_non_matching)
__ret = !__ret;
return __ret;
return !__ret;
else
return __ret;
}
_GLIBCXX_END_NAMESPACE_VERSION

View File

@ -260,7 +260,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
auto __size = __u.size();
for (auto __i = 0; __i < __size; __i++)
{
auto& __uit = __u[__i], __vit = __v[__i];
auto __uit = __u[__i], __vit = __v[__i];
if (__uit.matched && !__vit.matched)
return true;
if (!__uit.matched && __vit.matched)

View File

@ -0,0 +1,194 @@
// class template regex -*- C++ -*-
// Copyright (C) 2013 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
/**
* @file bits/regex_scanner.h
* This is an internal header file, included by other library headers.
* Do not attempt to use it directly. @headername{regex}
*/
namespace std _GLIBCXX_VISIBILITY(default)
{
namespace __detail
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
/**
* @addtogroup regex-detail
* @{
*/
/**
 * @brief struct _Scanner. Scans an input range for regex tokens.
 *
 * The %_Scanner class interprets the regular expression pattern in
 * the input range passed to its constructor as a sequence of parse
 * tokens passed to the regular expression compiler.  The sequence
 * of tokens provided depends on the flag settings passed to the
 * constructor: different regular expression grammars will interpret
 * the same input pattern in syntactically different ways.
 *
 * The constructor scans the first token.  _M_advance() moves on to
 * the next one, while _M_get_token() and _M_get_value() report the
 * kind and the text of the token most recently scanned.
 */
template<typename _FwdIter>
  class _Scanner
  {
  public:
    typedef typename std::iterator_traits<_FwdIter>::value_type _CharT;
    typedef std::basic_string<_CharT>                           _StringT;
    typedef regex_constants::syntax_option_type                 _FlagT;
    typedef const std::ctype<_CharT>                            _CtypeT;

    /// Token types returned from the scanner.
    enum _TokenT
    {
      _S_token_anychar,
      _S_token_ord_char,
      _S_token_oct_num,
      _S_token_hex_num,
      _S_token_backref,
      _S_token_subexpr_begin,
      _S_token_subexpr_no_group_begin,
      _S_token_subexpr_lookahead_begin,
      _S_token_subexpr_neg_lookahead_begin,
      _S_token_subexpr_end,
      _S_token_bracket_begin,
      _S_token_bracket_neg_begin,
      _S_token_bracket_end,
      _S_token_interval_begin,
      _S_token_interval_end,
      _S_token_quoted_class,
      _S_token_char_class_name,
      _S_token_collsymbol,
      _S_token_equiv_class_name,
      _S_token_opt,
      _S_token_or,
      _S_token_closure0,
      _S_token_closure1,
      _S_token_line_begin,
      _S_token_line_end,
      _S_token_comma,
      _S_token_dup_count,
      _S_token_eof,
      _S_token_unknown
    };

    _Scanner(_FwdIter __begin, _FwdIter __end,
             _FlagT __flags, std::locale __loc);

    /// Scans the next token from the input range.
    void
    _M_advance();

    /// Kind of the token most recently scanned.
    _TokenT
    _M_get_token() const
    { return _M_token; }

    /// Text attached to the token most recently scanned.
    const _StringT&
    _M_get_value() const
    { return _M_value; }

#ifdef _GLIBCXX_DEBUG
    std::ostream&
    _M_print(std::ostream&);
#endif

  private:
    /// Lexical context that selects the scanning routine.
    enum _StateT
    {
      _S_state_normal,
      _S_state_in_brace,
      _S_state_in_bracket
    };

    void
    _M_scan_normal();

    void
    _M_scan_in_bracket();

    void
    _M_scan_in_brace();

    void
    _M_eat_escape_ecma();

    void
    _M_eat_escape_posix();

    void
    _M_eat_escape_awk();

    void
    _M_eat_class(char);

    // Grammar predicates.  They only read _M_flags, which is a run-time
    // value, so they are ordinary const members rather than constexpr.
    bool
    _M_is_ecma() const
    { return _M_flags & regex_constants::ECMAScript; }

    bool
    _M_is_basic() const
    { return _M_flags & (regex_constants::basic | regex_constants::grep); }

    bool
    _M_is_extended() const
    {
      return _M_flags & (regex_constants::extended
                         | regex_constants::egrep
                         | regex_constants::awk);
    }

    bool
    _M_is_grep() const
    { return _M_flags & (regex_constants::grep | regex_constants::egrep); }

    bool
    _M_is_awk() const
    { return _M_flags & regex_constants::awk; }

    // NOTE: members are initialized in the order they are declared here.
    // _M_flags must precede anything initialized through the predicates
    // above.
    _StateT   _M_state;
    _FwdIter  _M_current;
    _FwdIter  _M_end;
    _FlagT    _M_flags;
    _CtypeT&  _M_ctype;
    _TokenT   _M_token;
    _StringT  _M_value;
    bool      _M_at_bracket_start;

  public:
    // TODO: make them static when this file is stable.
    const std::map<char, _TokenT> _M_token_map;
    const std::map<char, char>    _M_ecma_escape_map;
    const std::map<char, char>    _M_awk_escape_map;
    const std::set<char>          _M_ecma_spec_char;
    const std::set<char>          _M_basic_spec_char;
    const std::set<char>          _M_extended_spec_char;

    // References to the tables selected for the active grammar; they must
    // stay declared after the containers they refer to.
    const std::map<char, char>&   _M_escape_map;
    const std::set<char>&         _M_spec_char;
    void (_Scanner::* _M_eat_escape)();
  };
//@} regex-detail
_GLIBCXX_END_NAMESPACE_VERSION
} // namespace __detail
} // namespace std
#include <bits/regex_scanner.tcc>

View File

@ -0,0 +1,609 @@
// class template regex -*- C++ -*-
// Copyright (C) 2013 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
/**
* @file bits/regex_scanner.tcc
* This is an internal header file, included by other library headers.
* Do not attempt to use it directly. @headername{regex}
*/
// TODO make comments doxygen format
// N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
// and awk
// 1) grep is basic except '\n' is treated as '|'
// 2) egrep is extended except '\n' is treated as '|'
// 3) awk is extended except special escaping rules, and there's no
// back-reference.
//
// References:
//
// ECMAScript: ECMA-262 15.10
//
// basic, extended:
// http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
//
// awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
namespace std _GLIBCXX_VISIBILITY(default)
{
namespace __detail
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
// Builds a scanner over [__begin, __end) for the grammar selected by
// __flags, using the ctype facet of __loc for digit classification, and
// scans the first token.
//
// The mem-initializers are listed in the order the members are declared in
// the class, which is the order in which they are actually initialized.
// In particular _M_flags is set before the _M_is_*() predicates are
// consulted, and the lookup tables are built before the reference members
// (_M_escape_map, _M_spec_char) are bound to them.
template<typename _FwdIter>
  _Scanner<_FwdIter>::
  _Scanner(_FwdIter __begin, _FwdIter __end,
           _FlagT __flags, std::locale __loc)
  : _M_state(_S_state_normal),
    _M_current(__begin), _M_end(__end), _M_flags(__flags),
    _M_ctype(std::use_facet<_CtypeT>(__loc)),
    _M_at_bracket_start(false),
    _M_token_map
      {
        {'^', _S_token_line_begin},
        {'$', _S_token_line_end},
        {'.', _S_token_anychar},
        {'*', _S_token_closure0},
        {'+', _S_token_closure1},
        {'?', _S_token_opt},
        {'|', _S_token_or},
        // grep and egrep
        {'\n', _S_token_or},
      },
    _M_ecma_escape_map
      {
        {'0', '\0'},
        {'b', '\b'},
        {'f', '\f'},
        {'n', '\n'},
        {'r', '\r'},
        {'t', '\t'},
        {'v', '\v'},
      },
    _M_awk_escape_map
      {
        {'"', '"'},
        {'/', '/'},
        {'\\', '\\'},
        {'a', '\a'},
        {'b', '\b'},
        {'f', '\f'},
        {'n', '\n'},
        {'r', '\r'},
        {'t', '\t'},
        {'v', '\v'},
      },
    _M_ecma_spec_char
      {
        '^',
        '$',
        '\\',
        '.',
        '*',
        '+',
        '?',
        '(',
        ')',
        '[',
        ']',
        '{',
        '}',
        '|',
      },
    _M_basic_spec_char
      {
        '.',
        '[',
        '\\',
        '*',
        '^',
        '$',
      },
    _M_extended_spec_char
      {
        '.',
        '[',
        '\\',
        '(',
        ')',
        '*',
        '+',
        '?',
        '{',
        '|',
        '^',
        '$',
      },
    // Every non-ECMAScript grammar shares the awk escape table; it is only
    // consulted from _M_eat_escape_awk().
    _M_escape_map(_M_is_ecma()
                  ? _M_ecma_escape_map
                  : _M_awk_escape_map),
    _M_spec_char(_M_is_ecma()
                 ? _M_ecma_spec_char
                 : _M_is_basic()
                 ? _M_basic_spec_char
                 : _M_extended_spec_char),
    _M_eat_escape(_M_is_ecma()
                  ? &_Scanner::_M_eat_escape_ecma
                  : &_Scanner::_M_eat_escape_posix)
  { _M_advance(); }
// Scans the next token.  At the end of the input the token becomes
// _S_token_eof; otherwise the routine matching the current lexical state
// does the work.
template<typename _FwdIter>
  void
  _Scanner<_FwdIter>::
  _M_advance()
  {
    if (_M_current == _M_end)
      {
        _M_token = _S_token_eof;
        return;
      }

    switch (_M_state)
      {
      case _S_state_normal:
        _M_scan_normal();
        break;
      case _S_state_in_bracket:
        _M_scan_in_bracket();
        break;
      case _S_state_in_brace:
        _M_scan_in_brace();
        break;
      default:
        // No other state exists.
        _GLIBCXX_DEBUG_ASSERT(false);
        break;
      }
  }
// Differences between styles:
// 1) "\(", "\)", "\{" in basic. It's not escaping.
// 2) "(?:", "(?=", "(?!" in ECMAScript.
// Scans one token outside of bracket expressions and interval braces.
// The caller (_M_advance) guarantees that at least one character is left.
// Throws regex_error (error_escape, error_paren) on malformed input.
template<typename _FwdIter>
  void
  _Scanner<_FwdIter>::
  _M_scan_normal()
  {
    auto __c = *_M_current++;

    if (__c == '\\')
      {
        if (_M_current == _M_end)
          __throw_regex_error(regex_constants::error_escape);

        // In basic style "\(", "\)" and "\{" are operators, not escapes:
        // drop the backslash and treat the next character as the operator.
        if (!_M_is_basic()
            || (*_M_current != '('
                && *_M_current != ')'
                && *_M_current != '{'))
          {
            (this->*_M_eat_escape)();
            return;
          }
        __c = *_M_current++;
      }

    if (__c == '(')
      {
        // Check for the end of input before peeking: a pattern that ends
        // with '(' must not dereference the past-the-end iterator.
        if (_M_is_ecma() && _M_current != _M_end && *_M_current == '?')
          {
            if (++_M_current == _M_end)
              __throw_regex_error(regex_constants::error_paren);

            if (*_M_current == ':')
              {
                ++_M_current;
                _M_token = _S_token_subexpr_no_group_begin;
              }
            else if (*_M_current == '=')
              {
                ++_M_current;
                _M_token = _S_token_subexpr_lookahead_begin;
              }
            else if (*_M_current == '!')
              {
                ++_M_current;
                _M_token = _S_token_subexpr_neg_lookahead_begin;
              }
            else
              __throw_regex_error(regex_constants::error_paren);
          }
        else
          _M_token = _S_token_subexpr_begin;
      }
    else if (__c == ')')
      _M_token = _S_token_subexpr_end;
    else if (__c == '[')
      {
        _M_state = _S_state_in_bracket;
        _M_at_bracket_start = true;
        if (_M_current != _M_end && *_M_current == '^')
          {
            _M_token = _S_token_bracket_neg_begin;
            ++_M_current;
          }
        else
          _M_token = _S_token_bracket_begin;
      }
    else if (__c == '{')
      {
        _M_state = _S_state_in_brace;
        _M_token = _S_token_interval_begin;
      }
    // Remaining special characters map directly to a token.  ']' and '}'
    // are ordinary here, and '\n' is an alternation in grep and egrep.
    else if ((_M_spec_char.count(__c)
              && __c != ']'
              && __c != '}')
             || (_M_is_grep() && __c == '\n'))
      _M_token = _M_token_map.at(__c);
    else
      {
        _M_token = _S_token_ord_char;
        _M_value.assign(1, __c);
      }
  }
// Differences between styles:
// 1) different semantics of "[]" and "[^]".
// 2) Escaping in bracket expr.
// Scans one token while inside a bracket expression.  Throws regex_error
// (error_brack) when the input ends before the expression is closed.
template<typename _FwdIter>
  void
  _Scanner<_FwdIter>::
  _M_scan_in_bracket()
  {
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_brack);

    const auto __c = *_M_current++;

    if (__c == '[')
      {
        if (_M_current == _M_end)
          __throw_regex_error(regex_constants::error_brack);

        // "[.", "[:" and "[=" open a collating symbol, a character class
        // name and an equivalence class respectively.
        const auto __delim = *_M_current;
        if (__delim == '.' || __delim == ':' || __delim == '=')
          {
            _M_token = (__delim == '.' ? _S_token_collsymbol
                        : __delim == ':' ? _S_token_char_class_name
                        : _S_token_equiv_class_name);
            ++_M_current;
            _M_eat_class(__delim);
          }
        else
          {
            // A lone '[' inside brackets is an ordinary character.
            _M_token = _S_token_ord_char;
            _M_value.assign(1, __c);
          }
      }
    // In POSIX a ']' that directly follows "[" or "[^" is taken literally,
    // so "[]]" and "[^]]" are valid.  See the testcases
    // `*/empty_range.cc`.
    else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
      {
        _M_state = _S_state_normal;
        _M_token = _S_token_bracket_end;
      }
    // ECMAScript and awk permit escaping inside a bracket expression.
    else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
      (this->*_M_eat_escape)();
    else
      {
        _M_token = _S_token_ord_char;
        _M_value.assign(1, __c);
      }

    _M_at_bracket_start = false;
  }
// Differences between styles:
// 1) "\}" in basic style.
// Scans one token while inside an interval expression: a repeat count,
// a comma, or the closing delimiter.  Anything else throws regex_error
// (error_brace).
template<typename _FwdIter>
  void
  _Scanner<_FwdIter>::
  _M_scan_in_brace()
  {
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_brace);

    const auto __c = *_M_current++;

    if (_M_ctype.is(_CtypeT::digit, __c))
      {
        // Collect the whole run of digits as one repeat count.
        _M_token = _S_token_dup_count;
        _M_value.assign(1, __c);
        while (_M_current != _M_end
               && _M_ctype.is(_CtypeT::digit, *_M_current))
          _M_value += *_M_current++;
        return;
      }

    if (__c == ',')
      {
        _M_token = _S_token_comma;
        return;
      }

    // The interval is closed by "\}" in basic style and by "}" otherwise.
    const bool __basic = _M_is_basic();
    const bool __closes = __basic
      ? (__c == '\\' && _M_current != _M_end && *_M_current == '}')
      : (__c == '}');

    if (!__closes)
      __throw_regex_error(regex_constants::error_brace);

    if (__basic)
      ++_M_current; // Skip the '}' that follows the backslash.
    _M_state = _S_state_normal;
    _M_token = _S_token_interval_end;
  }
// Scans what follows a backslash in ECMAScript style; _M_current points
// just past the backslash on entry.  Throws regex_error (error_escape) on
// a truncated or malformed escape.
template<typename _FwdIter>
  void
  _Scanner<_FwdIter>::
  _M_eat_escape_ecma()
  {
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_escape);

    auto __c = *_M_current++;

    // Simple character escapes.  "\b" is only a character escape inside a
    // bracket expression; elsewhere it falls through to the quoted classes.
    if (_M_escape_map.count(__c)
        && (__c != 'b' || _M_state == _S_state_in_bracket))
      {
        _M_token = _S_token_ord_char;
        _M_value.assign(1, _M_escape_map.at(__c));
      }
    // N3376 28.13
    else if (__c == 'b'
             || __c == 'B'
             || __c == 'd'
             || __c == 'D'
             || __c == 's'
             || __c == 'S'
             || __c == 'w'
             || __c == 'W')
      {
        _M_token = _S_token_quoted_class;
        _M_value.assign(1, __c);
      }
    else if (__c == 'c')
      {
        if (_M_current == _M_end)
          __throw_regex_error(regex_constants::error_escape);
        _M_token = _S_token_ord_char;
        _M_value.assign(1, *_M_current++);
      }
    else if (__c == 'x' || __c == 'u')
      {
        // "\x" takes exactly two hex digits, "\u" exactly four.
        const int __ndigits = (__c == 'x' ? 2 : 4);
        _M_value.erase();
        for (int __i = 0; __i < __ndigits; ++__i)
          {
            if (_M_current == _M_end
                || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
              __throw_regex_error(regex_constants::error_escape);
            _M_value += *_M_current++;
          }
        _M_token = _S_token_hex_num;
      }
    // ECMAScript recognizes multi-digit back-references.
    else if (_M_ctype.is(_CtypeT::digit, __c))
      {
        _M_value.assign(1, __c);
        while (_M_current != _M_end
               && _M_ctype.is(_CtypeT::digit, *_M_current))
          _M_value += *_M_current++;
        _M_token = _S_token_backref;
      }
    else
      {
        // Any other escaped character stands for itself.
        _M_token = _S_token_ord_char;
        _M_value.assign(1, __c);
      }
  }
// Scans what follows a backslash in the POSIX styles; _M_current points
// just past the backslash on entry.  Throws regex_error (error_escape) on
// a truncated or unknown escape.
template<typename _FwdIter>
  void
  _Scanner<_FwdIter>::
  _M_eat_escape_posix()
  {
    if (_M_current == _M_end)
      __throw_regex_error(regex_constants::error_escape);

    // Peek only: the awk handler consumes the character itself.
    const auto __c = *_M_current;
    const bool __is_special = _M_spec_char.count(__c) != 0;

    // We MUST hand over to awk before handling back-references, because
    // awk has none.
    if (!__is_special && _M_is_awk())
      {
        _M_eat_escape_awk();
        return;
      }

    if (__is_special)
      _M_token = _S_token_ord_char;
    else if (_M_ctype.is(_CtypeT::digit, __c) && __c != '0')
      _M_token = _S_token_backref;
    else
      __throw_regex_error(regex_constants::error_escape);

    _M_value.assign(1, __c);
    ++_M_current;
  }
// Scans an awk escape: either a character from the escape table or an
// octal number of up to three digits.  The caller has already checked
// that a character is available.  Throws regex_error (error_escape)
// for anything else.
template<typename _FwdIter>
  void
  _Scanner<_FwdIter>::
  _M_eat_escape_awk()
  {
    const auto __c = *_M_current++;

    const auto __entry = _M_escape_map.find(__c);
    if (__entry != _M_escape_map.end())
      {
        _M_token = _S_token_ord_char;
        _M_value.assign(1, __entry->second);
        return;
      }

    auto __is_octal = [this](_CharT __d)
      {
        return _M_ctype.is(_CtypeT::digit, __d)
          && __d != '8'
          && __d != '9';
      };

    if (!__is_octal(__c))
      __throw_regex_error(regex_constants::error_escape);

    // \ddd for oct representation: take at most two further octal digits.
    _M_value.assign(1, __c);
    for (int __left = 2;
         __left > 0 && _M_current != _M_end && __is_octal(*_M_current);
         --__left)
      _M_value += *_M_current++;
    _M_token = _S_token_oct_num;
  }
// Eats a character class or throws an exception.
// __ch could be ':', '.' or '='; _M_current is the char after ']' when
// returning.
// Collects into _M_value everything up to the next occurrence of __ch and
// then requires __ch followed by ']'.  A missing terminator throws
// regex_error: error_ctype when __ch is ':', error_collate otherwise.
template<typename _FwdIter>
  void
  _Scanner<_FwdIter>::
  _M_eat_class(char __ch)
  {
    _M_value.clear();
    while (_M_current != _M_end && *_M_current != __ch)
      _M_value += *_M_current++;

    // Each successful comparison also steps over the character it matched.
    const bool __terminated = _M_current != _M_end
      && *_M_current++ == __ch      // skip __ch
      && _M_current != _M_end
      && *_M_current++ == ']';      // skip ']'

    if (!__terminated)
      {
        if (__ch == ':')
          __throw_regex_error(regex_constants::error_ctype);
        else
          __throw_regex_error(regex_constants::error_collate);
      }
  }
#ifdef _GLIBCXX_DEBUG
// Debug helper: writes a one-line description of the current token (and
// its value, where the token carries one) to __ostr and returns __ostr.
template<typename _FwdIter>
  std::ostream&
  _Scanner<_FwdIter>::
  _M_print(std::ostream& __ostr)
  {
    switch (_M_token)
      {
      case _S_token_anychar:
        __ostr << "any-character\n";
        break;
      case _S_token_backref:
        __ostr << "backref\n";
        break;
      case _S_token_bracket_begin:
        __ostr << "bracket-begin\n";
        break;
      case _S_token_bracket_neg_begin:
        __ostr << "bracket-neg-begin\n";
        break;
      case _S_token_bracket_end:
        __ostr << "bracket-end\n";
        break;
      case _S_token_char_class_name:
        __ostr << "char-class-name \"" << _M_value << "\"\n";
        break;
      case _S_token_closure0:
        __ostr << "closure0\n";
        break;
      case _S_token_closure1:
        __ostr << "closure1\n";
        break;
      case _S_token_collsymbol:
        __ostr << "collsymbol \"" << _M_value << "\"\n";
        break;
      case _S_token_comma:
        __ostr << "comma\n";
        break;
      case _S_token_dup_count:
        __ostr << "dup count: " << _M_value << "\n";
        break;
      case _S_token_eof:
        __ostr << "EOF\n";
        break;
      case _S_token_equiv_class_name:
        __ostr << "equiv-class-name \"" << _M_value << "\"\n";
        break;
      case _S_token_interval_begin:
        __ostr << "interval begin\n";
        break;
      case _S_token_interval_end:
        __ostr << "interval end\n";
        break;
      case _S_token_line_begin:
        __ostr << "line begin\n";
        break;
      case _S_token_line_end:
        __ostr << "line end\n";
        break;
      case _S_token_opt:
        __ostr << "opt\n";
        break;
      case _S_token_or:
        __ostr << "or\n";
        break;
      case _S_token_ord_char:
        __ostr << "ordinary character: \"" << _M_value << "\"\n";
        break;
      case _S_token_subexpr_begin:
        __ostr << "subexpr begin\n";
        break;
      case _S_token_subexpr_no_group_begin:
        __ostr << "no grouping subexpr begin\n";
        break;
      case _S_token_subexpr_lookahead_begin:
        __ostr << "lookahead subexpr begin\n";
        break;
      case _S_token_subexpr_neg_lookahead_begin:
        __ostr << "neg lookahead subexpr begin\n";
        break;
      case _S_token_subexpr_end:
        __ostr << "subexpr end\n";
        break;
      case _S_token_unknown:
        __ostr << "-- unknown token --\n";
        break;
      case _S_token_oct_num:
        __ostr << "oct number " << _M_value << "\n";
        break;
      case _S_token_hex_num:
        __ostr << "hex number " << _M_value << "\n";
        break;
      case _S_token_quoted_class:
        __ostr << "quoted class " << "\\" << _M_value << "\n";
        break;
      default:
        // Every enumerator of _TokenT is handled above.
        _GLIBCXX_DEBUG_ASSERT(false);
      }
    return __ostr;
  }
#endif
_GLIBCXX_END_NAMESPACE_VERSION
} // namespace __detail
} // namespace std

View File

@ -56,6 +56,7 @@
#include <bits/regex_constants.h>
#include <bits/regex_error.h>
#include <bits/regex_scanner.h>
#include <bits/regex_automaton.h>
#include <bits/regex_compiler.h>
#include <bits/regex_executor.h>

View File

@ -0,0 +1,50 @@
// { dg-options "-std=gnu++11" }
//
// 2013-08-26 Tim Shen <timshen91@gmail.com>
//
// Copyright (C) 2013 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// 28.11.2 regex_match
// Tests awk escaping.
#include <regex>
#include <testsuite_hooks.h>
using namespace std;
// Checks the escape sequences recognised by the awk grammar.
void
test01()
{
  bool test __attribute__((unused)) = true;
  const auto awk_syntax = regex_constants::awk;

  // An escaped '[' only has to be accepted as a pattern.
  regex("\\[", awk_syntax);

  // Escapes inside a bracket expression and at top level.
  VERIFY(regex_match("\"", regex("[\\\"]", awk_syntax)));
  VERIFY(regex_match("/", regex("/", awk_syntax)));
  VERIFY(regex_match("\a", regex("\\a", awk_syntax)));
  VERIFY(regex_match("\"", regex("\\\"", awk_syntax)));

  // Octal escapes: "\65" is '5'; in "\0653" the trailing '3' is literal.
  VERIFY(regex_match("5", regex("\\65", awk_syntax)));
  VERIFY(regex_match("53", regex("\\0653", awk_syntax)));
}
// Test driver: runs test01() and returns 0.
int
main()
{
test01();
return 0;
}

View File

@ -0,0 +1,57 @@
// { dg-options "-std=gnu++11" }
//
// 2013-08-26 Tim Shen <timshen91@gmail.com>
//
// Copyright (C) 2013 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// 28.11.2 regex_match
// Tests basic (POSIX BRE) empty range.
#include <regex>
#include <testsuite_hooks.h>
using namespace std;
// Checks how the basic grammar treats a ']' placed right after "[" or "[^".
void
test01()
{
  bool test __attribute__((unused)) = true;

  // Reports whether compiling the pattern as a basic regex throws.
  auto is_rejected = [](const char* pattern) -> bool
    {
      try
        {
          regex re(pattern, regex_constants::basic);
        }
      catch (...)
        {
          return true;
        }
      return false;
    };

  // With nothing after the ']' the bracket expression is never closed.
  VERIFY(is_rejected("[]"));
  VERIFY(is_rejected("[^]"));

  // A leading ']' is an ordinary member of the bracket expression.
  VERIFY(regex_match("]", regex("[]]", regex_constants::basic)));
  VERIFY(!regex_match("]", regex("[^]]", regex_constants::basic)));
}
// Test driver: runs test01() and returns 0.
int
main()
{
test01();
return 0;
}

View File

@ -0,0 +1,54 @@
// { dg-options "-std=gnu++11" }
//
// 2013-08-26 Tim Shen <timshen91@gmail.com>
//
// Copyright (C) 2013 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// 28.11.2 regex_match
// Tests ECMAScript \x and \u.
#include <regex>
#include <testsuite_hooks.h>
using namespace std;
// Checks the ECMAScript "\x" and "\u" hexadecimal escapes.
void
test01()
{
  bool test __attribute__((unused)) = true;

  // "\x3a" is ':' and "\u1234" is the wide character U+1234.
  VERIFY(regex_match(":", regex("\\x3a")));
  VERIFY(regex_match(L"\u1234", wregex(L"\\u1234")));

  // "\u" needs four hex digits; 'x' is not one, so compilation must throw.
  bool rejected = false;
  try
    {
      regex("\\u400x");
    }
  catch (...)
    {
      rejected = true;
    }
  VERIFY(rejected);
}
// Test driver: runs test01() and returns 0.
int
main()
{
test01();
return 0;
}

View File

@ -0,0 +1,47 @@
// { dg-options "-std=gnu++11" }
//
// 2013-08-26 Tim Shen <timshen91@gmail.com>
//
// Copyright (C) 2013 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// 28.11.2 regex_match
// Tests ECMAScript empty range.
#include <regex>
#include <testsuite_hooks.h>
using namespace std;
// Checks the ECMAScript reading of "[]" and "[^]", where a ']' directly
// after the opening always closes the bracket expression.
void
test01()
{
  bool test __attribute__((unused)) = true;

  // "[]" matches no character, "[^]" matches any character.
  const bool empty_matches_x = regex_match("x", regex("[]"));
  VERIFY(!empty_matches_x);
  const bool negated_empty_matches_x = regex_match("x", regex("[^]"));
  VERIFY(negated_empty_matches_x);

  // With a trailing ']' the pattern is the empty class followed by a
  // literal ']', so a lone "]" does not match either form.
  const bool empty_then_bracket = regex_match("]", regex("[]]"));
  VERIFY(!empty_then_bracket);
  const bool negated_then_bracket = regex_match("]", regex("[^]]"));
  VERIFY(!negated_then_bracket);
}
// Test driver: runs test01() and returns 0.
int
main()
{
test01();
return 0;
}

View File

@ -0,0 +1,42 @@
// { dg-options "-std=gnu++11" }
//
// 2013-08-26 Tim Shen <timshen91@gmail.com>
//
// Copyright (C) 2013 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this library; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// 28.11.3 regex_search
// Tests an empty ECMAScript pattern against an empty C-string target.
#include <regex>
#include <testsuite_hooks.h>
// Checks that searching an empty target with an empty pattern succeeds.
void
test01()
{
  bool test __attribute__((unused)) = true;

  const bool found = std::regex_search("", std::regex(""));
  VERIFY(found);
}
// Test driver: runs test01() and returns 0.
int
main()
{
test01();
return 0;
}