From 7c812a2a57e58218eea234b80386197d1fe7672a Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Thu, 5 Sep 2013 15:20:39 +0000 Subject: [PATCH] regex_automaton.h: Add dummy node type. 2013-09-05 Tim Shen * include/bits/regex_automaton.h: Add dummy node type. Rewrite _StateSeq. * include/bits/regex_automaton.tcc: Implement them. * include/bits/regex_compiler.h: Rewrite _Compiler to use new _StateSeq interfaces. * include/bits/regex_compiler.tcc: Implement them. * include/bits/regex_scanner.h: Add word boundry assertion token. * include/bits/regex_scanner.tcc (_Scanner<>::_M_eat_escape_ecma): Support word boundry. * testsuite/28_regex/algorithms/regex_match/basic/ string_range_02_03.cc: Remove "xfail". * testsuite/28_regex/algorithms/regex_match/extended/cstring_plus.cc: Likewise. * testsuite/28_regex/algorithms/regex_match/extended/ string_range_02_03.cc: Likewise. * testsuite/28_regex/algorithms/regex_match/extended/ cstring_questionmark.cc: Remove xfail and get correct length of c-string. * testsuite/28_regex/algorithms/regex_match/extended/ string_range_00_03.cc: Likewise. * testsuite/28_regex/algorithms/regex_match/ecma/char/quoted_char.cc: New. * testsuite/28_regex/algorithms/regex_match/extended/cstring_range.cc: New. * testsuite/28_regex/iterators/regex_iterator/wchar_t/string_02.cc: New. From-SVN: r202290 --- libstdc++-v3/ChangeLog | 30 +- libstdc++-v3/include/bits/regex_automaton.h | 79 ++--- libstdc++-v3/include/bits/regex_automaton.tcc | 109 +++--- libstdc++-v3/include/bits/regex_compiler.h | 24 +- libstdc++-v3/include/bits/regex_compiler.tcc | 313 ++++++++++-------- libstdc++-v3/include/bits/regex_scanner.h | 2 + libstdc++-v3/include/bits/regex_scanner.tcc | 10 +- .../regex_match/basic/string_range_02_03.cc | 1 - .../regex_match/ecma/char/quoted_char.cc | 52 +++ .../regex_match/extended/cstring_plus.cc | 43 +-- .../extended/cstring_questionmark.cc | 43 +-- .../regex_match/extended/cstring_range.cc | 68 ++++ .../extended/string_range_00_03.cc | 30 +- .../extended/string_range_02_03.cc | 1 - .../regex_iterator/wchar_t/string_02.cc | 59 ++++ 15 files changed, 553 insertions(+), 311 deletions(-) create mode 100644 libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/quoted_char.cc create mode 100644 libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/cstring_range.cc create mode 100644 libstdc++-v3/testsuite/28_regex/iterators/regex_iterator/wchar_t/string_02.cc diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 50d65dd3843..bdbde8d12dd 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,8 +1,36 @@ +2013-09-05 Tim Shen + + * include/bits/regex_automaton.h: Add dummy node type. Rewrite + _StateSeq. + * include/bits/regex_automaton.tcc: Implement them. + * include/bits/regex_compiler.h: Rewrite _Compiler to use new + _StateSeq interfaces. + * include/bits/regex_compiler.tcc: Implement them. + * include/bits/regex_scanner.h: Add word boundry assertion token. + * include/bits/regex_scanner.tcc (_Scanner<>::_M_eat_escape_ecma): + Support word boundry. + * testsuite/28_regex/algorithms/regex_match/basic/ + string_range_02_03.cc: Remove "xfail". + * testsuite/28_regex/algorithms/regex_match/extended/cstring_plus.cc: + Likewise. + * testsuite/28_regex/algorithms/regex_match/extended/ + string_range_02_03.cc: Likewise. + * testsuite/28_regex/algorithms/regex_match/extended/ + cstring_questionmark.cc: Remove xfail and get correct length of + c-string. + * testsuite/28_regex/algorithms/regex_match/extended/ + string_range_00_03.cc: Likewise. + * testsuite/28_regex/algorithms/regex_match/ecma/char/quoted_char.cc: + New. + * testsuite/28_regex/algorithms/regex_match/extended/cstring_range.cc: + New. + * testsuite/28_regex/iterators/regex_iterator/wchar_t/string_02.cc: New. + 2013-09-03 Paolo Carlini PR libstdc++/58302 * include/bits/random.tcc (negative_binomial_distribution<>:: - operator()(_UniformRandomNumberGenerator&, const param_type&): + operator()(_UniformRandomNumberGenerator&, const param_type&)): Fix typo in template argument. * testsuite/26_numerics/random/negative_binomial_distribution/ operators/58302.cc: New. diff --git a/libstdc++-v3/include/bits/regex_automaton.h b/libstdc++-v3/include/bits/regex_automaton.h index 2c872aa9482..77551756f65 100644 --- a/libstdc++-v3/include/bits/regex_automaton.h +++ b/libstdc++-v3/include/bits/regex_automaton.h @@ -56,6 +56,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _S_opcode_backref = 2, _S_opcode_subexpr_begin = 4, _S_opcode_subexpr_end = 5, + _S_opcode_dummy = 6, _S_opcode_match = 100, _S_opcode_accept = 255 }; @@ -69,7 +70,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _OpcodeT _M_opcode; // type of outgoing transition _StateIdT _M_next; // outgoing transition - union // Since they are mutual exclusive. + union // Since they are mutually exclusive. { _StateIdT _M_alt; // for _S_opcode_alternative unsigned int _M_subexpr; // for _S_opcode_subexpr_* @@ -201,6 +202,24 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _StateIdT _M_insert_backref(unsigned int __index); + _StateIdT + _M_insert_dummy() + { + this->push_back(_StateT(_S_opcode_dummy)); + return this->size()-1; + } + + _StateIdT + _M_insert_state(_StateT __s) + { + this->push_back(__s); + return this->size()-1; + } + + // Eliminate dummy node in this NFA to make it compact. + void + _M_eliminate_dummy(); + #ifdef _GLIBCXX_DEBUG std::ostream& _M_dot(std::ostream& __ostr) const; @@ -222,58 +241,40 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { public: typedef _NFA<_CharT, _TraitsT> _RegexT; + public: - // Constructs a single-node sequence - _StateSeq(_RegexT& __ss, _StateIdT __s, - _StateIdT __e = _S_invalid_state_id) - : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e) - { } - // Constructs a split sequence from two other sequencces - _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2) - : _M_nfa(__e1._M_nfa), - _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)), - _M_end1(__e1._M_end1), _M_end2(__e2._M_end1) + _StateSeq(_RegexT& __nfa, _StateIdT __s) + : _StateSeq(__nfa, __s, __s) { } - // Constructs a split sequence from a single sequence - _StateSeq(const _StateSeq& __e, _StateIdT __id) - : _M_nfa(__e._M_nfa), - _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)), - _M_end1(__id), _M_end2(__e._M_end1) + _StateSeq(_RegexT& __nfa, _StateIdT __s, _StateIdT __end) + : _M_nfa(__nfa), _M_start(__s), _M_end(__end) { } - // Constructs a copy of a %_StateSeq - _StateSeq(const _StateSeq& __rhs) - : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start), - _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2) - { } - - _StateSeq& operator=(const _StateSeq& __rhs); - - _StateIdT - _M_front() const - { return _M_start; } - - // Extends a sequence by one. + // Append a state on *this and change *this to the new sequence. void - _M_push_back(_StateIdT __id); + _M_append(_StateIdT __id) + { + _M_nfa[_M_end]._M_next = __id; + _M_end = __id; + } - // Extends and maybe joins a sequence. + // Append a sequence on *this and change *this to the new sequence. void - _M_append(_StateIdT __id); - - void - _M_append(_StateSeq& __rhs); + _M_append(const _StateSeq& __s) + { + _M_nfa[_M_end]._M_next = __s._M_start; + _M_end = __s._M_end; + } // Clones an entire sequence. - _StateIdT + _StateSeq _M_clone(); - private: + public: _RegexT& _M_nfa; _StateIdT _M_start; - _StateIdT _M_end1; - _StateIdT _M_end2; + _StateIdT _M_end; }; //@} regex-detail diff --git a/libstdc++-v3/include/bits/regex_automaton.tcc b/libstdc++-v3/include/bits/regex_automaton.tcc index 2c25d97549c..2d34b95cdba 100644 --- a/libstdc++-v3/include/bits/regex_automaton.tcc +++ b/libstdc++-v3/include/bits/regex_automaton.tcc @@ -102,9 +102,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION case _S_opcode_accept: __ostr << __id << " [label=\"" << __id << "\\nACC\"];\n" ; break; + case _S_opcode_dummy: + break; default: - __ostr << __id << " [label=\"" << __id << "\\nUNK\"];\n" - << __id << " -> " << _M_next << " [label=\"?\"];\n"; + _GLIBCXX_DEBUG_ASSERT(false); break; } return __ostr; @@ -145,65 +146,61 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } template - _StateSeq<_CharT, _TraitsT>& _StateSeq<_CharT, _TraitsT>:: - operator=(const _StateSeq& __rhs) + void _NFA<_CharT, _TraitsT>:: + _M_eliminate_dummy() { - _M_start = __rhs._M_start; - _M_end1 = __rhs._M_end1; - _M_end2 = __rhs._M_end2; - return *this; + for (auto& __it : *this) + { + while (__it._M_next >= 0 && (*this)[__it._M_next]._M_opcode + == _S_opcode_dummy) + __it._M_next = (*this)[__it._M_next]._M_next; + if (__it._M_opcode == _S_opcode_alternative) + while (__it._M_alt >= 0 && (*this)[__it._M_alt]._M_opcode + == _S_opcode_dummy) + __it._M_alt = (*this)[__it._M_alt]._M_next; + } } + // Just apply DFS on the sequence and re-link their links. template - void _StateSeq<_CharT, _TraitsT>:: - _M_push_back(_StateIdT __id) - { - if (_M_end1 != _S_invalid_state_id) - _M_nfa[_M_end1]._M_next = __id; - _M_end1 = __id; - } - - template - void _StateSeq<_CharT, _TraitsT>:: - _M_append(_StateIdT __id) - { - if (_M_end2 != _S_invalid_state_id) - { - if (_M_end2 == _M_end1) - _M_nfa[_M_end2]._M_alt = __id; - else - _M_nfa[_M_end2]._M_next = __id; - _M_end2 = _S_invalid_state_id; - } - if (_M_end1 != _S_invalid_state_id) - _M_nfa[_M_end1]._M_next = __id; - _M_end1 = __id; - } - - template - void _StateSeq<_CharT, _TraitsT>:: - _M_append(_StateSeq& __rhs) - { - if (_M_end2 != _S_invalid_state_id) - { - if (_M_end2 == _M_end1) - _M_nfa[_M_end2]._M_alt = __rhs._M_start; - else - _M_nfa[_M_end2]._M_next = __rhs._M_start; - _M_end2 = _S_invalid_state_id; - } - if (__rhs._M_end2 != _S_invalid_state_id) - _M_end2 = __rhs._M_end2; - if (_M_end1 != _S_invalid_state_id) - _M_nfa[_M_end1]._M_next = __rhs._M_start; - _M_end1 = __rhs._M_end1; - } - - // @todo implement this function. - template - _StateIdT _StateSeq<_CharT, _TraitsT>:: + _StateSeq<_CharT, _TraitsT> _StateSeq<_CharT, _TraitsT>:: _M_clone() - { return 0; } + { + std::map<_StateIdT, _StateIdT> __m; + std::stack<_StateIdT> __stack; + __stack.push(_M_start); + while (!__stack.empty()) + { + auto __u = __stack.top(); + __stack.pop(); + auto __dup = _M_nfa[__u]; + auto __id = _M_nfa._M_insert_state(__dup); + __m[__u] = __id; + if (__u == _M_end) + continue; + if (__m.count(__dup._M_next) == 0) + __stack.push(__dup._M_next); + if (__dup._M_opcode == _S_opcode_alternative) + if (__m.count(__dup._M_alt) == 0) + __stack.push(__dup._M_alt); + } + for (auto __it : __m) + { + auto& __ref = _M_nfa[__it.second]; + if (__ref._M_next != -1) + { + _GLIBCXX_DEBUG_ASSERT(__m.count(__ref._M_next)); + __ref._M_next = __m[__ref._M_next]; + } + if (__ref._M_opcode == _S_opcode_alternative) + if (__ref._M_alt != -1) + { + _GLIBCXX_DEBUG_ASSERT(__m.count(__ref._M_alt)); + __ref._M_alt = __m[__ref._M_alt]; + } + } + return _StateSeq(_M_nfa, __m[_M_start], __m[_M_end]); + } _GLIBCXX_END_NAMESPACE_VERSION } // namespace __detail diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index 55ecdb92d41..96a0d294177 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -56,7 +56,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION std::shared_ptr<_RegexT> _M_get_nfa() const - { return std::shared_ptr<_RegexT>(new _RegexT(_M_state_store)); } + { return std::shared_ptr<_RegexT>(new _RegexT(_M_nfa)); } private: typedef _Scanner<_FwdIter> _ScannerT; @@ -64,6 +64,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION typedef _StateSeq<_CharT, _TraitsT> _StateSeqT; typedef std::stack<_StateSeqT, std::vector<_StateSeqT>> _StackT; typedef _BracketMatcher<_CharT, _TraitsT> _BMatcherT; + typedef std::ctype<_CharT> _CtypeT; // accepts a specific token or returns false. bool @@ -90,21 +91,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_bracket_expression(); - void - _M_bracket_list(_BMatcherT& __matcher); - - bool - _M_follow_list(_BMatcherT& __matcher); - void _M_expression_term(_BMatcherT& __matcher); bool _M_range_expression(_BMatcherT& __matcher); - bool - _M_start_range(_BMatcherT& __matcher); - bool _M_collating_symbol(_BMatcherT& __matcher); @@ -120,12 +112,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_try_char(); - _CharT - _M_get_char(); + _StateSeqT + _M_pop() + { + auto ret = _M_stack.top(); + _M_stack.pop(); + return ret; + } const _TraitsT& _M_traits; + const _CtypeT& _M_ctype; _ScannerT _M_scanner; - _RegexT _M_state_store; + _RegexT _M_nfa; _StringT _M_value; _StackT _M_stack; _FlagT _M_flags; diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index e41b251c257..a574e8e5ddd 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -28,6 +28,31 @@ * Do not attempt to use it directly. @headername{regex} */ +// TODO make comments doxygen format. + +// This compiler refers to "Regular Expression Matching Can Be Simple And Fast" +// (http://swtch.com/~rsc/regexp/regexp1.html"), +// but doesn't strictly follow it. +// +// When compiling, states are *chained* instead of tree- or graph-constructed. +// It's more like structured programs: there's if statement and loop statement. +// +// For alternative structure(say "a|b"), aka "if statement", two branchs should +// be constructed. However, these two shall merge to an "end_tag" at the end of +// this operator: +// +// branch1 +// / \ +// => begin_tag end_tag => +// \ / +// branch2 +// +// This is the difference between this implementation and that in Russ's +// article. +// +// That's why we introduced dummy node here ------ "end_tag" is a dummy node. +// All dummy node will be eliminated at the end of compiling process. + namespace std _GLIBCXX_VISIBILITY(default) { namespace __detail @@ -39,32 +64,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _Compiler(_FwdIter __b, _FwdIter __e, const _TraitsT& __traits, _FlagT __flags) : _M_traits(__traits), _M_scanner(__b, __e, __flags, _M_traits.getloc()), - _M_state_store(__flags), _M_flags(__flags) + _M_ctype(std::use_facet>(_M_traits.getloc())), + _M_nfa(__flags), _M_flags(__flags) { - _StateSeqT __r(_M_state_store, - _M_state_store._M_insert_subexpr_begin()); - _M_disjunction(); - if (!_M_stack.empty()) - { - __r._M_append(_M_stack.top()); - _M_stack.pop(); - } - __r._M_append(_M_state_store._M_insert_subexpr_end()); - __r._M_append(_M_state_store._M_insert_accept()); - } - - template - bool - _Compiler<_FwdIter, _CharT, _TraitsT>:: - _M_match_token(_TokenT token) - { - if (token == _M_scanner._M_get_token()) - { - _M_value = _M_scanner._M_get_value(); - _M_scanner._M_advance(); - return true; - } - return false; + _StateSeqT __r(_M_nfa, _M_nfa._M_start()); + __r._M_append(_M_nfa._M_insert_subexpr_begin()); + this->_M_disjunction(); + if (!_M_match_token(_ScannerT::_S_token_eof)) + __throw_regex_error(regex_constants::error_paren); + __r._M_append(_M_pop()); + _GLIBCXX_DEBUG_ASSERT(_M_stack.empty()); + __r._M_append(_M_nfa._M_insert_subexpr_end()); + __r._M_append(_M_nfa._M_insert_accept()); + _M_nfa._M_eliminate_dummy(); } template @@ -73,12 +85,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_disjunction() { this->_M_alternative(); - if (_M_match_token(_ScannerT::_S_token_or)) + // TODO empty alternative like, um, "(|asdf)" + while (_M_match_token(_ScannerT::_S_token_or)) { - _StateSeqT __alt1 = _M_stack.top(); _M_stack.pop(); - this->_M_disjunction(); - _StateSeqT __alt2 = _M_stack.top(); _M_stack.pop(); - _M_stack.push(_StateSeqT(__alt1, __alt2)); + _StateSeqT __alt1 = _M_pop(); + this->_M_alternative(); + _StateSeqT __alt2 = _M_pop(); + auto __end = _M_nfa._M_insert_dummy(); + __alt1._M_append(__end); + __alt2._M_append(__end); + _M_stack.push(_StateSeqT(_M_nfa, + _M_nfa._M_insert_alt(__alt1._M_start, + __alt2._M_start), + __end)); } } @@ -89,15 +108,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { if (this->_M_term()) { - _StateSeqT __re = _M_stack.top(); _M_stack.pop(); + _StateSeqT __re = _M_pop(); this->_M_alternative(); - if (!_M_stack.empty()) - { - __re._M_append(_M_stack.top()); - _M_stack.pop(); - } + __re._M_append(_M_pop()); _M_stack.push(__re); } + else + _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); } template @@ -121,7 +138,22 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_assertion() { - return false; + // temporary place holders. + if (_M_match_token(_ScannerT::_S_token_line_begin)) + _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); + else if (_M_match_token(_ScannerT::_S_token_line_end)) + _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); + else if (_M_match_token(_ScannerT::_S_token_word_bound)) + _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); + else if (_M_match_token(_ScannerT::_S_token_neg_word_bound)) + _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); + else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin)) + _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); + else if (_M_match_token(_ScannerT::_S_token_subexpr_neg_lookahead_begin)) + _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy())); + else + return false; + return true; } template @@ -133,67 +165,70 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { if (_M_stack.empty()) __throw_regex_error(regex_constants::error_badrepeat); - _StateSeqT __r(_M_stack.top(), -1); - __r._M_append(__r._M_front()); - _M_stack.pop(); + auto __e = _M_pop(); + _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, + __e._M_start)); + __e._M_append(__r); _M_stack.push(__r); - return; } - if (_M_match_token(_ScannerT::_S_token_closure1)) + else if (_M_match_token(_ScannerT::_S_token_closure1)) { if (_M_stack.empty()) __throw_regex_error(regex_constants::error_badrepeat); - _StateSeqT __r(_M_state_store, - _M_state_store. - _M_insert_alt(_S_invalid_state_id, - _M_stack.top()._M_front())); - _M_stack.top()._M_append(__r); - return; + auto __e = _M_pop(); + __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start)); + _M_stack.push(__e); } - if (_M_match_token(_ScannerT::_S_token_opt)) + else if (_M_match_token(_ScannerT::_S_token_opt)) { if (_M_stack.empty()) - __throw_regex_error(regex_constants::error_badrepeat); - _StateSeqT __r(_M_stack.top(), -1); - _M_stack.pop(); + __throw_regex_error(regex_constants::error_badrepeat); + auto __e = _M_pop(); + auto __end = _M_nfa._M_insert_dummy(); + _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, + __e._M_start)); + __e._M_append(__end); + __r._M_append(__end); _M_stack.push(__r); - return; } - if (_M_match_token(_ScannerT::_S_token_interval_begin)) + else if (_M_match_token(_ScannerT::_S_token_interval_begin)) { if (_M_stack.empty()) __throw_regex_error(regex_constants::error_badrepeat); if (!_M_match_token(_ScannerT::_S_token_dup_count)) __throw_regex_error(regex_constants::error_badbrace); - _StateSeqT __r(_M_stack.top()); + _StateSeqT __r(_M_pop()); + _StateSeqT __e(_M_nfa, _M_nfa._M_insert_dummy()); int __min_rep = _M_cur_int_value(10); - for (int __i = 1; __i < __min_rep; ++__i) - _M_stack.top()._M_append(__r._M_clone()); + // {3 + for (int __i = 0; __i < __min_rep; ++__i) + __e._M_append(__r._M_clone()); if (_M_match_token(_ScannerT::_S_token_comma)) - if (_M_match_token(_ScannerT::_S_token_dup_count)) + if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7} { - int __n = _M_cur_int_value(10) - __min_rep; - if (__n < 0) - __throw_regex_error(regex_constants::error_badbrace); - for (int __i = 0; __i < __n; ++__i) - { - _StateSeqT __r(_M_state_store, - _M_state_store. - _M_insert_alt(_S_invalid_state_id, - _M_stack.top()._M_front())); - _M_stack.top()._M_append(__r); - } + int __n = _M_cur_int_value(10) - __min_rep; + if (__n < 0) + __throw_regex_error(regex_constants::error_badbrace); + auto __end = _M_nfa._M_insert_dummy(); + for (int __i = 0; __i < __n; ++__i) + { + auto __tmp = __r._M_clone(); + __e._M_append(_StateSeqT(_M_nfa, _M_nfa. + _M_insert_alt(__tmp._M_start, __end), __tmp._M_end)); + } + __e._M_append(__end); } - else + else // {3,} { - _StateSeqT __r(_M_stack.top(), -1); - __r._M_push_back(__r._M_front()); - _M_stack.pop(); - _M_stack.push(__r); + auto __tmp = __r._M_clone(); + _StateSeqT __s(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id, + __tmp._M_start)); + __tmp._M_append(__s); + __e._M_append(__s); } if (!_M_match_token(_ScannerT::_S_token_interval_end)) __throw_regex_error(regex_constants::error_brace); - return; + _M_stack.push(__e); } } @@ -203,46 +238,50 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_atom() { if (_M_match_token(_ScannerT::_S_token_anychar)) + _M_stack.push(_StateSeqT(_M_nfa, + _M_nfa._M_insert_matcher + (_AnyMatcher<_CharT, _TraitsT>(_M_traits)))); + else if (_M_try_char()) + _M_stack.push(_StateSeqT(_M_nfa, + _M_nfa._M_insert_matcher + (_CharMatcher<_CharT, _TraitsT>(_M_value[0], + _M_traits, + _M_flags)))); + else if (_M_match_token(_ScannerT::_S_token_backref)) + _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. + _M_insert_backref(_M_cur_int_value(10)))); + else if (_M_match_token(_ScannerT::_S_token_quoted_class)) { - _M_stack.push(_StateSeqT(_M_state_store, - _M_state_store._M_insert_matcher - (_AnyMatcher<_CharT, _TraitsT>(_M_traits)))); - return true; + _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1); + _BMatcherT __matcher(_M_ctype.is(_CtypeT::upper, _M_value[0]), + _M_traits, _M_flags); + __matcher._M_add_character_class(_M_value); + _M_stack.push(_StateSeqT(_M_nfa, + _M_nfa._M_insert_matcher(__matcher))); } - if (_M_try_char()) + else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin)) { - _M_stack.push(_StateSeqT(_M_state_store, - _M_state_store._M_insert_matcher - (_CharMatcher<_CharT, _TraitsT>(_M_value[0], - _M_traits, - _M_flags)))); - return true; - } - if (_M_match_token(_ScannerT::_S_token_backref)) - { - _M_stack.push(_StateSeqT(_M_state_store, _M_state_store. - _M_insert_backref(_M_cur_int_value(10)))); - return true; - } - if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) - { - int __mark = _M_state_store._M_sub_count(); - _StateSeqT __r(_M_state_store, - _M_state_store. - _M_insert_subexpr_begin()); + _StateSeqT __r(_M_nfa, _M_nfa._M_insert_dummy()); this->_M_disjunction(); if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) __throw_regex_error(regex_constants::error_paren); - if (!_M_stack.empty()) - { - __r._M_append(_M_stack.top()); - _M_stack.pop(); - } - __r._M_append(_M_state_store._M_insert_subexpr_end()); + __r._M_append(_M_pop()); _M_stack.push(__r); - return true; } - return _M_bracket_expression(); + else if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) + { + int __mark = _M_nfa._M_sub_count(); + _StateSeqT __r(_M_nfa, _M_nfa._M_insert_subexpr_begin()); + this->_M_disjunction(); + if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) + __throw_regex_error(regex_constants::error_paren); + __r._M_append(_M_pop()); + __r._M_append(_M_nfa._M_insert_subexpr_end()); + _M_stack.push(__r); + } + else if (!_M_bracket_expression()) + return false; + return true; } template @@ -255,51 +294,29 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin))) return false; _BMatcherT __matcher(__neg, _M_traits, _M_flags); - _M_bracket_list(__matcher); - _M_stack.push(_StateSeqT(_M_state_store, - _M_state_store._M_insert_matcher(__matcher))); + while (!_M_match_token(_ScannerT::_S_token_bracket_end)) + _M_expression_term(__matcher); + _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_matcher(__matcher))); return true; } - template - void - _Compiler<_FwdIter, _CharT, _TraitsT>:: - _M_bracket_list(_BMatcherT& __matcher) - { - if (_M_match_token(_ScannerT::_S_token_bracket_end)) - return; - _M_expression_term(__matcher); - _M_bracket_list(__matcher); - return; - } - template void _Compiler<_FwdIter, _CharT, _TraitsT>:: _M_expression_term(_BMatcherT& __matcher) { if (_M_match_token(_ScannerT::_S_token_collsymbol)) - { - __matcher._M_add_collating_element(_M_value); - return; - } - if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) - { - __matcher._M_add_equivalence_class(_M_value); - return; - } - if (_M_match_token(_ScannerT::_S_token_char_class_name)) - { - __matcher._M_add_character_class(_M_value); - return; - } - if (_M_try_char()) // [a + __matcher._M_add_collating_element(_M_value); + else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) + __matcher._M_add_equivalence_class(_M_value); + else if (_M_match_token(_ScannerT::_S_token_char_class_name)) + __matcher._M_add_character_class(_M_value); + else if (_M_try_char()) // [a { auto __ch = _M_value[0]; if (_M_try_char()) { - if (_M_value[0] == std::use_facet> - (_M_traits.getloc()).widen('-')) // [a- + if (_M_value[0] == '-') // [a- { if (_M_try_char()) // [a-z] { @@ -315,9 +332,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __matcher._M_add_char(_M_value[0]); } __matcher._M_add_char(__ch); - return; } - __throw_regex_error(regex_constants::error_brack); + else + __throw_regex_error(regex_constants::error_brack); } template @@ -341,6 +358,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return __is_char; } + template + bool + _Compiler<_FwdIter, _CharT, _TraitsT>:: + _M_match_token(_TokenT token) + { + if (token == _M_scanner._M_get_token()) + { + _M_value = _M_scanner._M_get_value(); + _M_scanner._M_advance(); + return true; + } + return false; + } + template int _Compiler<_FwdIter, _CharT, _TraitsT>:: diff --git a/libstdc++-v3/include/bits/regex_scanner.h b/libstdc++-v3/include/bits/regex_scanner.h index 080ef635b0c..064c1832796 100644 --- a/libstdc++-v3/include/bits/regex_scanner.h +++ b/libstdc++-v3/include/bits/regex_scanner.h @@ -86,6 +86,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _S_token_closure1, _S_token_line_begin, _S_token_line_end, + _S_token_word_bound, + _S_token_neg_word_bound, _S_token_comma, _S_token_dup_count, _S_token_eof, diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc index 0d1d2cd9778..3303aa56a38 100644 --- a/libstdc++-v3/include/bits/regex_scanner.tcc +++ b/libstdc++-v3/include/bits/regex_scanner.tcc @@ -28,7 +28,7 @@ * Do not attempt to use it directly. @headername{regex} */ -// TODO make comments doxygen format +// TODO make comments doxygen format. // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep // and awk @@ -370,10 +370,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_token = _S_token_ord_char; _M_value.assign(1, _M_escape_map.at(__c)); } + else if (__c == 'b') + _M_token = _S_token_word_bound; + else if (__c == 'B') + _M_token = _S_token_neg_word_bound; // N3376 28.13 - else if (__c == 'b' - || __c == 'B' - || __c == 'd' + else if (__c == 'd' || __c == 'D' || __c == 's' || __c == 'S' diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/basic/string_range_02_03.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/basic/string_range_02_03.cc index 5d43d7c4354..91bc101392b 100644 --- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/basic/string_range_02_03.cc +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/basic/string_range_02_03.cc @@ -1,5 +1,4 @@ // { dg-options "-std=c++0x" } -// { dg-do run { xfail *-*-* } } // // 2010-06-16 Stephen M. Webb diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/quoted_char.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/quoted_char.cc new file mode 100644 index 00000000000..b54f5619a24 --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/quoted_char.cc @@ -0,0 +1,52 @@ +// { dg-options "-std=gnu++11" } + +// +// 2013-09-05 Tim Shen +// +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + +// 28.11.2 regex_match +// Tests ECMAScript \d \D \s \S \w \W + +#include +#include + +using namespace std; + +void +test01() +{ + bool test __attribute__((unused)) = true; + + VERIFY(regex_match("01", regex("\\d*"))); + VERIFY(regex_match("asdfjkl", regex("\\D*"))); + VERIFY(!regex_match("asdfjkl0", regex("\\D*"))); + VERIFY(regex_match("\r\t\v\f ", regex("\\s*"))); + VERIFY(regex_match("asdfjkl", regex("\\S*"))); + VERIFY(!regex_match("asdfjkl\r", regex("\\S*"))); + VERIFY(regex_match("_az", regex("\\w*"))); + VERIFY(regex_match("!@#$%", regex("\\W*"))); + VERIFY(!regex_match("_01234", regex("\\W*"))); +} + +int +main() +{ + test01(); + return 0; +} diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/cstring_plus.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/cstring_plus.cc index 67bc1d8a5c2..375f34b8064 100644 --- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/cstring_plus.cc +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/cstring_plus.cc @@ -1,5 +1,4 @@ // { dg-options "-std=c++0x" } -// { dg-do run { xfail *-*-* } } // // 2010-06-21 Stephen M. Webb @@ -32,27 +31,31 @@ test01() { bool test __attribute__((unused)) = true; - std::regex re("(a+)", std::regex::extended); - const char target[] = "aa"; - std::cmatch m; + std::regex re("(a+)", std::regex::extended); + const char target[] = "aa"; + std::cmatch m; - VERIFY( std::regex_match(target, m, re) ); + VERIFY( std::regex_match(target, m, re) ); - VERIFY( re.mark_count() == 1 ); - VERIFY( m.size() == re.mark_count()+1 ); - VERIFY( m.empty() == false ); - VERIFY( m.prefix().first == target ); - VERIFY( m.prefix().second == target ); - VERIFY( m.prefix().matched == false ); - VERIFY( m.suffix().first == target+sizeof(target) ); - VERIFY( m.suffix().second == target+sizeof(target) ); - VERIFY( m.suffix().matched == false ); - VERIFY( m[0].first == target ); - VERIFY( m[0].second == target+sizeof(target) ); - VERIFY( m[0].matched == true ); - VERIFY( m[1].first == target ); - VERIFY( m[1].second == target+sizeof(target) ); - VERIFY( m[1].matched == true ); + VERIFY( re.mark_count() == 1 ); + VERIFY( m.size() == re.mark_count()+1 ); + VERIFY( m.empty() == false ); + VERIFY( m.prefix().first == target ); + VERIFY( m.prefix().second == target ); + VERIFY( m.prefix().matched == false ); + VERIFY( m.suffix().first == target+sizeof(target)-1 ); + VERIFY( m.suffix().second == target+sizeof(target)-1 ); + VERIFY( m.suffix().matched == false ); + VERIFY( m[0].first == target ); + VERIFY( m[0].second == target+sizeof(target)-1 ); + VERIFY( m[0].matched == true ); + VERIFY( m[1].first == target ); + VERIFY( m[1].second == target+sizeof(target)-1 ); + VERIFY( m[1].matched == true ); + + VERIFY(!std::regex_match("", std::regex("a+", std::regex::extended))); + VERIFY(std::regex_match("a", std::regex("a+", std::regex::extended))); + VERIFY(std::regex_match("aa", std::regex("a+", std::regex::extended))); } diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/cstring_questionmark.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/cstring_questionmark.cc index c0c8b92965b..79b52a88c4f 100644 --- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/cstring_questionmark.cc +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/cstring_questionmark.cc @@ -1,5 +1,4 @@ // { dg-options "-std=c++0x" } -// { dg-do run { xfail *-*-* } } // // 2010-06-21 Stephen M. Webb @@ -32,27 +31,31 @@ test01() { bool test __attribute__((unused)) = true; - std::regex re("(aa?)", std::regex::extended); - char target[] = "a"; - std::cmatch m; + std::regex re("(aa?)", std::regex::extended); + char target[] = "a"; + std::cmatch m; - VERIFY( std::regex_match(target, m, re) ); + VERIFY( std::regex_match(target, m, re) ); - VERIFY( re.mark_count() == 1 ); - VERIFY( m.size() == re.mark_count()+1 ); - VERIFY( m.empty() == false ); - VERIFY( m.prefix().first == target ); - VERIFY( m.prefix().second == target ); - VERIFY( m.prefix().matched == false ); - VERIFY( m.suffix().first == target+sizeof(target) ); - VERIFY( m.suffix().second == target+sizeof(target) ); - VERIFY( m.suffix().matched == false ); - VERIFY( m[0].first == target ); - VERIFY( m[0].second == target+sizeof(target) ); - VERIFY( m[0].matched == true ); - VERIFY( m[1].first == target ); - VERIFY( m[1].second == target+sizeof(target) ); - VERIFY( m[1].matched == true ); + VERIFY( re.mark_count() == 1 ); + VERIFY( m.size() == re.mark_count()+1 ); + VERIFY( m.empty() == false ); + VERIFY( m.prefix().first == target ); + VERIFY( m.prefix().second == target ); + VERIFY( m.prefix().matched == false ); + VERIFY( m.suffix().first == target+sizeof(target)-1 ); + VERIFY( m.suffix().second == target+sizeof(target)-1 ); + VERIFY( m.suffix().matched == false ); + VERIFY( m[0].first == target ); + VERIFY( m[0].second == target+sizeof(target)-1 ); + VERIFY( m[0].matched == true ); + VERIFY( m[1].first == target ); + VERIFY( m[1].second == target+sizeof(target)-1 ); + VERIFY( m[1].matched == true ); + + VERIFY(std::regex_match("", std::regex("a?", std::regex::extended))); + VERIFY(std::regex_match("a", std::regex("a?", std::regex::extended))); + VERIFY(!std::regex_match("aa", std::regex("a?", std::regex::extended))); } diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/cstring_range.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/cstring_range.cc new file mode 100644 index 00000000000..62f825a0fb9 --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/cstring_range.cc @@ -0,0 +1,68 @@ +// { dg-options "-std=gnu++11" } + +// +// 2013-09-05 Tim Shen +// +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + +// 28.11.2 regex_match +// Tests Extended interval range. + +#include +#include + +using namespace std; + +void +test01() +{ + bool test __attribute__((unused)) = true; + + regex re; + re.assign("(ab){3}", std::regex::extended); + VERIFY(!regex_match("abab", re)); + VERIFY(regex_match("ababab", re)); + VERIFY(!regex_match("abababab", re)); + re.assign("(ab){3,}", std::regex::extended); + VERIFY(!regex_match("abab", re)); + VERIFY(regex_match("ababab", re)); + VERIFY(regex_match("abababab", re)); + VERIFY(regex_match("ababababab", re)); + re.assign("(ab){0,3}", std::regex::extended); + VERIFY(regex_match("", re)); + VERIFY(regex_match("ab", re)); + VERIFY(regex_match("abab", re)); + VERIFY(regex_match("ababab", re)); + VERIFY(!regex_match("abababab", re)); + re.assign("(a|b){0,2}", std::regex::extended); + VERIFY(regex_match("", re)); + VERIFY(regex_match("a", re)); + VERIFY(regex_match("b", re)); + VERIFY(regex_match("aa", re)); + VERIFY(regex_match("ab", re)); + VERIFY(regex_match("ba", re)); + VERIFY(regex_match("bb", re)); + VERIFY(!regex_match("aaa", re)); +} + +int +main() +{ + test01(); + return 0; +} diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/string_range_00_03.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/string_range_00_03.cc index 180c35962e1..e10dba81ffa 100644 --- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/string_range_00_03.cc +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/string_range_00_03.cc @@ -31,23 +31,23 @@ test01() { bool test __attribute__((unused)) = true; - std::regex re("a{0,3}", std::regex::extended); - std::string target("aa"); - std::smatch m; + std::regex re("a{0,3}", std::regex::extended); + std::string target("aa"); + std::smatch m; - VERIFY( std::regex_match(target, m, re) ); + VERIFY( std::regex_match(target, m, re) ); - VERIFY( m.size() == re.mark_count()+1 ); - VERIFY( m.empty() == false ); - VERIFY( m.prefix().first == target.begin() ); - VERIFY( m.prefix().second == target.begin() ); - VERIFY( m.prefix().matched == false ); - VERIFY( m.suffix().first == target.end() ); - VERIFY( m.suffix().second == target.end() ); - VERIFY( m.suffix().matched == false ); - VERIFY( m[0].first == target.begin() ); - VERIFY( m[0].second == target.end() ); - VERIFY( m[0].matched == true ); + VERIFY( m.size() == re.mark_count()+1 ); + VERIFY( m.empty() == false ); + VERIFY( m.prefix().first == target.begin() ); + VERIFY( m.prefix().second == target.begin() ); + VERIFY( m.prefix().matched == false ); + VERIFY( m.suffix().first == target.end() ); + VERIFY( m.suffix().second == target.end() ); + VERIFY( m.suffix().matched == false ); + VERIFY( m[0].first == target.begin() ); + VERIFY( m[0].second == target.end() ); + VERIFY( m[0].matched == true ); } diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/string_range_02_03.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/string_range_02_03.cc index 532869d1341..62793b4a199 100644 --- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/string_range_02_03.cc +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/extended/string_range_02_03.cc @@ -1,5 +1,4 @@ // { dg-options "-std=c++0x" } -// { dg-do run { xfail *-*-* } } // // 2010-06-16 Stephen M. Webb diff --git a/libstdc++-v3/testsuite/28_regex/iterators/regex_iterator/wchar_t/string_02.cc b/libstdc++-v3/testsuite/28_regex/iterators/regex_iterator/wchar_t/string_02.cc new file mode 100644 index 00000000000..cd2c68e33ee --- /dev/null +++ b/libstdc++-v3/testsuite/28_regex/iterators/regex_iterator/wchar_t/string_02.cc @@ -0,0 +1,59 @@ +// { dg-options "-std=gnu++11" } +// { dg-require-namedlocale "en_US.UTF-8" } +// { dg-do run { xfail *-*-* } } + +// +// 2013-09-05 Tim Shen +// +// Copyright (C) 2013 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + +// 28.12.1 regex_iterator +// Tests regex_iterator class + +#include +#include + +void +test01() +{ + bool test __attribute__((unused)) = true; + + std::setlocale(LC_ALL, "en_US.UTF-8"); + + std::wstring str2 = L"ä\u2009Ä\u2009ö\u2009Ö\u2009ü\u2009Ü"; + + std::wregex re2; + re2.imbue(std::locale("en_US.UTF-8")); + + re2.assign(L"([[:lower:]]{0,1}[[:space:]]{0,1}[[:upper:]]{0,1})"); + + std::wsregex_iterator p(str2.begin(), str2.end(), re2); + auto a = p; + ++p; + VERIFY(a != p); + //for (std::wsregex_iterator p(str2.begin(), str2.end(), re2); + // p != std::wsregex_iterator{}; ++p) + // std::wcout << (*p)[1] << std::endl; +} + +int +main() +{ + test01(); + return 0; +}