diff --git a/libstdc++-v3/ChangeLog b/libstdc++-v3/ChangeLog index 7717da08e8a..0957953de2f 100644 --- a/libstdc++-v3/ChangeLog +++ b/libstdc++-v3/ChangeLog @@ -1,3 +1,14 @@ +2013-10-20 Tim Shen + + * include/bits/regex.h: Remove virtual class _Automaton. + * include/bits/regex_automaton.h: Likewise. + * include/bits/regex.tcc: Adjust comment for policy changing. + * include/bits/regex_executor.h: Update comments of complexity. + * include/bits/regex_executor.tcc: Adjust executor choosing + policy. Now DFS executor is the default one. + * testsuite/util/testsuite_regex.h (regex_match_debug, + regex_search_debug): Adjust for policy changing. + 2013-10-20 Chris Jefferson Paolo Carlini diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h index 5d1a8f4b71e..32d38b491bd 100644 --- a/libstdc++-v3/include/bits/regex.h +++ b/libstdc++-v3/include/bits/regex.h @@ -727,7 +727,7 @@ _GLIBCXX_END_NAMESPACE_VERSION #endif protected: - typedef std::shared_ptr<__detail::_Automaton<_Ch_type, _Rx_traits>> + typedef std::shared_ptr<__detail::_NFA<_Ch_type, _Rx_traits>> _AutomatonPtr; template - class _Automaton - { - public: - typedef size_t _SizeT; - - public: - virtual - ~_Automaton() - { } - - virtual _SizeT - _M_sub_count() const = 0; - -#ifdef _GLIBCXX_DEBUG - virtual std::ostream& - _M_dot(std::ostream& __ostr) const = 0; -#endif - }; - template class _NFA - : public _Automaton<_CharT, _TraitsT>, - public std::vector<_State<_CharT, _TraitsT>> + : public std::vector<_State<_CharT, _TraitsT>> { public: typedef _State<_CharT, _TraitsT> _StateT; diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index 018b649a25a..6d9a83e8c5c 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -179,8 +179,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // TODO: This approach is exponentially slow for certain input. // Try to compile the NFA to a DFA. // - // Time complexity: exponential - // Space complexity: O(__end - __begin) + // Time complexity: o(match_length), O(2^(_M_nfa->size())) + // Space complexity: \theta(match_results.size() + match_length) template class _DFSExecutor @@ -200,16 +200,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION const _RegexT& __re, _FlagT __flags) : _BaseT(__begin, __end, __results, __re, __flags), - _M_nfa(*std::static_pointer_cast<_NFA<_CharT, _TraitsT>> - (__re._M_automaton)), - _M_start_state(_M_nfa._M_start()) + _M_nfa(__re._M_automaton), _M_start_state(_M_nfa->_M_start()) { } private: void _M_init(_BiIter __cur) { - _M_cur_results.resize(_M_nfa._M_sub_count() + 2); + _M_cur_results.resize(_M_nfa->_M_sub_count() + 2); this->_M_current = __cur; } @@ -235,9 +233,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } // To record current solution. - _ResultsVec _M_cur_results; - const _NFAT& _M_nfa; - _StateIdT _M_start_state; + std::shared_ptr<_NFAT> _M_nfa; + _ResultsVec _M_cur_results; + _StateIdT _M_start_state; }; // Like the DFS approach, it try every possible state transition; Unlike DFS, @@ -251,8 +249,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // matching head. When states transit, solutions will be compared and // deduplicated(based on which greedy mode we have). // - // Time complexity: O((__end - __begin) * _M_nfa.size()) - // Space complexity: O(_M_nfa.size() * _M_nfa.mark_count()) + // Time complexity: o(match_length * (quantifier_number + // + match_results.size()) + // O(match_length * _M_nfa->size() + // * (quantifier_number + match_results.size()) + // Space complexity: o(quantifier_number + match_results.size()) + // O(_M_nfa->size() + // * (quantifier_number + match_results.size()) template class _BFSExecutor @@ -382,11 +385,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION const _RegexT& __re, _FlagT __flags) : _BaseT(__begin, __end, __results, __re, __flags), - _M_nfa(*std::static_pointer_cast<_NFA<_CharT, _TraitsT>> - (__re._M_automaton)), - _M_match_stack(_M_nfa.size()), - _M_stack(_M_nfa.size()), - _M_start_state(_M_nfa._M_start()) + _M_nfa(__re._M_automaton), _M_match_stack(_M_nfa->size()), + _M_stack(_M_nfa->size()), _M_start_state(_M_nfa->_M_start()) { } private: @@ -398,7 +398,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _ResultsVec& __res(this->_M_results); _M_covered[this->_M_start_state] = _ResultsPtr(new _ResultsEntry(__res.size(), - _M_nfa._M_quant_count)); + _M_nfa->_M_quant_count)); _M_stack._M_push(this->_M_start_state); } @@ -428,7 +428,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION this->_M_flags)); } - const _NFAT& _M_nfa; + std::shared_ptr<_NFAT> _M_nfa; std::map<_StateIdT, _ResultsPtr> _M_covered; _TodoList _M_match_stack; _TodoList _M_stack; diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index b8755736e60..7c084add031 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -28,6 +28,13 @@ * Do not attempt to use it directly. @headername{regex} */ +// See below __get_executor to get what this is talking about. The default +// value 1 indicated a conservative optimization without giving up worst case +// performance. +#ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT +#define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1 +#endif + namespace std _GLIBCXX_VISIBILITY(default) { namespace __detail @@ -60,7 +67,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_dfs(_StateIdT __i) { auto& __current = this->_M_current; - const auto& __state = _M_nfa[__i]; + const auto& __state = (*_M_nfa)[__i]; bool __ret = false; switch (__state._M_opcode) { @@ -216,7 +223,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { auto __u = _M_stack._M_pop(); _GLIBCXX_DEBUG_ASSERT(_M_covered.count(__u)); - const auto& __state = _M_nfa[__u]; + const auto& __state = (*_M_nfa)[__u]; // Can be implemented using method, but there will be too many // arguments. I would use macro function before C++11, but lambda is @@ -314,7 +321,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION while (!_M_match_stack._M_empty()) { auto __u = _M_match_stack._M_pop(); - const auto& __state = _M_nfa[__u]; + const auto& __state = (*_M_nfa)[__u]; auto& __cu = _M_covered[__u]; if (__state._M_matches(*this->_M_current) && (__next.count(__state._M_next) == 0 @@ -333,7 +340,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_includes_some() { bool __succ = false; - for (auto __u : _M_nfa._M_final_states()) + for (auto __u : _M_nfa->_M_final_states()) if (_M_covered.count(__u)) { __succ = true; @@ -380,8 +387,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } enum class _RegexExecutorPolicy : int - { _S_auto, _S_force_dfs }; + { _S_auto, _S_alternate }; + // This function decide which executor to use under given circumstances. + // The _S_auto policy now is the following: if a NFA has no back-references + // and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT quantifiers + // (*, +, ?), the _BFSExecutor will be used, other wise _DFSExecutor. This is + // because _DFSExecutor has a exponential upper bound, but better best-case + // performace. Meanwhile, _BFSExecutor can effectively prevent from + // exponential-long time matching (which must contains many quantifiers), but + // it's slower in average. + // + // For simple regex, _BFSExecutor could be 2 or more times slower than + // _DFSExecutor. + // + // Of course, _BFSExecutor cannot handle back-references. template @@ -396,12 +416,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _ExecutorPtr; typedef _DFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT> _DFSExecutorT; typedef _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT> _BFSExecutorT; - auto __p = std::static_pointer_cast<_NFA<_CharT, _TraitsT>> - (__re._M_automaton); - if (__policy == _RegexExecutorPolicy::_S_force_dfs - || (__policy == _RegexExecutorPolicy::_S_auto && __p->_M_has_backref)) - return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, __re, __flags)); - return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, __re, __flags)); + if (!__re._M_automaton->_M_has_backref + && (__policy == _RegexExecutorPolicy::_S_alternate + || __re._M_automaton->_M_quant_count + > _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT)) + return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, __re, __flags)); + return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, __re, __flags)); } _GLIBCXX_END_NAMESPACE_VERSION diff --git a/libstdc++-v3/testsuite/util/testsuite_regex.h b/libstdc++-v3/testsuite/util/testsuite_regex.h index 17be78d9c1b..bc0f3d5bfd9 100644 --- a/libstdc++-v3/testsuite/util/testsuite_regex.h +++ b/libstdc++-v3/testsuite/util/testsuite_regex.h @@ -148,7 +148,7 @@ namespace __gnu_test (__s, __e, __m, __re, __flags); match_results<_Bi_iter, _Alloc> __mm; auto __res2 = __regex_algo_impl<_Bi_iter, _Alloc, _Ch_type, _Rx_traits, - _RegexExecutorPolicy::_S_force_dfs, true> + _RegexExecutorPolicy::_S_alternate, true> (__s, __e, __mm, __re, __flags); if (__res1 == __res2 && __m == __mm) return __res1; @@ -234,7 +234,7 @@ namespace __gnu_test (__s, __e, __m, __re, __flags); match_results<_Bi_iter, _Alloc> __mm; auto __res2 = __regex_algo_impl<_Bi_iter, _Alloc, _Ch_type, _Rx_traits, - _RegexExecutorPolicy::_S_force_dfs, false> + _RegexExecutorPolicy::_S_alternate, false> (__s, __e, __mm, __re, __flags); if (__res1 == __res2 && __m == __mm) return __res1;