[1/N] std::regex refactoring - _BracketMatcher

Message ID	20140602193629.GW6953@redhat.com
State	New
Headers	show Return-Path: <gcc-patches-return-369225-incoming=patchwork.ozlabs.org@gcc.gnu.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:date :from:to:cc:subject:message-id:references:mime-version :content-type:in-reply-to; q=dns; s=default; b=Tj1HHOrLfUigTKnxX ESvj/OVQB3JS+B3/uZWeNz6ES3p3fIX6acFaYJBHBAgy3YtDXrJsARFJ4Mbl2WxQ vHCOyejmEMthC9C/JvJBp1zVB3jjRw7CLS7e1ccFUxrjcGU6rrJFfGoeP36FqYBy 6FoMl7+f+zI0zhcFpMZIJcqDqg= Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org Date: Mon, 2 Jun 2014 20:36:29 +0100 From: Jonathan Wakely <jwakely@redhat.com> To: Tim Shen <timshen91@gmail.com> Cc: libstdc++@gcc.gnu.org, gcc-patches@gcc.gnu.org Subject: Re: [patch 1/N] std::regex refactoring - _BracketMatcher Message-ID: <20140602193629.GW6953@redhat.com> References: <20140428124003.GW928@redhat.com> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="6axCafNXXMM8qu6Q" Content-Disposition: inline In-Reply-To: <20140428124003.GW928@redhat.com> User-Agent: Mutt/1.5.23 (2014-03-12)

commit f8e0936df9cdc31460bb79ac82f43486967983b1 Author: Jonathan Wakely <jwakely@redhat.com> Date: Tue Apr 8 16:24:56 2014 +0100 * include/bits/regex_compiler.h (__detail::_BracketMatcher): Reorder members to avoid wasted space when not using a cache. (__detail::_BracketMatcher::_M_ready()): Sort and deduplicate set. * include/bits/regex_compiler.tcc (__detail::_BracketMatcher::_M_apply(_CharT, false_type)): Use binary search on set. * include/bits/regex_executor.h (__detail::_Executor::_Match_mode): New enumeration type to indicate match mode. (__detail::_Executor::_State_info): New type holding members only needed in BFS-mode. Replace unique_ptr<vector<bool>> with unique_ptr<bool[]>. (__detail::_Executor::_M_rep_once_more, __detail::_Executor::_M_dfs): Replace template parameter with run-time function parameter. (__detail::_Executor::_M_main): Likewise. Dispatch to ... (__detail::_Executor::_M_main_dispatch): New overloaded functions to implement DFS and BFS mode. * include/bits/regex_executor.tcc (__detail::_Executor::_M_main): Split implementation into ... (__detail::_Executor::_M_main_dispatch): New overloaded functions. (__detail::_Executor::_M_lookahead): Create nested executor on stack. (__detail::_Executor::_M_rep_once_more): Pass match mode as function argument instead of template argument. (__detail::_Executor::_M_dfs): Likewise. * include/bits/regex_scanner.tcc: Fix typos in comments. * testsuite/performance/28_regex/range.cc: New. diff --git a/libstdc++-v3/include/bits/regex.tcc b/libstdc++-v3/include/bits/regex.tcc index 0d737a0..a81f517 100644 --- a/libstdc++-v3/include/bits/regex.tcc +++ b/libstdc++-v3/include/bits/regex.tcc @@ -32,7 +32,7 @@ // If _GLIBCXX_REGEX_USE_THOMPSON_NFA is defined, the thompson NFA // algorithm will be used. This algorithm is not enabled by default, // and cannot be used if the regex contains back-references, but has better -// (polynomial instead of exponential) worst case performace. +// (polynomial instead of exponential) worst case performance. // See __regex_algo_impl below. namespace std _GLIBCXX_VISIBILITY(default) diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index 52f7235..ca116de 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -43,7 +43,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION struct _BracketMatcher; /** - * @brief Builds an NFA from an input iterator interval. + * @brief Builds an NFA from an input iterator range. * * The %_TraitsT type should fulfill requirements [28.3]. */ @@ -329,7 +329,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION operator()(_CharT __ch) const { _GLIBCXX_DEBUG_ASSERT(_M_is_ready); - return _M_apply(__ch, _IsChar()); + return _M_apply(__ch, _UseCache()); } void @@ -400,21 +400,28 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void _M_ready() { - _M_make_cache(_IsChar()); + std::sort(_M_char_set.begin(), _M_char_set.end()); + auto __end = std::unique(_M_char_set.begin(), _M_char_set.end()); + _M_char_set.erase(__end, _M_char_set.end()); + _M_make_cache(_UseCache()); #ifdef _GLIBCXX_DEBUG _M_is_ready = true; #endif } private: - typedef typename is_same<_CharT, char>::type _IsChar; + // Currently we only use the cache for char + typedef typename std::is_same<_CharT, char>::type _UseCache; + + static constexpr size_t + _S_cache_size() { return 1ul << (sizeof(_CharT) * __CHAR_BIT__); } + struct _Dummy { }; - typedef typename conditional<_IsChar::value, - std::bitset<1 << (8 * sizeof(_CharT))>, - _Dummy>::type _CacheT; - typedef typename make_unsigned<_CharT>::type _UnsignedCharT; + typedef typename std::conditional<_UseCache::value, + std::bitset<_S_cache_size()>, + _Dummy>::type _CacheT; + typedef typename std::make_unsigned<_CharT>::type _UnsignedCharT; - private: bool _M_apply(_CharT __ch, false_type) const; @@ -425,9 +432,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void _M_make_cache(true_type) { - for (size_t __i = 0; __i < _M_cache.size(); __i++) - _M_cache[static_cast<_UnsignedCharT>(__i)] = - _M_apply(__i, false_type()); + for (unsigned __i = 0; __i < _M_cache.size(); __i++) + _M_cache[__i] = _M_apply(static_cast<_CharT>(__i), false_type()); } void @@ -435,7 +441,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { } private: - _CacheT _M_cache; std::vector<_CharT> _M_char_set; std::vector<_StringT> _M_equiv_set; std::vector<pair<_StrTransT, _StrTransT>> _M_range_set; @@ -444,6 +449,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _TransT _M_translator; const _TraitsT& _M_traits; bool _M_is_non_matching; + _CacheT _M_cache; #ifdef _GLIBCXX_DEBUG bool _M_is_ready; #endif diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 472cf1f..0df10cc 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -37,9 +37,9 @@ // When compiling, states are *chained* instead of tree- or graph-constructed. // It's more like structured programs: there's if statement and loop statement. // -// For alternative structure(say "a|b"), aka "if statement", two branchs should -// be constructed. However, these two shall merge to an "end_tag" at the end of -// this operator: +// For alternative structure (say "a|b"), aka "if statement", two branches +// should be constructed. However, these two shall merge to an "end_tag" at +// the end of this operator: // // branch1 // / \ @@ -151,7 +151,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION else if (_M_match_token(_ScannerT::_S_token_line_end)) _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_end())); else if (_M_match_token(_ScannerT::_S_token_word_bound)) - // _M_value[0] == 'n' means it's negtive, say "not word boundary". + // _M_value[0] == 'n' means it's negative, say "not word boundary". _M_stack.push(_StateSeqT(_M_nfa, _M_nfa. _M_insert_word_bound(_M_value[0] == 'n'))); else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin)) @@ -256,7 +256,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __end = _M_nfa._M_insert_dummy(); // _M_alt is the "match more" branch, and _M_next is the // "match less" one. Switch _M_alt and _M_next of all created - // nodes. This is a hacking but IMO works well. + // nodes. This is a hack but IMO works well. std::stack<_StateIdT> __stack; for (long __i = 0; __i < __n; ++__i) { @@ -511,12 +511,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _BracketMatcher<_TraitsT, __icase, __collate>:: _M_apply(_CharT __ch, false_type) const { - bool __ret = false; - if (std::find(_M_char_set.begin(), _M_char_set.end(), - _M_translator._M_translate(__ch)) - != _M_char_set.end()) - __ret = true; - else + bool __ret = std::binary_search(_M_char_set.begin(), _M_char_set.end(), + _M_translator._M_translate(__ch)); + if (!__ret) { auto __s = _M_translator._M_transform(__ch); for (auto& __it : _M_range_set) diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index c110b88..1991c00 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -42,8 +42,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION */ /** - * @brief Takes a regex and an input string in and - * do the matching. + * @brief Takes a regex and an input string and does the matching. * * The %_Executor class has two modes: DFS mode and BFS mode, controlled * by the template parameter %__dfs_mode. @@ -52,6 +51,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool __dfs_mode> class _Executor { + using __search_mode = integral_constant<bool, __dfs_mode>; + using __dfs = true_type; + using __bfs = false_type; + + enum class _Match_mode : unsigned char { _Exact, _Prefix }; + public: typedef typename iterator_traits<_BiIter>::value_type _CharT; typedef basic_regex<_CharT, _TraitsT> _RegexT; @@ -71,24 +76,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_re(__re), _M_nfa(*__re._M_automaton), _M_results(__results), - _M_match_queue(__dfs_mode ? nullptr - : new vector<pair<_StateIdT, _ResultsVec>>()), _M_rep_count(_M_nfa.size()), - _M_visited(__dfs_mode ? nullptr : new vector<bool>(_M_nfa.size())), + _M_states(_M_nfa._M_start(), _M_nfa.size()), _M_flags((__flags & regex_constants::match_prev_avail) ? (__flags & ~regex_constants::match_not_bol & ~regex_constants::match_not_bow) - : __flags), - _M_start_state(_M_nfa._M_start()) + : __flags) { } - // Set matched when string exactly match the pattern. + // Set matched when string exactly matches the pattern. bool _M_match() { _M_current = _M_begin; - return _M_main<true>(); + return _M_main(_Match_mode::_Exact); } // Set matched when some prefix of the string matches the pattern. @@ -96,24 +98,28 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_search_from_first() { _M_current = _M_begin; - return _M_main<false>(); + return _M_main(_Match_mode::_Prefix); } bool _M_search(); private: - template<bool __match_mode> - void - _M_rep_once_more(_StateIdT); + void + _M_rep_once_more(_Match_mode __match_mode, _StateIdT); - template<bool __match_mode> - void - _M_dfs(_StateIdT __start); + void + _M_dfs(_Match_mode __match_mode, _StateIdT __start); - template<bool __match_mode> - bool - _M_main(); + bool + _M_main(_Match_mode __match_mode) + { return _M_main_dispatch(__match_mode, __search_mode{}); } + + bool + _M_main_dispatch(_Match_mode __match_mode, __dfs); + + bool + _M_main_dispatch(_Match_mode __match_mode, __bfs); bool _M_is_word(_CharT __ch) const @@ -144,6 +150,53 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_lookahead(_State<_TraitsT> __state); + // Holds additional information used in BFS-mode. + template<typename _SearchMode, typename _ResultsVec> + struct _State_info; + + template<typename _ResultsVec> + struct _State_info<__bfs, _ResultsVec> + { + explicit + _State_info(_StateIdT __start, size_t __n) + : _M_start(__start), _M_visited_states(new bool[__n]()) + { } + + bool _M_visited(_StateIdT __i) + { + if (_M_visited_states[__i]) + return true; + _M_visited_states[__i] = true; + return false; + } + + void _M_queue(_StateIdT __i, const _ResultsVec& __res) + { _M_match_queue.emplace_back(__i, __res); } + + // Saves states that need to be considered for the next character. + vector<pair<_StateIdT, _ResultsVec>> _M_match_queue; + // Indicates which states are already visited. + unique_ptr<bool[]> _M_visited_states; + // To record current solution. + _StateIdT _M_start; + }; + + template<typename _ResultsVec> + struct _State_info<__dfs, _ResultsVec> + { + explicit + _State_info(_StateIdT __start, size_t) : _M_start(__start) + { } + + // Dummy implementations for DFS mode. + bool _M_visited(_StateIdT) const { return false; } + void _M_queue(_StateIdT, const _ResultsVec&) { } + + // To record current solution. + _StateIdT _M_start; + }; + + public: _ResultsVec _M_cur_results; _BiIter _M_current; @@ -152,15 +205,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION const _RegexT& _M_re; const _NFAT& _M_nfa; _ResultsVec& _M_results; - // Used in BFS, saving states that need to be considered for the next - // character. - unique_ptr<vector<pair<_StateIdT, _ResultsVec>>> _M_match_queue; - // Used in BFS, indicating that which state is already visited. vector<pair<_BiIter, int>> _M_rep_count; - unique_ptr<vector<bool>> _M_visited; + _State_info<__search_mode, _ResultsVec> _M_states; _FlagT _M_flags; - // To record current solution. - _StateIdT _M_start_state; // Do we have a solution so far? bool _M_has_sol; }; diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index 7f89933..aefa8f4 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -35,7 +35,7 @@ namespace __detail _GLIBCXX_BEGIN_NAMESPACE_VERSION template<typename _BiIter, typename _Alloc, typename _TraitsT, - bool __dfs_mode> + bool __dfs_mode> bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: _M_search() { @@ -45,7 +45,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION do { _M_current = __cur; - if (_M_main<false>()) + if (_M_main(_Match_mode::_Prefix)) return true; } // Continue when __cur == _M_end @@ -53,8 +53,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return false; } - // This function operates in different modes, DFS mode or BFS mode, indicated - // by template parameter __dfs_mode. See _M_main for details. + // The _M_main function operates in different modes, DFS mode or BFS mode, + // indicated by template parameter __dfs_mode, and dispatches to one of the + // _M_main_dispatch overloads. // // ------------------------------------------------------------ // @@ -62,12 +63,12 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // // It applies a Depth-First-Search (aka backtracking) on given NFA and input // string. - // At the very beginning the executor stands in the start state, then it tries - // every possible state transition in current state recursively. Some state - // transitions consume input string, say, a single-char-matcher or a + // At the very beginning the executor stands in the start state, then it + // tries every possible state transition in current state recursively. Some + // state transitions consume input string, say, a single-char-matcher or a // back-reference matcher; some don't, like assertion or other anchor nodes. - // When the input is exhausted and/or the current state is an accepting state, - // the whole executor returns true. + // When the input is exhausted and/or the current state is an accepting + // state, the whole executor returns true. // // TODO: This approach is exponentially slow for certain input. // Try to compile the NFA to a DFA. @@ -75,6 +76,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // Time complexity: \Omega(match_length), O(2^(_M_nfa.size())) // Space complexity: \theta(match_results.size() + match_length) // + template<typename _BiIter, typename _Alloc, typename _TraitsT, + bool __dfs_mode> + bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_main_dispatch(_Match_mode __match_mode, __dfs) + { + _M_has_sol = false; + _M_cur_results = _M_results; + _M_dfs(__match_mode, _M_states._M_start); + return _M_has_sol; + } + // ------------------------------------------------------------ // // BFS mode: @@ -84,11 +96,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // // It first computes epsilon closure (states that can be achieved without // consuming characters) for every state that's still matching, - // using the same DFS algorithm, but doesn't re-enter states (find a true in - // _M_visited), nor follows _S_opcode_match. + // using the same DFS algorithm, but doesn't re-enter states (using + // _M_states._M_visited to check), nor follow _S_opcode_match. // - // Then apply DFS using every _S_opcode_match (in _M_match_queue) as the start - // state. + // Then apply DFS using every _S_opcode_match (in _M_states._M_match_queue) + // as the start state. // // It significantly reduces potential duplicate states, so has a better // upper bound; but it requires more overhead. @@ -98,60 +110,45 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // Space complexity: \Omega(_M_nfa.size() + match_results.size()) // O(_M_nfa.size() * match_results.size()) template<typename _BiIter, typename _Alloc, typename _TraitsT, - bool __dfs_mode> - template<bool __match_mode> + bool __dfs_mode> bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: - _M_main() + _M_main_dispatch(_Match_mode __match_mode, __bfs) { - if (__dfs_mode) + _M_states._M_queue(_M_states._M_start, _M_results); + bool __ret = false; + while (1) { _M_has_sol = false; - _M_cur_results = _M_results; - _M_dfs<__match_mode>(_M_start_state); - return _M_has_sol; - } - else - { - _M_match_queue->push_back(make_pair(_M_start_state, _M_results)); - bool __ret = false; - while (1) + if (_M_states._M_match_queue.empty()) + break; + std::fill_n(_M_states._M_visited_states.get(), _M_nfa.size(), false); + auto __old_queue = std::move(_M_states._M_match_queue); + for (auto& __task : __old_queue) { - _M_has_sol = false; - if (_M_match_queue->empty()) - break; - _M_visited->assign(_M_visited->size(), false); - auto _M_old_queue = std::move(*_M_match_queue); - for (auto __task : _M_old_queue) - { - _M_cur_results = __task.second; - _M_dfs<__match_mode>(__task.first); - } - if (!__match_mode) - __ret |= _M_has_sol; - if (_M_current == _M_end) - break; - ++_M_current; + _M_cur_results = std::move(__task.second); + _M_dfs(__match_mode, __task.first); } - if (__match_mode) - __ret = _M_has_sol; - return __ret; + if (__match_mode == _Match_mode::_Prefix) + __ret |= _M_has_sol; + if (_M_current == _M_end) + break; + ++_M_current; } + if (__match_mode == _Match_mode::_Exact) + __ret = _M_has_sol; + return __ret; } // Return whether now match the given sub-NFA. template<typename _BiIter, typename _Alloc, typename _TraitsT, - bool __dfs_mode> + bool __dfs_mode> bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: _M_lookahead(_State<_TraitsT> __state) { _ResultsVec __what(_M_cur_results.size()); - auto __sub = std::unique_ptr<_Executor>(new _Executor(_M_current, - _M_end, - __what, - _M_re, - _M_flags)); - __sub->_M_start_state = __state._M_alt; - if (__sub->_M_search_from_first()) + _Executor __sub(_M_current, _M_end, __what, _M_re, _M_flags); + __sub._M_states._M_start = __state._M_alt; + if (__sub._M_search_from_first()) { for (size_t __i = 0; __i < __what.size(); __i++) if (__what[__i].matched) @@ -169,9 +166,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // we need to spare one more time for potential group capture. template<typename _BiIter, typename _Alloc, typename _TraitsT, bool __dfs_mode> - template<bool __match_mode> void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: - _M_rep_once_more(_StateIdT __i) + _M_rep_once_more(_Match_mode __match_mode, _StateIdT __i) { const auto& __state = _M_nfa[__i]; auto& __rep_count = _M_rep_count[__i]; @@ -180,7 +176,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __back = __rep_count; __rep_count.first = _M_current; __rep_count.second = 1; - _M_dfs<__match_mode>(__state._M_alt); + _M_dfs(__match_mode, __state._M_alt); __rep_count = __back; } else @@ -188,24 +184,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (__rep_count.second < 2) { __rep_count.second++; - _M_dfs<__match_mode>(__state._M_alt); + _M_dfs(__match_mode, __state._M_alt); __rep_count.second--; } } }; template<typename _BiIter, typename _Alloc, typename _TraitsT, - bool __dfs_mode> - template<bool __match_mode> + bool __dfs_mode> void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: - _M_dfs(_StateIdT __i) + _M_dfs(_Match_mode __match_mode, _StateIdT __i) { - if (!__dfs_mode) - { - if ((*_M_visited)[__i]) - return; - (*_M_visited)[__i] = true; - } + if (_M_states._M_visited(__i)) + return; const auto& __state = _M_nfa[__i]; // Every change on _M_cur_results and _M_current will be rolled back after @@ -221,33 +212,33 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // Greedy. if (!__state._M_neg) { - _M_rep_once_more<__match_mode>(__i); + _M_rep_once_more(__match_mode, __i); // If it's DFS executor and already accepted, we're done. if (!__dfs_mode || !_M_has_sol) - _M_dfs<__match_mode>(__state._M_next); + _M_dfs(__match_mode, __state._M_next); } else // Non-greedy mode { if (__dfs_mode) { // vice-versa. - _M_dfs<__match_mode>(__state._M_next); + _M_dfs(__match_mode, __state._M_next); if (!_M_has_sol) - _M_rep_once_more<__match_mode>(__i); + _M_rep_once_more(__match_mode, __i); } else { // DON'T attempt anything, because there's already another - // state with higher priority accepted. This state cannot be - // better by attempting its next node. + // state with higher priority accepted. This state cannot + // be better by attempting its next node. if (!_M_has_sol) { - _M_dfs<__match_mode>(__state._M_next); + _M_dfs(__match_mode, __state._M_next); // DON'T attempt anything if it's already accepted. An // accepted state *must* be better than a solution that // matches a non-greedy quantifier one more time. if (!_M_has_sol) - _M_rep_once_more<__match_mode>(__i); + _M_rep_once_more(__match_mode, __i); } } } @@ -258,7 +249,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto& __res = _M_cur_results[__state._M_subexpr]; auto __back = __res.first; __res.first = _M_current; - _M_dfs<__match_mode>(__state._M_next); + _M_dfs(__match_mode, __state._M_next); __res.first = __back; } break; @@ -268,27 +259,27 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION auto __back = __res; __res.second = _M_current; __res.matched = true; - _M_dfs<__match_mode>(__state._M_next); + _M_dfs(__match_mode, __state._M_next); __res = __back; } break; case _S_opcode_line_begin_assertion: if (_M_at_begin()) - _M_dfs<__match_mode>(__state._M_next); + _M_dfs(__match_mode, __state._M_next); break; case _S_opcode_line_end_assertion: if (_M_at_end()) - _M_dfs<__match_mode>(__state._M_next); + _M_dfs(__match_mode, __state._M_next); break; case _S_opcode_word_boundary: if (_M_word_boundary(__state) == !__state._M_neg) - _M_dfs<__match_mode>(__state._M_next); + _M_dfs(__match_mode, __state._M_next); break; // Here __state._M_alt offers a single start node for a sub-NFA. // We recursively invoke our algorithm to match the sub-NFA. case _S_opcode_subexpr_lookahead: if (_M_lookahead(__state) == !__state._M_neg) - _M_dfs<__match_mode>(__state._M_next); + _M_dfs(__match_mode, __state._M_next); break; case _S_opcode_match: if (__dfs_mode) @@ -296,14 +287,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (_M_current != _M_end && __state._M_matches(*_M_current)) { ++_M_current; - _M_dfs<__match_mode>(__state._M_next); + _M_dfs(__match_mode, __state._M_next); --_M_current; } } else if (__state._M_matches(*_M_current)) - _M_match_queue->push_back(make_pair(__state._M_next, - _M_cur_results)); + _M_states._M_queue(__state._M_next, _M_cur_results); break; // First fetch the matched result from _M_cur_results as __submatch; // then compare it with @@ -328,11 +318,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { auto __backup = _M_current; _M_current = __last; - _M_dfs<__match_mode>(__state._M_next); + _M_dfs(__match_mode, __state._M_next); _M_current = __backup; } else - _M_dfs<__match_mode>(__state._M_next); + _M_dfs(__match_mode, __state._M_next); } } break; @@ -340,7 +330,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (__dfs_mode) { _GLIBCXX_DEBUG_ASSERT(!_M_has_sol); - if (__match_mode) + if (__match_mode == _Match_mode::_Exact) _M_has_sol = _M_current == _M_end; else _M_has_sol = true; @@ -355,7 +345,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (_M_current == _M_begin && (_M_flags & regex_constants::match_not_null)) break; - if (!__match_mode || _M_current == _M_end) + if (__match_mode == _Match_mode::_Prefix || _M_current == _M_end) if (!_M_has_sol) { _M_has_sol = true; @@ -364,9 +354,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } break; case _S_opcode_alternative: - _M_dfs<__match_mode>(__state._M_alt); + _M_dfs(__match_mode, __state._M_alt); if (!__dfs_mode || !_M_has_sol) - _M_dfs<__match_mode>(__state._M_next); + _M_dfs(__match_mode, __state._M_next); break; default: _GLIBCXX_DEBUG_ASSERT(false); @@ -375,7 +365,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // Return whether now is at some word boundary. template<typename _BiIter, typename _Alloc, typename _TraitsT, - bool __dfs_mode> + bool __dfs_mode> bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: _M_word_boundary(_State<_TraitsT> __state) const { diff --git a/libstdc++-v3/include/bits/regex_scanner.tcc b/libstdc++-v3/include/bits/regex_scanner.tcc index f501ff6..818e47b 100644 --- a/libstdc++-v3/include/bits/regex_scanner.tcc +++ b/libstdc++-v3/include/bits/regex_scanner.tcc @@ -227,14 +227,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } } // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted - // literally. So "[]]" or "[^]]" is valid regex. See the testcases + // literally. So "[]]" and "[^]]" are valid regexes. See the testcases // `*/empty_range.cc`. else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) { _M_token = _S_token_bracket_end; _M_state = _S_state_normal; } - // ECMAScirpt and awk permmits escaping in bracket. + // ECMAScript and awk permits escaping in bracket. else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) (this->*_M_eat_escape)(); else @@ -344,7 +344,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } _M_token = _S_token_hex_num; } - // ECMAScript recongnizes multi-digit back-references. + // ECMAScript recognizes multi-digit back-references. else if (_M_ctype.is(_CtypeT::digit, __c)) { _M_value.assign(1, __c); @@ -392,6 +392,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION else { #ifdef __STRICT_ANSI__ + // POSIX says it is undefined to escape ordinary characters __throw_regex_error(regex_constants::error_escape); #else _M_token = _S_token_ord_char; @@ -435,8 +436,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __throw_regex_error(regex_constants::error_escape); } - // Eats a character class or throwns an exception. - // __ch cound be ':', '.' or '=', _M_current is the char after ']' when + // Eats a character class or throws an exception. + // __ch could be ':', '.' or '=', _M_current is the char after ']' when // returning. template<typename _CharT> void diff --git a/libstdc++-v3/testsuite/performance/28_regex/range.cc b/libstdc++-v3/testsuite/performance/28_regex/range.cc new file mode 100644 index 0000000..891cc2e --- /dev/null +++ b/libstdc++-v3/testsuite/performance/28_regex/range.cc @@ -0,0 +1,41 @@ +// Copyright (C) 2014 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +#include <regex> +#include <testsuite_performance.h> + +using namespace __gnu_test; + +int main() +{ + time_counter time; + resource_counter resource; + + start_counters(time, resource); + + // this should get compiled to just L"[abcd]" + auto re = std::wregex(L'[' + std::wstring(300, L'a') + L"bc" + + std::wstring(1000, 'a') + L"d]"); + bool ok = true; + for (int i = 0; i < 100000; ++i) + ok = ok && (std::regex_match(L"b", re) && std::regex_match(L"d", re)); + + stop_counters(time, resource); + report_performance(__FILE__, "", time, resource); + + return ok ? 0 : 1; +}

[1/N] std::regex refactoring - _BracketMatcher

Commit Message

Patch