From patchwork Mon Apr 28 12:40:03 2014 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Jonathan Wakely X-Patchwork-Id: 343386 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 03B0214008F for ; Mon, 28 Apr 2014 23:04:37 +1000 (EST) DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:date :from:to:cc:subject:message-id:mime-version:content-type; q=dns; s=default; b=DPkBPltUL6CLPcEdyE0EUE1KRnxr+w3RTZCy7jARod9ScgA/Wo TAk1/FySGwGTmeTg9hKWSTHhbePvmjYgsx2lvkUnTaXczDcrNsMwAoZypdD/YWPK T7b3vThSQ3HHuaepaTvpuLOJXj4+nZtp8movw3qeQb/wV8atwYm3+WxwQ= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:date :from:to:cc:subject:message-id:mime-version:content-type; s= default; bh=ZTWHtM/Ic3kGnvKNh7WJ/anY5xA=; b=MootjAyjycbRb0MyC1fW HxoSB2bvEc++Z7Oq8D955xRWIk5tJul3IXR7UfKfxOBEPvaytW72n56UyQD+X0nk wIzC1j+7NXA+Wu2J8eeFNJjBC9j9DNQyXhYGt+ykQZ1WjHsDPii/TMnWgSfqANI4 Ev7zJGU7DlQlYJ62ormisaw= Received: (qmail 8478 invoked by alias); 28 Apr 2014 13:04:30 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Received: (qmail 8452 invoked by uid 89); 28 Apr 2014 13:04:30 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-0.9 required=5.0 tests=AWL, BAYES_00, RP_MATCHES_RCVD, SPF_HELO_PASS, SPF_PASS, TBC, T_FILL_THIS_FORM_SHORT autolearn=no version=3.3.2 X-Spam-User: qpsmtpd, 2 recipients X-HELO: mx1.redhat.com Received: from mx1.redhat.com (HELO mx1.redhat.com) (209.132.183.28) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Mon, 28 Apr 2014 13:04:29 +0000 Received: from int-mx12.intmail.prod.int.phx2.redhat.com (int-mx12.intmail.prod.int.phx2.redhat.com [10.5.11.25]) by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id s3SD4SJA019988 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-GCM-SHA384 bits=256 verify=OK); Mon, 28 Apr 2014 09:04:28 -0400 Received: from localhost (vpn1-4-80.ams2.redhat.com [10.36.4.80]) by int-mx12.intmail.prod.int.phx2.redhat.com (8.14.4/8.14.4) with ESMTP id s3SCe45J027591; Mon, 28 Apr 2014 08:40:04 -0400 Date: Mon, 28 Apr 2014 13:40:03 +0100 From: Jonathan Wakely To: Tim Shen Cc: libstdc++@gcc.gnu.org, gcc-patches@gcc.gnu.org Subject: [patch 1/N] std::regex refactoring - _BracketMatcher Message-ID: <20140428124003.GW928@redhat.com> MIME-Version: 1.0 Content-Disposition: inline User-Agent: Mutt/1.5.21 (2010-09-15) Hi, I've been looking through the regex code and have a few ideas for simplifications or optimisations that I'd like to share. This first patch is for _BracketMatcher. We only use std::bitset when is_same<_CharT, char> so 8 * sizeof(_CharT) should be __CHAR_BIT__ instead. We also only user _UnsignedCharT when is_same<_CharT, char> so that can just be simplified to unsigned char. The contents of _BracketMatcher::_M_char_set are not sorted and can contain duplicates in the current code. Making that a sorted, unique list in _BracketMatcher::_M_ready() allows a binary search instead of linear search. This improves worst case performance for pathological regular expressions like std::wregex('['+std::wstring(1000, 'a')+"b]") but I'm not sure if it helps in the common case. Finally, in the non-char case the _CacheT member is an unused empty object, so having that as the first member requires 7 bytes of padding. Re-ordering the members reduces the size of a non-char _BracketMatcher by 8 bytes (but it's still a whopping 96 bytes). (For a char _BracketMatcher the bitset cache makes it 128 bytes, this patch doesn't change that). Thoughts? diff --git a/libstdc++-v3/include/bits/regex_compiler.h b/libstdc++-v3/include/bits/regex_compiler.h index f5a198f..a9dd8d3 100644 --- a/libstdc++-v3/include/bits/regex_compiler.h +++ b/libstdc++-v3/include/bits/regex_compiler.h @@ -396,6 +396,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION void _M_ready() { + std::sort(_M_char_set.begin(), _M_char_set.end()); + auto __end = std::unique(_M_char_set.begin(), _M_char_set.end()); + _M_char_set.erase(__end, _M_char_set.end()); _M_make_cache(_IsChar()); #ifdef _GLIBCXX_DEBUG _M_is_ready = true; @@ -405,10 +408,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION private: typedef typename is_same<_CharT, char>::type _IsChar; struct _Dummy { }; + static constexpr size_t _S_cache_size() { return 1ul << __CHAR_BIT__; } typedef typename conditional<_IsChar::value, - std::bitset<1 << (8 * sizeof(_CharT))>, + std::bitset<_S_cache_size()>, _Dummy>::type _CacheT; - typedef typename make_unsigned<_CharT>::type _UnsignedCharT; private: bool @@ -416,14 +419,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_apply(_CharT __ch, true_type) const - { return _M_cache[static_cast<_UnsignedCharT>(__ch)]; } + { return _M_cache[static_cast(__ch)]; } void _M_make_cache(true_type) { - for (int __i = 0; __i < _M_cache.size(); __i++) - _M_cache[static_cast<_UnsignedCharT>(__i)] = - _M_apply(__i, false_type()); + for (unsigned __i = 0; __i < _S_cache_size(); __i++) + _M_cache[__i] = _M_apply(static_cast(__i), false_type()); } void @@ -431,13 +433,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { } private: - _CacheT _M_cache; std::vector<_CharT> _M_char_set; std::vector<_StringT> _M_equiv_set; std::vector> _M_range_set; _CharClassT _M_class_set; _TransT _M_translator; const _TraitsT& _M_traits; + _CacheT _M_cache; bool _M_is_non_matching; #ifdef _GLIBCXX_DEBUG bool _M_is_ready; diff --git a/libstdc++-v3/include/bits/regex_compiler.tcc b/libstdc++-v3/include/bits/regex_compiler.tcc index 128dac1..36edfba 100644 --- a/libstdc++-v3/include/bits/regex_compiler.tcc +++ b/libstdc++-v3/include/bits/regex_compiler.tcc @@ -507,12 +507,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _BracketMatcher<_TraitsT, __icase, __collate>:: _M_apply(_CharT __ch, false_type) const { - bool __ret = false; - if (std::find(_M_char_set.begin(), _M_char_set.end(), - _M_translator._M_translate(__ch)) - != _M_char_set.end()) - __ret = true; - else + bool __ret = std::binary_search(_M_char_set.begin(), _M_char_set.end(), + _M_translator._M_translate(__ch)); + if (!__ret) { auto __s = _M_translator._M_transform(__ch); for (auto& __it : _M_range_set)