
[2/3] libcpp: replace SSE4.2 helper with an SSSE3 one

Message ID 20240806161850.18839-2-amonakov@ispras.ru
State New
Series libcpp: improve x86 vectorized helpers

Commit Message

Alexander Monakov Aug. 6, 2024, 4:18 p.m. UTC
Since the characters we are searching for (CR, LF, '\', '?') all have
distinct ASCII codes mod 16, PSHUFB can help match them all at once.
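
A scalar model of the trick (illustration only, not part of the patch): each
character C we look for sits in slot C % 16 of a 16-byte table, and a byte B
matches iff lut[B & 15] == B.  Slot 0 holds a nonzero dummy so NUL bytes never
match, and bytes with the high bit set cannot match either, since PSHUFB
zeroes those lanes and the table holds no values >= 128.

  /* Scalar model of the PSHUFB-based matcher, for illustration.  */
  #include <stdio.h>

  int
  main (void)
  {
    /* Same layout as the patch: '\n' at slot 10, '\\' at 12, '\r' at 13,
       '?' at 15; slot 0 is 1 so a zero byte cannot match.  */
    static const unsigned char lut[16]
      = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' };
    static const char probe[] = "a?b\\c\r\n";
    for (const char *p = probe; *p; p++)
      {
        unsigned char b = (unsigned char) *p;
        printf ("%3u %s\n", b, lut[b & 15] == b ? "match" : "-");
      }
    return 0;
  }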

libcpp/ChangeLog:

	* lex.cc (search_line_sse42): Replace with...
	(search_line_ssse3): ... this new function.  Adjust the use...
	(init_vectorized_lexer): ... here.
---
 libcpp/lex.cc | 118 ++++++++++++++++++++------------------------------
 1 file changed, 46 insertions(+), 72 deletions(-)

Comments

Andi Kleen Aug. 6, 2024, 6:50 p.m. UTC | #1
> -      s += 16;
> +      v16qi data, t;
> +      /* Unaligned load.  Reading beyond the final newline is safe, since
> +	 files.cc:read_file_guts pads the allocation.  */

You need to change that function to use 32 byte padding as Jakub
pointed out (I forgot that too)

> +      data = *(const v16qi_u *)s;
> +      /* Prevent propagation into pshufb and pcmp as memory operand.  */
> +      __asm__ ("" : "+x" (data));

It would probably make sense to file a PR on this separately,
to eventually fix the compiler to not need such workarounds.
Not sure how much difference it makes however.

-Andi
Andi Kleen Aug. 6, 2024, 6:53 p.m. UTC | #2
On Tue, Aug 06, 2024 at 11:50:00AM -0700, Andi Kleen wrote:
> > -      s += 16;
> > +      v16qi data, t;
> > +      /* Unaligned load.  Reading beyond the final newline is safe, since
> > +	 files.cc:read_file_guts pads the allocation.  */
> 
> You need to change that function to use 32 byte padding as Jakub
> pointed out (I forgot that too)

Never mind, it's in the next patch.
Richard Biener Aug. 7, 2024, 7:36 a.m. UTC | #3
On Tue, Aug 6, 2024 at 8:50 PM Andi Kleen <andi@firstfloor.org> wrote:
>
> > -      s += 16;
> > +      v16qi data, t;
> > +      /* Unaligned load.  Reading beyond the final newline is safe, since
> > +      files.cc:read_file_guts pads the allocation.  */
>
> You need to change that function to use 32 byte padding as Jakub
> pointed out (I forgot that too)
>
> > +      data = *(const v16qi_u *)s;
> > +      /* Prevent propagation into pshufb and pcmp as memory operand.  */
> > +      __asm__ ("" : "+x" (data));
>
> It would probably make sense to file a PR on this separately,
> to eventually fix the compiler to not need such workarounds.
> Not sure how much difference it makes however.

This is probably to work around bugs in older compiler versions?  If
not I agree.

Otherwise the patch is OK.

Thanks,
Richard.

> -Andi
Alexander Monakov Aug. 7, 2024, 9:08 a.m. UTC | #4
On Wed, 7 Aug 2024, Richard Biener wrote:

> > > +      data = *(const v16qi_u *)s;
> > > +      /* Prevent propagation into pshufb and pcmp as memory operand.  */
> > > +      __asm__ ("" : "+x" (data));
> >
> > It would probably make sense to file a PR on this separately,
> > to eventually fix the compiler to not need such workarounds.
> > Not sure how much difference it makes however.
> 
> This is probably to work around bugs in older compiler versions?  If
> not I agree.

This is deliberate hand-tuning to avoid a subtle issue: pshufb is not
micro-fused on Intel, so with propagation it is two uops early in the
CPU front-end.

The "propagation" actually falls out of IRA/LRA decisions, and stopped
happening in gcc-14. I'm not sure if there were relevant RA changes.
In any case, this can potentially flip-flop in the future again.

Considering the trunk gets this right, I think the next move is to
add a testcase for this, not a PR, correct?
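
For reference, the idiom boils down to roughly the following (an intrinsics
sketch for illustration, compiled with -mssse3, not the code from the patch):

  #include <tmmintrin.h>

  int
  match16 (const char *s, __m128i lut)
  {
    __m128i data = _mm_loadu_si128 ((const __m128i *) s);
    /* The empty asm pins 'data' to a register, so the load cannot be
       folded into the pshufb as a memory operand.  */
    __asm__ ("" : "+x" (data));
    __m128i t = _mm_shuffle_epi8 (lut, data);
    return _mm_movemask_epi8 (_mm_cmpeq_epi8 (t, data));
  }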

> Otherwise the patch is OK.

Still OK with the asms, or would you prefer them be taken out?

Thanks.

Alexander
Richard Biener Aug. 7, 2024, 11:16 a.m. UTC | #5
On Wed, Aug 7, 2024 at 11:08 AM Alexander Monakov <amonakov@ispras.ru> wrote:
>
>
> On Wed, 7 Aug 2024, Richard Biener wrote:
>
> > > > +      data = *(const v16qi_u *)s;
> > > > +      /* Prevent propagation into pshufb and pcmp as memory operand.  */
> > > > +      __asm__ ("" : "+x" (data));
> > >
> > > It would probably make sense to file a PR on this separately,
> > > to eventually fix the compiler to not need such workarounds.
> > > Not sure how much difference it makes however.
> >
> > This is probably to work around bugs in older compiler versions?  If
> > not I agree.
>
> This is deliberate hand-tuning to avoid a subtle issue: pshufb is not
> > micro-fused on Intel, so with propagation it is two uops early in the
> CPU front-end.
>
> The "propagation" actually falls out of IRA/LRA decisions, and stopped
> happening in gcc-14. I'm not sure if there were relevant RA changes.
> In any case, this can potentially flip-flop in the future again.
>
> Considering the trunk gets this right, I think the next move is to
> add a testcase for this, not a PR, correct?

Well, merging the memory operand into the pshufb would be wrong - embedded
memory ops are always considered aligned, no?

> > Otherwise the patch is OK.
>
> Still OK with the asms, or would you prefer them be taken out?

I think it's OK with the asms.

Richard.

> Thanks.
>
> Alexander
Jakub Jelinek Aug. 7, 2024, 11:37 a.m. UTC | #6
On Wed, Aug 07, 2024 at 01:16:20PM +0200, Richard Biener wrote:
> Well, merging the memory operand into the pshufb would be wrong - embedded
> memory ops are always considered aligned, no?

Depends.  For the VEX/EVEX encodings the memory operand can be unaligned;
for the pre-AVX encoding it must be aligned, except in explicitly unaligned
instructions.

	Jakub
Alexander Monakov Aug. 7, 2024, 11:37 a.m. UTC | #7
On Wed, 7 Aug 2024, Richard Biener wrote:

> > > This is probably to work around bugs in older compiler versions?  If
> > > not I agree.
> >
> > This is deliberate hand-tuning to avoid a subtle issue: pshufb is not
> > micro-fused on Intel, so with propagation it is two uops early in the
> > CPU front-end.
> >
> > The "propagation" actually falls out of IRA/LRA decisions, and stopped
> > happening in gcc-14. I'm not sure if there were relevant RA changes.
> > In any case, this can potentially flip-flop in the future again.
> >
> > Considering the trunk gets this right, I think the next move is to
> > add a testcase for this, not a PR, correct?
> 
> Well, merging the memory operand into the pshufb would be wrong - embedded
> memory ops are always considered aligned, no?

In SSE yes, in AVX no. For search_line_ssse3 the asms help if it is compiled
with e.g. -march=sandybridge (i.e. for a CPU that has AVX but lacks AVX2):
then VEX-encoded SSE instructions accept misaligned memory, and we want to
prevent that here.
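
(For context, the loads themselves go through an unaligned vector type,
roughly like the hypothetical helpers below; for plain SSSE3 the unaligned
load therefore stays a separate movdqu, while under -mavx the compiler is
allowed to fold it into vpshufb's memory operand, which is what the empty
asm prevents.)

  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  typedef v16qi v16qi_u __attribute__ ((__aligned__ (1)));

  /* May assume a 16-byte-aligned address.  */
  v16qi load_a (const v16qi *p)   { return *p; }
  /* Must not assume alignment: expands to an unaligned load, which may only
     be folded into another insn if that insn tolerates misalignment.  */
  v16qi load_u (const v16qi_u *p) { return *p; }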

Alexander
Richard Biener Aug. 7, 2024, 11:59 a.m. UTC | #8
On Wed, Aug 7, 2024 at 1:37 PM Alexander Monakov <amonakov@ispras.ru> wrote:
>
>
> On Wed, 7 Aug 2024, Richard Biener wrote:
>
> > > > This is probably to work around bugs in older compiler versions?  If
> > > > not I agree.
> > >
> > > This is deliberate hand-tuning to avoid a subtle issue: pshufb is not
> > > micro-fused on Intel, so with propagation it is two uops early in the
> > > CPU front-end.
> > >
> > > The "propagation" actually falls out of IRA/LRA decisions, and stopped
> > > happening in gcc-14. I'm not sure if there were relevant RA changes.
> > > In any case, this can potentially flip-flop in the future again.
> > >
> > > Considering the trunk gets this right, I think the next move is to
> > > add a testcase for this, not a PR, correct?
> >
> > Well, merging the memory operand into the pshufb would be wrong - embedded
> > memory ops are always considered aligned, no?
>
> In SSE yes, in AVX no. For search_line_ssse3 the asms help if it is compiled
> with e.g. -march=sandybridge (i.e. for a CPU that has AVX but lacks AVX2):
> then VEX-encoded SSE instructions accept misaligned memory, and we want to
> prevent that here.

Ah, yeah - I think there are even existing bug reports about us being too
happy to duplicate a memory operand into multiple insns.

Richard.

>
> Alexander

Patch

diff --git a/libcpp/lex.cc b/libcpp/lex.cc
index fa9c03614c..815b8abd29 100644
--- a/libcpp/lex.cc
+++ b/libcpp/lex.cc
@@ -345,84 +345,58 @@  search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 }
 
 #ifdef HAVE_AVX2
-/* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
+/* A version of the fast scanner using SSSE3 shuffle (PSHUFB) insns.  */
 
 static const uchar *
-#ifndef __SSE4_2__
-__attribute__((__target__("sse4.2")))
+#ifndef __SSSE3__
+__attribute__((__target__("ssse3")))
 #endif
-search_line_sse42 (const uchar *s, const uchar *end)
+search_line_ssse3 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 {
   typedef char v16qi __attribute__ ((__vector_size__ (16)));
-  static const v16qi search = { '\n', '\r', '?', '\\' };
-
-  uintptr_t si = (uintptr_t)s;
-  uintptr_t index;
-
-  /* Check for unaligned input.  */
-  if (si & 15)
-    {
-      v16qi sv;
-
-      if (__builtin_expect (end - s < 16, 0)
-	  && __builtin_expect ((si & 0xfff) > 0xff0, 0))
-	{
-	  /* There are less than 16 bytes left in the buffer, and less
-	     than 16 bytes left on the page.  Reading 16 bytes at this
-	     point might generate a spurious page fault.  Defer to the
-	     SSE2 implementation, which already handles alignment.  */
-	  return search_line_sse2 (s, end);
-	}
-
-      /* ??? The builtin doesn't understand that the PCMPESTRI read from
-	 memory need not be aligned.  */
-      sv = __builtin_ia32_loaddqu ((const char *) s);
-      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
-
-      if (__builtin_expect (index < 16, 0))
-	goto found;
-
-      /* Advance the pointer to an aligned address.  We will re-scan a
-	 few bytes, but we no longer need care for reading past the
-	 end of a page, since we're guaranteed a match.  */
-      s = (const uchar *)((si + 15) & -16);
-    }
-
-  /* Main loop, processing 16 bytes at a time.  */
-#ifdef __GCC_ASM_FLAG_OUTPUTS__
-  while (1)
+  typedef v16qi v16qi_u __attribute__ ((__aligned__ (1)));
+  /* Helper vector for pshufb-based matching:
+     each character C we're searching for is at position (C % 16).  */
+  v16qi lut = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' };
+  static_assert ('\n' == 10 && '\r' == 13 && '\\' == 92 && '?' == 63);
+
+  int found;
+  /* Process three 16-byte chunks per iteration.  */
+  for (; ; s += 48)
     {
-      char f;
-
-      /* By using inline assembly instead of the builtin,
-	 we can use the result, as well as the flags set.  */
-      __asm ("%vpcmpestri\t$0, %2, %3"
-	     : "=c"(index), "=@ccc"(f)
-	     : "m"(*s), "x"(search), "a"(4), "d"(16));
-      if (f)
-	break;
-      
-      s += 16;
+      v16qi data, t;
+      /* Unaligned load.  Reading beyond the final newline is safe, since
+	 files.cc:read_file_guts pads the allocation.  */
+      data = *(const v16qi_u *)s;
+      /* Prevent propagation into pshufb and pcmp as memory operand.  */
+      __asm__ ("" : "+x" (data));
+      t = __builtin_ia32_pshufb128 (lut, data);
+      if ((found = __builtin_ia32_pmovmskb128 (t == data)))
+	goto done;
+      /* Second chunk.  */
+      data = *(const v16qi_u *)(s + 16);
+      __asm__ ("" : "+x" (data));
+      t = __builtin_ia32_pshufb128 (lut, data);
+      if ((found = __builtin_ia32_pmovmskb128 (t == data)))
+	goto add_16;
+      /* Third chunk.  */
+      data = *(const v16qi_u *)(s + 32);
+      __asm__ ("" : "+x" (data));
+      t = __builtin_ia32_pshufb128 (lut, data);
+      if ((found = __builtin_ia32_pmovmskb128 (t == data)))
+	goto add_32;
     }
-#else
-  s -= 16;
-  /* By doing the whole loop in inline assembly,
-     we can make proper use of the flags set.  */
-  __asm (      ".balign 16\n"
-	"0:	add $16, %1\n"
-	"	%vpcmpestri\t$0, (%1), %2\n"
-	"	jnc 0b"
-	: "=&c"(index), "+r"(s)
-	: "x"(search), "a"(4), "d"(16));
-#endif
-
- found:
-  return s + index;
+add_32:
+  s += 16;
+add_16:
+  s += 16;
+done:
+  return s + __builtin_ctz (found);
 }
 
 #else
-/* Work around out-dated assemblers without sse4 support.  */
-#define search_line_sse42 search_line_sse2
+/* Work around out-dated assemblers without SSSE3 support.  */
+#define search_line_ssse3 search_line_sse2
 #endif
 
 /* Check the CPU capabilities.  */
@@ -440,18 +414,18 @@  init_vectorized_lexer (void)
   search_line_fast_type impl = search_line_acc_char;
   int minimum = 0;
 
-#if defined(__SSE4_2__)
+#if defined(__SSSE3__)
   minimum = 3;
 #elif defined(__SSE2__)
   minimum = 2;
 #endif
 
   if (minimum == 3)
-    impl = search_line_sse42;
+    impl = search_line_ssse3;
   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
     {
-      if (minimum == 3 || (ecx & bit_SSE4_2))
-        impl = search_line_sse42;
+      if (minimum == 3 || (ecx & bit_SSSE3))
+	impl = search_line_ssse3;
       else if (minimum == 2 || (edx & bit_SSE2))
 	impl = search_line_sse2;
     }