Message ID | 20150626204713.GA8001@domone |
---|---|
State | New |
Headers | show |
ping On Fri, Jun 26, 2015 at 10:47:13PM +0200, Ondřej Bílka wrote: > Hi, > > I had an idea to optimize strspn further by exploiting the fact that 75% of the time the mismatch > happens in the first character. I looked at the sse4.2 implementation and found > that its control flow is suboptimal. I replaced it with the common pattern of first > checking 64 bytes unaligned unless they cross a page boundary, then using > a 16-byte loop. > > When I checked the generated assembly, gcc had messed that up. To test a flag, > gcc first zeroed a register, then used a conditional move to set that > register when the flag is set, and then finally checked whether the register is > nonzero. > > Just fixing that mistake and using the flags directly makes the loop 25-40% > faster. > > The new control flow makes strpbrk+strcspn around 10% faster in practice. > > Results are here. > http://kam.mff.cuni.cz/~ondra/benchmark_string/strpbrk_profile.html > > On my todo list is extending the sse4.2 path to handle an accept string longer than > 16 bytes. The second item is an sse2 optimization. That relates to the strpbrk prolog; > the idea is to compare the first 16 bytes of s against each byte of a, ORing the resulting > vectors together. Now I need to compare that with the overhead of constructing the 256 > byte table. Depending on the accept size I might do it several times until > it costs the same as table construction. On core2 it's 40% faster for the gcc > workload and also faster when accept is less than 6 bytes. > > The problem is that these constants are quite CPU dependent; how should I > handle that? > > So is it OK to check in this patch? > > * sysdeps/x86_64/multiarch/Makefile: Updated. > * sysdeps/x86_64/multiarch/strcspn-c.c: Deleted. > * sysdeps/x86_64/multiarch/strpbrk-c.c: Likewise. > * sysdeps/x86_64/multiarch/strspn-c.c: Likewise. > * sysdeps/x86_64/multiarch/varshift.c: Likewise. > * sysdeps/x86_64/multiarch/varshift.h: Likewise. > * sysdeps/x86_64/multiarch/strpbrk_sse42.S: New file. > * sysdeps/x86_64/multiarch/strspn_sse42.S: Likewise. > * sysdeps/x86_64/multiarch/strcspn_sse42.S: Likewise. 
> > --- > sysdeps/x86_64/multiarch/Makefile | 6 +- > sysdeps/x86_64/multiarch/strcspn-c.c | 173 -------------------------- > sysdeps/x86_64/multiarch/strcspn_sse42.S | 3 + > sysdeps/x86_64/multiarch/strpbrk-c.c | 8 -- > sysdeps/x86_64/multiarch/strpbrk_sse42.S | 204 +++++++++++++++++++++++++++++++ > sysdeps/x86_64/multiarch/strspn-c.c | 145 ---------------------- > sysdeps/x86_64/multiarch/strspn_sse42.S | 3 + > sysdeps/x86_64/multiarch/varshift.c | 25 ---- > sysdeps/x86_64/multiarch/varshift.h | 30 ----- > 9 files changed, 211 insertions(+), 386 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/strcspn-c.c > create mode 100644 sysdeps/x86_64/multiarch/strcspn_sse42.S > delete mode 100644 sysdeps/x86_64/multiarch/strpbrk-c.c > create mode 100644 sysdeps/x86_64/multiarch/strpbrk_sse42.S > delete mode 100644 sysdeps/x86_64/multiarch/strspn-c.c > create mode 100644 sysdeps/x86_64/multiarch/strspn_sse42.S > delete mode 100644 sysdeps/x86_64/multiarch/varshift.c > delete mode 100644 sysdeps/x86_64/multiarch/varshift.h > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 8094162..05d5c9b 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -22,11 +22,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ > strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned > > ifeq (yes,$(config-cflags-sse4)) > -sysdep_routines += strcspn-c strpbrk-c strspn-c varshift > -CFLAGS-varshift.c += -msse4 > -CFLAGS-strcspn-c.c += -msse4 > -CFLAGS-strpbrk-c.c += -msse4 > -CFLAGS-strspn-c.c += -msse4 > +sysdep_routines += strcspn_sse42 strpbrk_sse42 strspn_sse42 > endif > > ifeq (yes,$(config-cflags-avx2)) > diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c > deleted file mode 100644 > index 60b2ed7..0000000 > --- a/sysdeps/x86_64/multiarch/strcspn-c.c > +++ /dev/null > @@ -1,173 +0,0 @@ > -/* strcspn with SSE4.2 intrinsics > - Copyright (C) 
2009-2015 Free Software Foundation, Inc. > - Contributed by Intel Corporation. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <http://www.gnu.org/licenses/>. */ > - > -#include <nmmintrin.h> > -#include <string.h> > -#include "varshift.h" > - > -/* We use 0x2: > - _SIDD_SBYTE_OPS > - | _SIDD_CMP_EQUAL_ANY > - | _SIDD_POSITIVE_POLARITY > - | _SIDD_LEAST_SIGNIFICANT > - on pcmpistri to compare xmm/mem128 > - > - 0 1 2 3 4 5 6 7 8 9 A B C D E F > - X X X X X X X X X X X X X X X X > - > - against xmm > - > - 0 1 2 3 4 5 6 7 8 9 A B C D E F > - A A A A A A A A A A A A A A A A > - > - to find out if the first 16byte data element has any byte A and > - the offset of the first byte. There are 3 cases: > - > - 1. The first 16byte data element has the byte A at the offset X. > - 2. The first 16byte data element has EOS and doesn't have the byte A. > - 3. The first 16byte data element is valid and doesn't have the byte A. > - > - Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: > - > - 1 X 1 0/1 0 > - 2 16 0 1 0 > - 3 16 0 0 0 > - > - We exit from the loop for cases 1 and 2 with jbe which branches > - when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset > - X for case 1. 
*/ > - > -#ifndef STRCSPN_SSE2 > -# define STRCSPN_SSE2 __strcspn_sse2 > -# define STRCSPN_SSE42 __strcspn_sse42 > -#endif > - > -#ifdef USE_AS_STRPBRK > -# define RETURN(val1, val2) return val1 > -#else > -# define RETURN(val1, val2) return val2 > -#endif > - > -extern > -#ifdef USE_AS_STRPBRK > -char * > -#else > -size_t > -#endif > -STRCSPN_SSE2 (const char *, const char *); > - > - > -#ifdef USE_AS_STRPBRK > -char * > -#else > -size_t > -#endif > -__attribute__ ((section (".text.sse4.2"))) > -STRCSPN_SSE42 (const char *s, const char *a) > -{ > - if (*a == 0) > - RETURN (NULL, strlen (s)); > - > - const char *aligned; > - __m128i mask; > - int offset = (int) ((size_t) a & 15); > - if (offset != 0) > - { > - /* Load masks. */ > - aligned = (const char *) ((size_t) a & -16L); > - __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); > - > - mask = __m128i_shift_right (mask0, offset); > - > - /* Find where the NULL terminator is. */ > - int length = _mm_cmpistri (mask, mask, 0x3a); > - if (length == 16 - offset) > - { > - /* There is no NULL terminator. */ > - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); > - int index = _mm_cmpistri (mask1, mask1, 0x3a); > - length += index; > - > - /* Don't use SSE4.2 if the length of A > 16. */ > - if (length > 16) > - return STRCSPN_SSE2 (s, a); > - > - if (index != 0) > - { > - /* Combine mask0 and mask1. We could play games with > - palignr, but frankly this data should be in L1 now > - so do the merge via an unaligned load. */ > - mask = _mm_loadu_si128 ((__m128i *) a); > - } > - } > - } > - else > - { > - /* A is aligned. */ > - mask = _mm_load_si128 ((__m128i *) a); > - > - /* Find where the NULL terminator is. */ > - int length = _mm_cmpistri (mask, mask, 0x3a); > - if (length == 16) > - { > - /* There is no NULL terminator. Don't use SSE4.2 if the length > - of A > 16. 
*/ > - if (a[16] != 0) > - return STRCSPN_SSE2 (s, a); > - } > - } > - > - offset = (int) ((size_t) s & 15); > - if (offset != 0) > - { > - /* Check partial string. */ > - aligned = (const char *) ((size_t) s & -16L); > - __m128i value = _mm_load_si128 ((__m128i *) aligned); > - > - value = __m128i_shift_right (value, offset); > - > - int length = _mm_cmpistri (mask, value, 0x2); > - /* No need to check ZFlag since ZFlag is always 1. */ > - int cflag = _mm_cmpistrc (mask, value, 0x2); > - if (cflag) > - RETURN ((char *) (s + length), length); > - /* Find where the NULL terminator is. */ > - int index = _mm_cmpistri (value, value, 0x3a); > - if (index < 16 - offset) > - RETURN (NULL, index); > - aligned += 16; > - } > - else > - aligned = s; > - > - while (1) > - { > - __m128i value = _mm_load_si128 ((__m128i *) aligned); > - int index = _mm_cmpistri (mask, value, 0x2); > - int cflag = _mm_cmpistrc (mask, value, 0x2); > - int zflag = _mm_cmpistrz (mask, value, 0x2); > - if (cflag) > - RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); > - if (zflag) > - RETURN (NULL, > - /* Find where the NULL terminator is. */ > - (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); > - aligned += 16; > - } > -} > diff --git a/sysdeps/x86_64/multiarch/strcspn_sse42.S b/sysdeps/x86_64/multiarch/strcspn_sse42.S > new file mode 100644 > index 0000000..3e4e659 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strcspn_sse42.S > @@ -0,0 +1,3 @@ > +#define AS_STRCSPN > +#define __strpbrk_sse42 __strcspn_sse42 > +#include "strpbrk_sse42.S" > diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c > deleted file mode 100644 > index bbf5c49..0000000 > --- a/sysdeps/x86_64/multiarch/strpbrk-c.c > +++ /dev/null > @@ -1,8 +0,0 @@ > -/* Don't define multiple versions for strpbrk in static library since we > - need strpbrk before the initialization happened. 
*/ > -#ifdef SHARED > -# define USE_AS_STRPBRK > -# define STRCSPN_SSE2 __strpbrk_sse2 > -# define STRCSPN_SSE42 __strpbrk_sse42 > -# include "strcspn-c.c" > -#endif > diff --git a/sysdeps/x86_64/multiarch/strpbrk_sse42.S b/sysdeps/x86_64/multiarch/strpbrk_sse42.S > new file mode 100644 > index 0000000..512ac19 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strpbrk_sse42.S > @@ -0,0 +1,204 @@ > +/* strcspn (str, ss) -- Return the length of the initial segment of STR > + which contains no characters from SS. > + Copyright (C) 2015 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. 
*/ > + > +#include <sysdep.h> > + > +#ifdef AS_STRSPN > +# define AS_STRCSPN > +# define MATCH_ALL $18 > +#else > +# define MATCH_ALL $2 > +#endif > + > +ENTRY(__strpbrk_sse42) > + movq %rdi, %rax > + andl $4095, %eax > + cmp $4032, %eax > + ja L(cross_page) > + movq %rsi, %rax > + andl $4095, %eax > + cmp $4080, %eax > + ja L(cross_page) > + movdqu (%rsi), %xmm4 > + movdqu (%rdi), %xmm1 > + movdqu 16(%rdi), %xmm5 > + movdqu 32(%rdi), %xmm6 > + movdqu 48(%rdi), %xmm7 > + > +L(back_from_crosspage): > + pxor %xmm3, %xmm3 > + pxor %xmm2, %xmm2 > + > + pcmpeqb %xmm4, %xmm2 > + pmovmskb %xmm2, %eax > + testl %eax, %eax > + je L(call) > + pcmpistri MATCH_ALL, %xmm1, %xmm4 > + jc L(rx0) > + je L(ret0) > + pcmpistri MATCH_ALL, %xmm5, %xmm4 > + jc L(rx16) > + je L(ret16) > + pcmpistri MATCH_ALL, %xmm6, %xmm4 > + jc L(rx32) > + je L(ret32) > + pcmpistri MATCH_ALL, %xmm7, %xmm4 > + jc L(rx48) > + je L(ret48) > + > + movq %rdi, %rax > + andq $-16, %rax > + addq $16, %rax > + .p2align 4,,10 > + .p2align 3 > +L(loop): > + pcmpistri MATCH_ALL, (%rax), %xmm4 > + lea 16(%rax), %rax > + jc L(rx_loop) > + jne L(loop) > +#ifdef AS_STRCSPN > + movdqa -16(%rax), %xmm1 > + pcmpistri $58, %xmm1, %xmm1 > + lea -16(%rcx, %rax), %rax > + sub %rdi, %rax > +#else > + xor %eax, %eax > +#endif > + ret > +L(rx_loop): > + lea -16(%rcx, %rax), %rax > +#ifdef AS_STRCSPN > + sub %rdi, %rax > +#endif > + ret > + .p2align 4,,10 > + .p2align 3 > +#ifndef AS_STRCSPN > +L(ret0): > +L(ret16): > +L(ret32): > +L(ret48): > + xorl %eax, %eax > + ret > +#endif > +L(call): > +#ifdef AS_STRCSPN > +# ifdef AS_STRSPN > + jmp __strspn_sse2 > +# else > + jmp __strcspn_sse2 > +# endif > +#else > + jmp __strpbrk_sse2 > +#endif > + .p2align 4,,10 > + .p2align 3 > +#ifdef AS_STRCSPN > +L(ret0): > + pcmpistri $58, %xmm1, %xmm1 > +L(rx0): > + lea 0(%rcx), %rax > +#else > +L(rx0): > + leaq (%rdi,%rcx), %rax > +#endif > + ret > +#ifdef AS_STRCSPN > +L(ret16): > + pcmpistri $58, %xmm5, %xmm5 > +L(rx16): > + lea 16(%rcx), %rax 
> +#else > +L(rx16): > + leaq 16(%rdi,%rcx), %rax > +#endif > + ret > +#ifdef AS_STRCSPN > +L(ret32): > + pcmpistri $58, %xmm6, %xmm6 > +L(rx32): > + lea 32(%rcx), %rax > +#else > +L(rx32): > + leaq 32(%rdi,%rcx), %rax > +#endif > + ret > +#ifdef AS_STRCSPN > +L(ret48): > + pcmpistri $58, %xmm7, %xmm7 > +L(rx48): > + lea 48(%rcx), %rax > +#else > +L(rx48): > + leaq 48(%rdi,%rcx), %rax > +#endif > + ret > + > + .p2align 4,,10 > + .p2align 3 > +L(cross_page): > + movzbl (%rdi), %ecx > + xorl %eax, %eax > + leaq -80(%rsp), %r8 > + testb %cl, %cl > + je L(sloop_end) > + leaq -80(%rsp), %r8 > + xorl %edx, %edx > + xorl %eax, %eax > + .p2align 4,,10 > + .p2align 3 > +L(sloop): > + movb %cl, (%r8,%rdx) > + movzbl 1(%rdi,%rdx), %ecx > + addl $1, %eax > + testb %cl, %cl > + je L(sloop_end) > + addq $1, %rdx > + cmpl $64, %eax > + jne L(sloop) > +L(sloop_end): > + movzbl (%rsi), %ecx > + cltq > + movb $0, -80(%rsp,%rax) > + movdqu (%r8), %xmm1 > + movdqu 16(%r8), %xmm5 > + movdqu 32(%r8), %xmm6 > + movdqu 48(%r8), %xmm7 > + > + xorl %eax, %eax > + testb %cl, %cl > + je L(aloop_end) > + xorl %edx, %edx > + .p2align 4,,10 > + .p2align 3 > +L(aloop): > + movb %cl, (%r8,%rdx) > + movzbl 1(%rsi,%rdx), %ecx > + addl $1, %eax > + testb %cl, %cl > + je L(aloop_end) > + addq $1, %rdx > + cmpl $16, %eax > + jne L(aloop) > +L(aloop_end): > + cltq > + movb $0, -80(%rsp,%rax) > + movdqu (%r8), %xmm4 > + jmp L(back_from_crosspage) > +END(__strpbrk_sse42) > diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c > deleted file mode 100644 > index 6b0c80a..0000000 > --- a/sysdeps/x86_64/multiarch/strspn-c.c > +++ /dev/null > @@ -1,145 +0,0 @@ > -/* strspn with SSE4.2 intrinsics > - Copyright (C) 2009-2015 Free Software Foundation, Inc. > - Contributed by Intel Corporation. > - This file is part of the GNU C Library. 
> - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <http://www.gnu.org/licenses/>. */ > - > -#include <nmmintrin.h> > -#include <string.h> > -#include "varshift.h" > - > -/* We use 0x12: > - _SIDD_SBYTE_OPS > - | _SIDD_CMP_EQUAL_ANY > - | _SIDD_NEGATIVE_POLARITY > - | _SIDD_LEAST_SIGNIFICANT > - on pcmpistri to compare xmm/mem128 > - > - 0 1 2 3 4 5 6 7 8 9 A B C D E F > - X X X X X X X X X X X X X X X X > - > - against xmm > - > - 0 1 2 3 4 5 6 7 8 9 A B C D E F > - A A A A A A A A A A A A A A A A > - > - to find out if the first 16byte data element has any non-A byte and > - the offset of the first byte. There are 2 cases: > - > - 1. The first 16byte data element has the non-A byte, including > - EOS, at the offset X. > - 2. The first 16byte data element is valid and doesn't have the non-A > - byte. > - > - Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: > - > - case ECX CFlag ZFlag SFlag > - 1 X 1 0/1 0 > - 2 16 0 0 0 > - > - We exit from the loop for case 1. */ > - > -extern size_t __strspn_sse2 (const char *, const char *); > - > - > -size_t > -__attribute__ ((section (".text.sse4.2"))) > -__strspn_sse42 (const char *s, const char *a) > -{ > - if (*a == 0) > - return 0; > - > - const char *aligned; > - __m128i mask; > - int offset = (int) ((size_t) a & 15); > - if (offset != 0) > - { > - /* Load masks. 
*/ > - aligned = (const char *) ((size_t) a & -16L); > - __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); > - > - mask = __m128i_shift_right (mask0, offset); > - > - /* Find where the NULL terminator is. */ > - int length = _mm_cmpistri (mask, mask, 0x3a); > - if (length == 16 - offset) > - { > - /* There is no NULL terminator. */ > - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); > - int index = _mm_cmpistri (mask1, mask1, 0x3a); > - length += index; > - > - /* Don't use SSE4.2 if the length of A > 16. */ > - if (length > 16) > - return __strspn_sse2 (s, a); > - > - if (index != 0) > - { > - /* Combine mask0 and mask1. We could play games with > - palignr, but frankly this data should be in L1 now > - so do the merge via an unaligned load. */ > - mask = _mm_loadu_si128 ((__m128i *) a); > - } > - } > - } > - else > - { > - /* A is aligned. */ > - mask = _mm_load_si128 ((__m128i *) a); > - > - /* Find where the NULL terminator is. */ > - int length = _mm_cmpistri (mask, mask, 0x3a); > - if (length == 16) > - { > - /* There is no NULL terminator. Don't use SSE4.2 if the length > - of A > 16. */ > - if (a[16] != 0) > - return __strspn_sse2 (s, a); > - } > - } > - > - offset = (int) ((size_t) s & 15); > - if (offset != 0) > - { > - /* Check partial string. */ > - aligned = (const char *) ((size_t) s & -16L); > - __m128i value = _mm_load_si128 ((__m128i *) aligned); > - > - value = __m128i_shift_right (value, offset); > - > - int length = _mm_cmpistri (mask, value, 0x12); > - /* No need to check CFlag since it is always 1. */ > - if (length < 16 - offset) > - return length; > - /* Find where the NULL terminator is. 
*/ > - int index = _mm_cmpistri (value, value, 0x3a); > - if (index < 16 - offset) > - return length; > - aligned += 16; > - } > - else > - aligned = s; > - > - while (1) > - { > - __m128i value = _mm_load_si128 ((__m128i *) aligned); > - int index = _mm_cmpistri (mask, value, 0x12); > - int cflag = _mm_cmpistrc (mask, value, 0x12); > - if (cflag) > - return (size_t) (aligned + index - s); > - aligned += 16; > - } > -} > diff --git a/sysdeps/x86_64/multiarch/strspn_sse42.S b/sysdeps/x86_64/multiarch/strspn_sse42.S > new file mode 100644 > index 0000000..d460167 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strspn_sse42.S > @@ -0,0 +1,3 @@ > +#define AS_STRSPN > +#define __strpbrk_sse42 __strspn_sse42 > +#include "strpbrk_sse42.S" > diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c > deleted file mode 100644 > index 0007ef7..0000000 > --- a/sysdeps/x86_64/multiarch/varshift.c > +++ /dev/null > @@ -1,25 +0,0 @@ > -/* Helper for variable shifts of SSE registers. > - Copyright (C) 2010-2015 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <http://www.gnu.org/licenses/>. 
*/ > - > -#include "varshift.h" > - > -const int8_t ___m128i_shift_right[31] attribute_hidden = > - { > - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, > - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 > - }; > diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h > deleted file mode 100644 > index 30ace3d..0000000 > --- a/sysdeps/x86_64/multiarch/varshift.h > +++ /dev/null > @@ -1,30 +0,0 @@ > -/* Helper for variable shifts of SSE registers. > - Copyright (C) 2010-2015 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <http://www.gnu.org/licenses/>. */ > - > -#include <stdint.h> > -#include <tmmintrin.h> > - > -extern const int8_t ___m128i_shift_right[31] attribute_hidden; > - > -static __inline__ __m128i > -__m128i_shift_right (__m128i value, unsigned long int offset) > -{ > - return _mm_shuffle_epi8 (value, > - _mm_loadu_si128 ((__m128i *) (___m128i_shift_right > - + offset))); > -} > -- > 1.8.4.rc3
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 8094162..05d5c9b 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -22,11 +22,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned ifeq (yes,$(config-cflags-sse4)) -sysdep_routines += strcspn-c strpbrk-c strspn-c varshift -CFLAGS-varshift.c += -msse4 -CFLAGS-strcspn-c.c += -msse4 -CFLAGS-strpbrk-c.c += -msse4 -CFLAGS-strspn-c.c += -msse4 +sysdep_routines += strcspn_sse42 strpbrk_sse42 strspn_sse42 endif ifeq (yes,$(config-cflags-avx2)) diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c deleted file mode 100644 index 60b2ed7..0000000 --- a/sysdeps/x86_64/multiarch/strcspn-c.c +++ /dev/null @@ -1,173 +0,0 @@ -/* strcspn with SSE4.2 intrinsics - Copyright (C) 2009-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <nmmintrin.h> -#include <string.h> -#include "varshift.h" - -/* We use 0x2: - _SIDD_SBYTE_OPS - | _SIDD_CMP_EQUAL_ANY - | _SIDD_POSITIVE_POLARITY - | _SIDD_LEAST_SIGNIFICANT - on pcmpistri to compare xmm/mem128 - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - X X X X X X X X X X X X X X X X - - against xmm - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - A A A A A A A A A A A A A A A A - - to find out if the first 16byte data element has any byte A and - the offset of the first byte. There are 3 cases: - - 1. The first 16byte data element has the byte A at the offset X. - 2. The first 16byte data element has EOS and doesn't have the byte A. - 3. The first 16byte data element is valid and doesn't have the byte A. - - Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: - - 1 X 1 0/1 0 - 2 16 0 1 0 - 3 16 0 0 0 - - We exit from the loop for cases 1 and 2 with jbe which branches - when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset - X for case 1. */ - -#ifndef STRCSPN_SSE2 -# define STRCSPN_SSE2 __strcspn_sse2 -# define STRCSPN_SSE42 __strcspn_sse42 -#endif - -#ifdef USE_AS_STRPBRK -# define RETURN(val1, val2) return val1 -#else -# define RETURN(val1, val2) return val2 -#endif - -extern -#ifdef USE_AS_STRPBRK -char * -#else -size_t -#endif -STRCSPN_SSE2 (const char *, const char *); - - -#ifdef USE_AS_STRPBRK -char * -#else -size_t -#endif -__attribute__ ((section (".text.sse4.2"))) -STRCSPN_SSE42 (const char *s, const char *a) -{ - if (*a == 0) - RETURN (NULL, strlen (s)); - - const char *aligned; - __m128i mask; - int offset = (int) ((size_t) a & 15); - if (offset != 0) - { - /* Load masks. */ - aligned = (const char *) ((size_t) a & -16L); - __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - - mask = __m128i_shift_right (mask0, offset); - - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16 - offset) - { - /* There is no NULL terminator. 
*/ - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); - int index = _mm_cmpistri (mask1, mask1, 0x3a); - length += index; - - /* Don't use SSE4.2 if the length of A > 16. */ - if (length > 16) - return STRCSPN_SSE2 (s, a); - - if (index != 0) - { - /* Combine mask0 and mask1. We could play games with - palignr, but frankly this data should be in L1 now - so do the merge via an unaligned load. */ - mask = _mm_loadu_si128 ((__m128i *) a); - } - } - } - else - { - /* A is aligned. */ - mask = _mm_load_si128 ((__m128i *) a); - - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16) - { - /* There is no NULL terminator. Don't use SSE4.2 if the length - of A > 16. */ - if (a[16] != 0) - return STRCSPN_SSE2 (s, a); - } - } - - offset = (int) ((size_t) s & 15); - if (offset != 0) - { - /* Check partial string. */ - aligned = (const char *) ((size_t) s & -16L); - __m128i value = _mm_load_si128 ((__m128i *) aligned); - - value = __m128i_shift_right (value, offset); - - int length = _mm_cmpistri (mask, value, 0x2); - /* No need to check ZFlag since ZFlag is always 1. */ - int cflag = _mm_cmpistrc (mask, value, 0x2); - if (cflag) - RETURN ((char *) (s + length), length); - /* Find where the NULL terminator is. */ - int index = _mm_cmpistri (value, value, 0x3a); - if (index < 16 - offset) - RETURN (NULL, index); - aligned += 16; - } - else - aligned = s; - - while (1) - { - __m128i value = _mm_load_si128 ((__m128i *) aligned); - int index = _mm_cmpistri (mask, value, 0x2); - int cflag = _mm_cmpistrc (mask, value, 0x2); - int zflag = _mm_cmpistrz (mask, value, 0x2); - if (cflag) - RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); - if (zflag) - RETURN (NULL, - /* Find where the NULL terminator is. 
*/ - (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); - aligned += 16; - } -} diff --git a/sysdeps/x86_64/multiarch/strcspn_sse42.S b/sysdeps/x86_64/multiarch/strcspn_sse42.S new file mode 100644 index 0000000..3e4e659 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcspn_sse42.S @@ -0,0 +1,3 @@ +#define AS_STRCSPN +#define __strpbrk_sse42 __strcspn_sse42 +#include "strpbrk_sse42.S" diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c deleted file mode 100644 index bbf5c49..0000000 --- a/sysdeps/x86_64/multiarch/strpbrk-c.c +++ /dev/null @@ -1,8 +0,0 @@ -/* Don't define multiple versions for strpbrk in static library since we - need strpbrk before the initialization happened. */ -#ifdef SHARED -# define USE_AS_STRPBRK -# define STRCSPN_SSE2 __strpbrk_sse2 -# define STRCSPN_SSE42 __strpbrk_sse42 -# include "strcspn-c.c" -#endif diff --git a/sysdeps/x86_64/multiarch/strpbrk_sse42.S b/sysdeps/x86_64/multiarch/strpbrk_sse42.S new file mode 100644 index 0000000..512ac19 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strpbrk_sse42.S @@ -0,0 +1,204 @@ +/* strcspn (str, ss) -- Return the length of the initial segment of STR + which contains no characters from SS. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#ifdef AS_STRSPN +# define AS_STRCSPN +# define MATCH_ALL $18 +#else +# define MATCH_ALL $2 +#endif + +ENTRY(__strpbrk_sse42) + movq %rdi, %rax + andl $4095, %eax + cmp $4032, %eax + ja L(cross_page) + movq %rsi, %rax + andl $4095, %eax + cmp $4080, %eax + ja L(cross_page) + movdqu (%rsi), %xmm4 + movdqu (%rdi), %xmm1 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm6 + movdqu 48(%rdi), %xmm7 + +L(back_from_crosspage): + pxor %xmm3, %xmm3 + pxor %xmm2, %xmm2 + + pcmpeqb %xmm4, %xmm2 + pmovmskb %xmm2, %eax + testl %eax, %eax + je L(call) + pcmpistri MATCH_ALL, %xmm1, %xmm4 + jc L(rx0) + je L(ret0) + pcmpistri MATCH_ALL, %xmm5, %xmm4 + jc L(rx16) + je L(ret16) + pcmpistri MATCH_ALL, %xmm6, %xmm4 + jc L(rx32) + je L(ret32) + pcmpistri MATCH_ALL, %xmm7, %xmm4 + jc L(rx48) + je L(ret48) + + movq %rdi, %rax + andq $-16, %rax + addq $16, %rax + .p2align 4,,10 + .p2align 3 +L(loop): + pcmpistri MATCH_ALL, (%rax), %xmm4 + lea 16(%rax), %rax + jc L(rx_loop) + jne L(loop) +#ifdef AS_STRCSPN + movdqa -16(%rax), %xmm1 + pcmpistri $58, %xmm1, %xmm1 + lea -16(%rcx, %rax), %rax + sub %rdi, %rax +#else + xor %eax, %eax +#endif + ret +L(rx_loop): + lea -16(%rcx, %rax), %rax +#ifdef AS_STRCSPN + sub %rdi, %rax +#endif + ret + .p2align 4,,10 + .p2align 3 +#ifndef AS_STRCSPN +L(ret0): +L(ret16): +L(ret32): +L(ret48): + xorl %eax, %eax + ret +#endif +L(call): +#ifdef AS_STRCSPN +# ifdef AS_STRSPN + jmp __strspn_sse2 +# else + jmp __strcspn_sse2 +# endif +#else + jmp __strpbrk_sse2 +#endif + .p2align 4,,10 + .p2align 3 +#ifdef AS_STRCSPN +L(ret0): + pcmpistri $58, %xmm1, %xmm1 +L(rx0): + lea 0(%rcx), %rax +#else +L(rx0): + leaq (%rdi,%rcx), %rax +#endif + ret +#ifdef AS_STRCSPN +L(ret16): + pcmpistri $58, %xmm5, %xmm5 +L(rx16): + lea 16(%rcx), %rax +#else +L(rx16): + leaq 16(%rdi,%rcx), %rax +#endif + ret +#ifdef AS_STRCSPN +L(ret32): + pcmpistri $58, %xmm6, %xmm6 +L(rx32): + lea 32(%rcx), %rax +#else +L(rx32): + leaq 32(%rdi,%rcx), %rax +#endif + ret +#ifdef 
AS_STRCSPN +L(ret48): + pcmpistri $58, %xmm7, %xmm7 +L(rx48): + lea 48(%rcx), %rax +#else +L(rx48): + leaq 48(%rdi,%rcx), %rax +#endif + ret + + .p2align 4,,10 + .p2align 3 +L(cross_page): + movzbl (%rdi), %ecx + xorl %eax, %eax + leaq -80(%rsp), %r8 + testb %cl, %cl + je L(sloop_end) + leaq -80(%rsp), %r8 + xorl %edx, %edx + xorl %eax, %eax + .p2align 4,,10 + .p2align 3 +L(sloop): + movb %cl, (%r8,%rdx) + movzbl 1(%rdi,%rdx), %ecx + addl $1, %eax + testb %cl, %cl + je L(sloop_end) + addq $1, %rdx + cmpl $64, %eax + jne L(sloop) +L(sloop_end): + movzbl (%rsi), %ecx + cltq + movb $0, -80(%rsp,%rax) + movdqu (%r8), %xmm1 + movdqu 16(%r8), %xmm5 + movdqu 32(%r8), %xmm6 + movdqu 48(%r8), %xmm7 + + xorl %eax, %eax + testb %cl, %cl + je L(aloop_end) + xorl %edx, %edx + .p2align 4,,10 + .p2align 3 +L(aloop): + movb %cl, (%r8,%rdx) + movzbl 1(%rsi,%rdx), %ecx + addl $1, %eax + testb %cl, %cl + je L(aloop_end) + addq $1, %rdx + cmpl $16, %eax + jne L(aloop) +L(aloop_end): + cltq + movb $0, -80(%rsp,%rax) + movdqu (%r8), %xmm4 + jmp L(back_from_crosspage) +END(__strpbrk_sse42) diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c deleted file mode 100644 index 6b0c80a..0000000 --- a/sysdeps/x86_64/multiarch/strspn-c.c +++ /dev/null @@ -1,145 +0,0 @@ -/* strspn with SSE4.2 intrinsics - Copyright (C) 2009-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <nmmintrin.h> -#include <string.h> -#include "varshift.h" - -/* We use 0x12: - _SIDD_SBYTE_OPS - | _SIDD_CMP_EQUAL_ANY - | _SIDD_NEGATIVE_POLARITY - | _SIDD_LEAST_SIGNIFICANT - on pcmpistri to compare xmm/mem128 - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - X X X X X X X X X X X X X X X X - - against xmm - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - A A A A A A A A A A A A A A A A - - to find out if the first 16byte data element has any non-A byte and - the offset of the first byte. There are 2 cases: - - 1. The first 16byte data element has the non-A byte, including - EOS, at the offset X. - 2. The first 16byte data element is valid and doesn't have the non-A - byte. - - Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: - - case ECX CFlag ZFlag SFlag - 1 X 1 0/1 0 - 2 16 0 0 0 - - We exit from the loop for case 1. */ - -extern size_t __strspn_sse2 (const char *, const char *); - - -size_t -__attribute__ ((section (".text.sse4.2"))) -__strspn_sse42 (const char *s, const char *a) -{ - if (*a == 0) - return 0; - - const char *aligned; - __m128i mask; - int offset = (int) ((size_t) a & 15); - if (offset != 0) - { - /* Load masks. */ - aligned = (const char *) ((size_t) a & -16L); - __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - - mask = __m128i_shift_right (mask0, offset); - - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16 - offset) - { - /* There is no NULL terminator. */ - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); - int index = _mm_cmpistri (mask1, mask1, 0x3a); - length += index; - - /* Don't use SSE4.2 if the length of A > 16. */ - if (length > 16) - return __strspn_sse2 (s, a); - - if (index != 0) - { - /* Combine mask0 and mask1. 
We could play games with - palignr, but frankly this data should be in L1 now - so do the merge via an unaligned load. */ - mask = _mm_loadu_si128 ((__m128i *) a); - } - } - } - else - { - /* A is aligned. */ - mask = _mm_load_si128 ((__m128i *) a); - - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16) - { - /* There is no NULL terminator. Don't use SSE4.2 if the length - of A > 16. */ - if (a[16] != 0) - return __strspn_sse2 (s, a); - } - } - - offset = (int) ((size_t) s & 15); - if (offset != 0) - { - /* Check partial string. */ - aligned = (const char *) ((size_t) s & -16L); - __m128i value = _mm_load_si128 ((__m128i *) aligned); - - value = __m128i_shift_right (value, offset); - - int length = _mm_cmpistri (mask, value, 0x12); - /* No need to check CFlag since it is always 1. */ - if (length < 16 - offset) - return length; - /* Find where the NULL terminator is. */ - int index = _mm_cmpistri (value, value, 0x3a); - if (index < 16 - offset) - return length; - aligned += 16; - } - else - aligned = s; - - while (1) - { - __m128i value = _mm_load_si128 ((__m128i *) aligned); - int index = _mm_cmpistri (mask, value, 0x12); - int cflag = _mm_cmpistrc (mask, value, 0x12); - if (cflag) - return (size_t) (aligned + index - s); - aligned += 16; - } -} diff --git a/sysdeps/x86_64/multiarch/strspn_sse42.S b/sysdeps/x86_64/multiarch/strspn_sse42.S new file mode 100644 index 0000000..d460167 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn_sse42.S @@ -0,0 +1,3 @@ +#define AS_STRSPN +#define __strpbrk_sse42 __strspn_sse42 +#include "strpbrk_sse42.S" diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c deleted file mode 100644 index 0007ef7..0000000 --- a/sysdeps/x86_64/multiarch/varshift.c +++ /dev/null @@ -1,25 +0,0 @@ -/* Helper for variable shifts of SSE registers. - Copyright (C) 2010-2015 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include "varshift.h" - -const int8_t ___m128i_shift_right[31] attribute_hidden = - { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h deleted file mode 100644 index 30ace3d..0000000 --- a/sysdeps/x86_64/multiarch/varshift.h +++ /dev/null @@ -1,30 +0,0 @@ -/* Helper for variable shifts of SSE registers. - Copyright (C) 2010-2015 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <stdint.h> -#include <tmmintrin.h> - -extern const int8_t ___m128i_shift_right[31] attribute_hidden; - -static __inline__ __m128i -__m128i_shift_right (__m128i value, unsigned long int offset) -{ - return _mm_shuffle_epi8 (value, - _mm_loadu_si128 ((__m128i *) (___m128i_shift_right - + offset))); -}