Message ID | 20220610005840.557184-1-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v2] x86: Rename generic functions with unique postfix for clarity | expand |
On Thu, Jun 9, 2022 at 5:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > No functions are changed. It just renames generic implementations from > '{func}_sse2' to '{func}_generic'. This is just because the postfix > "_sse2" was overloaded and was used for files that had hand-optimized > sse2 assembly implementations and files that just redirected back > to the generic implementation. This change isn't small, and its benefit is very small. Can it be part of a larger change to support building glibc with -march=x86-64-vN? > Full xcheck passed on x86_64. > --- > sysdeps/x86_64/multiarch/Makefile | 15 +- > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > sysdeps/x86_64/multiarch/ifunc-sse4_2.h | 4 +- > sysdeps/x86_64/multiarch/ifunc-strcpy.h | 8 +- > sysdeps/x86_64/multiarch/ifunc-wcslen.h | 8 +- > sysdeps/x86_64/multiarch/stpncpy-c.c | 2 +- > sysdeps/x86_64/multiarch/stpncpy.c | 1 + > sysdeps/x86_64/multiarch/strcspn-c-sse4.c | 163 ++++++++++++++++++ > sysdeps/x86_64/multiarch/strcspn-c.c | 151 +--------------- > sysdeps/x86_64/multiarch/strcspn-sse2.c | 28 --- > sysdeps/x86_64/multiarch/strncat-c.c | 2 +- > sysdeps/x86_64/multiarch/strncat.c | 1 + > sysdeps/x86_64/multiarch/strncpy-c.c | 2 +- > sysdeps/x86_64/multiarch/strncpy.c | 1 + > .../{strspn-sse2.c => strpbrk-c-sse4.c} | 18 +- > sysdeps/x86_64/multiarch/strpbrk-c.c | 18 +- > sysdeps/x86_64/multiarch/strpbrk-sse2.c | 28 --- > sysdeps/x86_64/multiarch/strspn-c-sse4.c | 136 +++++++++++++++ > sysdeps/x86_64/multiarch/strspn-c.c | 126 +------------- > sysdeps/x86_64/multiarch/wcscpy-c.c | 2 +- > sysdeps/x86_64/multiarch/wcscpy.c | 4 +- > sysdeps/x86_64/multiarch/wcsnlen-c.c | 4 +- > sysdeps/x86_64/multiarch/wcsnlen.c | 1 + > 23 files changed, 376 insertions(+), 363 deletions(-) > create mode 100644 sysdeps/x86_64/multiarch/strcspn-c-sse4.c > delete mode 100644 sysdeps/x86_64/multiarch/strcspn-sse2.c > rename sysdeps/x86_64/multiarch/{strspn-sse2.c => strpbrk-c-sse4.c} (74%) > delete mode 100644 
sysdeps/x86_64/multiarch/strpbrk-sse2.c > create mode 100644 sysdeps/x86_64/multiarch/strspn-c-sse4.c > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 3d153cac35..86c6ecdfc1 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -77,7 +77,7 @@ sysdep_routines += \ > strcpy-sse2 \ > strcpy-sse2-unaligned \ > strcspn-c \ > - strcspn-sse2 \ > + strcspn-c-sse4 \ > strlen-avx2 \ > strlen-avx2-rtm \ > strlen-evex \ > @@ -109,21 +109,22 @@ sysdep_routines += \ > strnlen-evex512 \ > strnlen-sse2 \ > strpbrk-c \ > - strpbrk-sse2 \ > + strpbrk-c-sse4 \ > strrchr-avx2 \ > strrchr-avx2-rtm \ > strrchr-evex \ > strrchr-sse2 \ > strspn-c \ > - strspn-sse2 \ > + strspn-c-sse4 \ > strstr-avx512 \ > strstr-sse2-unaligned \ > varshift \ > # sysdep_routines > -CFLAGS-varshift.c += -msse4 > -CFLAGS-strcspn-c.c += -msse4 > -CFLAGS-strpbrk-c.c += -msse4 > -CFLAGS-strspn-c.c += -msse4 > + > +CFLAGS-strcspn-c-sse4.c += -msse4 > +CFLAGS-strpbrk-c-sse4.c += -msse4 > +CFLAGS-strspn-c-sse4.c += -msse4 > + > CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3 > endif > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 58f3ec8306..4cbd200d39 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -372,7 +372,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __stpncpy_evex) > IFUNC_IMPL_ADD (array, i, stpncpy, 1, > __stpncpy_sse2_unaligned) > - IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) > + IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic)) > > /* Support sysdeps/x86_64/multiarch/stpcpy.c. 
*/ > IFUNC_IMPL (i, name, stpcpy, > @@ -531,7 +531,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL (i, name, strcspn, > IFUNC_IMPL_ADD (array, i, strcspn, CPU_FEATURE_USABLE (SSE4_2), > __strcspn_sse42) > - IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2)) > + IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_generic)) > > /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ > IFUNC_IMPL (i, name, strncasecmp, > @@ -585,7 +585,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __strncat_evex) > IFUNC_IMPL_ADD (array, i, strncat, 1, > __strncat_sse2_unaligned) > - IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2)) > + IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic)) > > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ > IFUNC_IMPL (i, name, strncpy, > @@ -601,20 +601,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __strncpy_evex) > IFUNC_IMPL_ADD (array, i, strncpy, 1, > __strncpy_sse2_unaligned) > - IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) > + IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic)) > > /* Support sysdeps/x86_64/multiarch/strpbrk.c. */ > IFUNC_IMPL (i, name, strpbrk, > IFUNC_IMPL_ADD (array, i, strpbrk, CPU_FEATURE_USABLE (SSE4_2), > __strpbrk_sse42) > - IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2)) > + IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_generic)) > > > /* Support sysdeps/x86_64/multiarch/strspn.c. */ > IFUNC_IMPL (i, name, strspn, > IFUNC_IMPL_ADD (array, i, strspn, CPU_FEATURE_USABLE (SSE4_2), > __strspn_sse42) > - IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2)) > + IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_generic)) > > /* Support sysdeps/x86_64/multiarch/strstr.c. 
*/ > IFUNC_IMPL (i, name, strstr, > @@ -697,7 +697,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL (i, name, wcscpy, > IFUNC_IMPL_ADD (array, i, wcscpy, CPU_FEATURE_USABLE (SSSE3), > __wcscpy_ssse3) > - IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2)) > + IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_generic)) > > /* Support sysdeps/x86_64/multiarch/wcslen.c. */ > IFUNC_IMPL (i, name, wcslen, > @@ -749,7 +749,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, wcsnlen, > CPU_FEATURE_USABLE (SSE4_1), > __wcsnlen_sse4_1) > - IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2)) > + IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_generic)) > > /* Support sysdeps/x86_64/multiarch/wmemchr.c. */ > IFUNC_IMPL (i, name, wmemchr, > diff --git a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h > index b555ff2fac..ee36525bcf 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h > +++ b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h > @@ -19,7 +19,7 @@ > > #include <init-arch.h> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; > > static inline void * > @@ -30,5 +30,5 @@ IFUNC_SELECTOR (void) > if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)) > return OPTIMIZE (sse42); > > - return OPTIMIZE (sse2); > + return OPTIMIZE (generic); > } > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h > index a15afa44e9..80529458d1 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h > @@ -20,7 +20,11 @@ > > #include <init-arch.h> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > +#ifndef GENERIC > +# define GENERIC sse2 > +#endif > + > +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) 
attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) > attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > @@ -49,5 +53,5 @@ IFUNC_SELECTOR (void) > if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) > return OPTIMIZE (sse2_unaligned); > > - return OPTIMIZE (sse2); > + return OPTIMIZE (GENERIC); > } > diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h > index 2b29e7608a..88c1c502af 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-wcslen.h > +++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h > @@ -19,7 +19,11 @@ > > #include <init-arch.h> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > +#ifndef GENERIC > +# define GENERIC sse2 > +#endif > + > +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > @@ -48,5 +52,5 @@ IFUNC_SELECTOR (void) > if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) > return OPTIMIZE (sse4_1); > > - return OPTIMIZE (sse2); > + return OPTIMIZE (GENERIC); > } > diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c > index b016e487e1..eb62fcf388 100644 > --- a/sysdeps/x86_64/multiarch/stpncpy-c.c > +++ b/sysdeps/x86_64/multiarch/stpncpy-c.c > @@ -1,4 +1,4 @@ > -#define STPNCPY __stpncpy_sse2 > +#define STPNCPY __stpncpy_generic > #undef weak_alias > #define weak_alias(ignored1, ignored2) > #undef libc_hidden_def > diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c > index 82fa53957d..879bc83f0b 100644 > --- a/sysdeps/x86_64/multiarch/stpncpy.c > +++ b/sysdeps/x86_64/multiarch/stpncpy.c > @@ -25,6 +25,7 @@ > # undef stpncpy > # undef __stpncpy > > +# define GENERIC generic > # define SYMBOL_NAME stpncpy > # include 
"ifunc-strcpy.h" > > diff --git a/sysdeps/x86_64/multiarch/strcspn-c-sse4.c b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c > new file mode 100644 > index 0000000000..59f64f9fe8 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c > @@ -0,0 +1,163 @@ > +/* strcspn with SSE4.2 intrinsics > + Copyright (C) 2009-2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <nmmintrin.h> > +#include <string.h> > +#include "varshift.h" > + > +/* We use 0x2: > + _SIDD_SBYTE_OPS > + | _SIDD_CMP_EQUAL_ANY > + | _SIDD_POSITIVE_POLARITY > + | _SIDD_LEAST_SIGNIFICANT > + on pcmpistri to compare xmm/mem128 > + > + 0 1 2 3 4 5 6 7 8 9 A B C D E F > + X X X X X X X X X X X X X X X X > + > + against xmm > + > + 0 1 2 3 4 5 6 7 8 9 A B C D E F > + A A A A A A A A A A A A A A A A > + > + to find out if the first 16byte data element has any byte A and > + the offset of the first byte. There are 3 cases: > + > + 1. The first 16byte data element has the byte A at the offset X. > + 2. The first 16byte data element has EOS and doesn't have the byte A. > + 3. The first 16byte data element is valid and doesn't have the byte A. 
> + > + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: > + > + 1 X 1 0/1 0 > + 2 16 0 1 0 > + 3 16 0 0 0 > + > + We exit from the loop for cases 1 and 2 with jbe which branches > + when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset > + X for case 1. */ > + > +#ifndef STRCSPN_GENERIC > +# define STRCSPN_GENERIC __strcspn_generic > +# define STRCSPN_SSE42 __strcspn_sse42 > +#endif > + > +#ifdef USE_AS_STRPBRK > +# define RETURN(val1, val2) return val1 > +#else > +# define RETURN(val1, val2) return val2 > +#endif > + > +extern > +#ifdef USE_AS_STRPBRK > +char * > +#else > +size_t > +#endif > +STRCSPN_GENERIC (const char *, const char *) attribute_hidden; > + > + > +#ifdef USE_AS_STRPBRK > +char * > +#else > +size_t > +#endif > +__attribute__ ((section (".text.sse4.2"))) > +STRCSPN_SSE42 (const char *s, const char *a) > +{ > + if (*a == 0) > + RETURN (NULL, strlen (s)); > + > + const char *aligned; > + __m128i mask, maskz, zero; > + unsigned int maskz_bits; > + unsigned int offset = (unsigned int) ((size_t) a & 15); > + zero = _mm_set1_epi8 (0); > + if (offset != 0) > + { > + /* Load masks. */ > + aligned = (const char *) ((size_t) a & -16L); > + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); > + maskz = _mm_cmpeq_epi8 (mask0, zero); > + > + /* Find where the NULL terminator is. */ > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > + if (maskz_bits != 0) > + { > + mask = __m128i_shift_right (mask0, offset); > + offset = (unsigned int) ((size_t) s & 15); > + if (offset) > + goto start_unaligned; > + > + aligned = s; > + goto start_loop; > + } > + } > + > + /* A is aligned. */ > + mask = _mm_loadu_si128 ((__m128i *) a); > + /* Find where the NULL terminator is. */ > + maskz = _mm_cmpeq_epi8 (mask, zero); > + maskz_bits = _mm_movemask_epi8 (maskz); > + if (maskz_bits == 0) > + { > + /* There is no NULL terminator. Don't use SSE4.2 if the length > + of A > 16. 
*/ > + if (a[16] != 0) > + return STRCSPN_GENERIC (s, a); > + } > + > + aligned = s; > + offset = (unsigned int) ((size_t) s & 15); > + if (offset != 0) > + { > + start_unaligned: > + /* Check partial string. */ > + aligned = (const char *) ((size_t) s & -16L); > + __m128i value = _mm_load_si128 ((__m128i *) aligned); > + > + value = __m128i_shift_right (value, offset); > + > + unsigned int length = _mm_cmpistri (mask, value, 0x2); > + /* No need to check ZFlag since ZFlag is always 1. */ > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); > + if (cflag) > + RETURN ((char *) (s + length), length); > + /* Find where the NULL terminator is. */ > + unsigned int index = _mm_cmpistri (value, value, 0x3a); > + if (index < 16 - offset) > + RETURN (NULL, index); > + aligned += 16; > + } > + > +start_loop: > + while (1) > + { > + __m128i value = _mm_load_si128 ((__m128i *) aligned); > + unsigned int index = _mm_cmpistri (mask, value, 0x2); > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); > + unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); > + if (cflag) > + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); > + if (zflag) > + RETURN (NULL, > + /* Find where the NULL terminator is. */ > + (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); > + aligned += 16; > + } > +} > diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c > index c312fab8b1..423de2e2b2 100644 > --- a/sysdeps/x86_64/multiarch/strcspn-c.c > +++ b/sysdeps/x86_64/multiarch/strcspn-c.c > @@ -1,5 +1,5 @@ > -/* strcspn with SSE4.2 intrinsics > - Copyright (C) 2009-2022 Free Software Foundation, Inc. > +/* strcspn. > + Copyright (C) 2017-2022 Free Software Foundation, Inc. > This file is part of the GNU C Library. > > The GNU C Library is free software; you can redistribute it and/or > @@ -16,148 +16,13 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. 
*/ > > -#include <nmmintrin.h> > -#include <string.h> > -#include "varshift.h" > +#if IS_IN (libc) > > -/* We use 0x2: > - _SIDD_SBYTE_OPS > - | _SIDD_CMP_EQUAL_ANY > - | _SIDD_POSITIVE_POLARITY > - | _SIDD_LEAST_SIGNIFICANT > - on pcmpistri to compare xmm/mem128 > +# include <sysdep.h> > +# define STRCSPN __strcspn_generic > > - 0 1 2 3 4 5 6 7 8 9 A B C D E F > - X X X X X X X X X X X X X X X X > - > - against xmm > - > - 0 1 2 3 4 5 6 7 8 9 A B C D E F > - A A A A A A A A A A A A A A A A > - > - to find out if the first 16byte data element has any byte A and > - the offset of the first byte. There are 3 cases: > - > - 1. The first 16byte data element has the byte A at the offset X. > - 2. The first 16byte data element has EOS and doesn't have the byte A. > - 3. The first 16byte data element is valid and doesn't have the byte A. > - > - Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: > - > - 1 X 1 0/1 0 > - 2 16 0 1 0 > - 3 16 0 0 0 > - > - We exit from the loop for cases 1 and 2 with jbe which branches > - when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset > - X for case 1. 
*/ > - > -#ifndef STRCSPN_SSE2 > -# define STRCSPN_SSE2 __strcspn_sse2 > -# define STRCSPN_SSE42 __strcspn_sse42 > -#endif > - > -#ifdef USE_AS_STRPBRK > -# define RETURN(val1, val2) return val1 > -#else > -# define RETURN(val1, val2) return val2 > -#endif > - > -extern > -#ifdef USE_AS_STRPBRK > -char * > -#else > -size_t > -#endif > -STRCSPN_SSE2 (const char *, const char *) attribute_hidden; > - > - > -#ifdef USE_AS_STRPBRK > -char * > -#else > -size_t > +# undef libc_hidden_builtin_def > +# define libc_hidden_builtin_def(STRCSPN) > #endif > -__attribute__ ((section (".text.sse4.2"))) > -STRCSPN_SSE42 (const char *s, const char *a) > -{ > - if (*a == 0) > - RETURN (NULL, strlen (s)); > - > - const char *aligned; > - __m128i mask, maskz, zero; > - unsigned int maskz_bits; > - unsigned int offset = (unsigned int) ((size_t) a & 15); > - zero = _mm_set1_epi8 (0); > - if (offset != 0) > - { > - /* Load masks. */ > - aligned = (const char *) ((size_t) a & -16L); > - __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); > - maskz = _mm_cmpeq_epi8 (mask0, zero); > - > - /* Find where the NULL terminator is. */ > - maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > - if (maskz_bits != 0) > - { > - mask = __m128i_shift_right (mask0, offset); > - offset = (unsigned int) ((size_t) s & 15); > - if (offset) > - goto start_unaligned; > - > - aligned = s; > - goto start_loop; > - } > - } > - > - /* A is aligned. */ > - mask = _mm_loadu_si128 ((__m128i *) a); > - /* Find where the NULL terminator is. */ > - maskz = _mm_cmpeq_epi8 (mask, zero); > - maskz_bits = _mm_movemask_epi8 (maskz); > - if (maskz_bits == 0) > - { > - /* There is no NULL terminator. Don't use SSE4.2 if the length > - of A > 16. */ > - if (a[16] != 0) > - return STRCSPN_SSE2 (s, a); > - } > - > - aligned = s; > - offset = (unsigned int) ((size_t) s & 15); > - if (offset != 0) > - { > - start_unaligned: > - /* Check partial string. 
*/ > - aligned = (const char *) ((size_t) s & -16L); > - __m128i value = _mm_load_si128 ((__m128i *) aligned); > - > - value = __m128i_shift_right (value, offset); > - > - unsigned int length = _mm_cmpistri (mask, value, 0x2); > - /* No need to check ZFlag since ZFlag is always 1. */ > - unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); > - if (cflag) > - RETURN ((char *) (s + length), length); > - /* Find where the NULL terminator is. */ > - unsigned int index = _mm_cmpistri (value, value, 0x3a); > - if (index < 16 - offset) > - RETURN (NULL, index); > - aligned += 16; > - } > > -start_loop: > - while (1) > - { > - __m128i value = _mm_load_si128 ((__m128i *) aligned); > - unsigned int index = _mm_cmpistri (mask, value, 0x2); > - unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); > - unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); > - if (cflag) > - RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); > - if (zflag) > - RETURN (NULL, > - /* Find where the NULL terminator is. */ > - (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); > - aligned += 16; > - } > -} > +#include <string/strcspn.c> > diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.c b/sysdeps/x86_64/multiarch/strcspn-sse2.c > deleted file mode 100644 > index 3a04bb39fc..0000000000 > --- a/sysdeps/x86_64/multiarch/strcspn-sse2.c > +++ /dev/null > @@ -1,28 +0,0 @@ > -/* strcspn. > - Copyright (C) 2017-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. */ > - > -#if IS_IN (libc) > - > -# include <sysdep.h> > -# define STRCSPN __strcspn_sse2 > - > -# undef libc_hidden_builtin_def > -# define libc_hidden_builtin_def(STRCSPN) > -#endif > - > -#include <string/strcspn.c> > diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c > index 93a7fab7ea..b729c033d9 100644 > --- a/sysdeps/x86_64/multiarch/strncat-c.c > +++ b/sysdeps/x86_64/multiarch/strncat-c.c > @@ -1,2 +1,2 @@ > -#define STRNCAT __strncat_sse2 > +#define STRNCAT __strncat_generic > #include <string/strncat.c> > diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c > index b649343a97..50fba8a41f 100644 > --- a/sysdeps/x86_64/multiarch/strncat.c > +++ b/sysdeps/x86_64/multiarch/strncat.c > @@ -24,6 +24,7 @@ > # undef strncat > > # define SYMBOL_NAME strncat > +# define GENERIC generic > # include "ifunc-strcpy.h" > > libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ()); > diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c > index 57c45ac7ab..183b0b8e0f 100644 > --- a/sysdeps/x86_64/multiarch/strncpy-c.c > +++ b/sysdeps/x86_64/multiarch/strncpy-c.c > @@ -1,4 +1,4 @@ > -#define STRNCPY __strncpy_sse2 > +#define STRNCPY __strncpy_generic > #undef libc_hidden_builtin_def > #define libc_hidden_builtin_def(strncpy) > > diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c > index 2a780a7e16..7fc7d72ec5 100644 > --- a/sysdeps/x86_64/multiarch/strncpy.c > +++ b/sysdeps/x86_64/multiarch/strncpy.c > @@ -24,6 +24,7 @@ > # undef strncpy > > # define SYMBOL_NAME strncpy > +# define GENERIC generic > # include "ifunc-strcpy.h" > > libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR 
()); > diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c > similarity index 74% > rename from sysdeps/x86_64/multiarch/strspn-sse2.c > rename to sysdeps/x86_64/multiarch/strpbrk-c-sse4.c > index 61cc6cb0a5..8700276773 100644 > --- a/sysdeps/x86_64/multiarch/strspn-sse2.c > +++ b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c > @@ -1,5 +1,5 @@ > -/* strspn. > - Copyright (C) 2017-2022 Free Software Foundation, Inc. > +/* strpbrk with SSE4.2 intrinsics > + Copyright (C) 2022 Free Software Foundation, Inc. > This file is part of the GNU C Library. > > The GNU C Library is free software; you can redistribute it and/or > @@ -16,13 +16,7 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#if IS_IN (libc) > - > -# include <sysdep.h> > -# define STRSPN __strspn_sse2 > - > -# undef libc_hidden_builtin_def > -# define libc_hidden_builtin_def(STRSPN) > -#endif > - > -#include <string/strspn.c> > +#define USE_AS_STRPBRK > +#define STRCSPN_GENERIC __strpbrk_generic > +#define STRCSPN_SSE42 __strpbrk_sse42 > +#include "strcspn-c-sse4.c" > diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c > index abf4ff7f1a..d31acfe495 100644 > --- a/sysdeps/x86_64/multiarch/strpbrk-c.c > +++ b/sysdeps/x86_64/multiarch/strpbrk-c.c > @@ -1,5 +1,5 @@ > -/* strpbrk with SSE4.2 intrinsics > - Copyright (C) 2022 Free Software Foundation, Inc. > +/* strpbrk. > + Copyright (C) 2017-2022 Free Software Foundation, Inc. > This file is part of the GNU C Library. > > The GNU C Library is free software; you can redistribute it and/or > @@ -16,7 +16,13 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. 
*/ > > -#define USE_AS_STRPBRK > -#define STRCSPN_SSE2 __strpbrk_sse2 > -#define STRCSPN_SSE42 __strpbrk_sse42 > -#include "strcspn-c.c" > +#if IS_IN (libc) > + > +# include <sysdep.h> > +# define STRPBRK __strpbrk_generic > + > +# undef libc_hidden_builtin_def > +# define libc_hidden_builtin_def(STRPBRK) > +#endif > + > +#include <string/strpbrk.c> > diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-sse2.c > deleted file mode 100644 > index d03214c4fb..0000000000 > --- a/sysdeps/x86_64/multiarch/strpbrk-sse2.c > +++ /dev/null > @@ -1,28 +0,0 @@ > -/* strpbrk. > - Copyright (C) 2017-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. */ > - > -#if IS_IN (libc) > - > -# include <sysdep.h> > -# define STRPBRK __strpbrk_sse2 > - > -# undef libc_hidden_builtin_def > -# define libc_hidden_builtin_def(STRPBRK) > -#endif > - > -#include <string/strpbrk.c> > diff --git a/sysdeps/x86_64/multiarch/strspn-c-sse4.c b/sysdeps/x86_64/multiarch/strspn-c-sse4.c > new file mode 100644 > index 0000000000..d044916688 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strspn-c-sse4.c > @@ -0,0 +1,136 @@ > +/* strspn with SSE4.2 intrinsics > + Copyright (C) 2009-2022 Free Software Foundation, Inc. 
> + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <nmmintrin.h> > +#include <string.h> > +#include "varshift.h" > + > +/* We use 0x12: > + _SIDD_SBYTE_OPS > + | _SIDD_CMP_EQUAL_ANY > + | _SIDD_NEGATIVE_POLARITY > + | _SIDD_LEAST_SIGNIFICANT > + on pcmpistri to compare xmm/mem128 > + > + 0 1 2 3 4 5 6 7 8 9 A B C D E F > + X X X X X X X X X X X X X X X X > + > + against xmm > + > + 0 1 2 3 4 5 6 7 8 9 A B C D E F > + A A A A A A A A A A A A A A A A > + > + to find out if the first 16byte data element has any non-A byte and > + the offset of the first byte. There are 2 cases: > + > + 1. The first 16byte data element has the non-A byte, including > + EOS, at the offset X. > + 2. The first 16byte data element is valid and doesn't have the non-A > + byte. > + > + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: > + > + case ECX CFlag ZFlag SFlag > + 1 X 1 0/1 0 > + 2 16 0 0 0 > + > + We exit from the loop for case 1. 
*/ > + > +extern size_t __strspn_generic (const char *, const char *) attribute_hidden; > + > + > +size_t > +__attribute__ ((section (".text.sse4.2"))) > +__strspn_sse42 (const char *s, const char *a) > +{ > + if (*a == 0) > + return 0; > + > + const char *aligned; > + __m128i mask, maskz, zero; > + unsigned int maskz_bits; > + unsigned int offset = (int) ((size_t) a & 15); > + zero = _mm_set1_epi8 (0); > + if (offset != 0) > + { > + /* Load masks. */ > + aligned = (const char *) ((size_t) a & -16L); > + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); > + maskz = _mm_cmpeq_epi8 (mask0, zero); > + > + /* Find where the NULL terminator is. */ > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > + if (maskz_bits != 0) > + { > + mask = __m128i_shift_right (mask0, offset); > + offset = (unsigned int) ((size_t) s & 15); > + if (offset) > + goto start_unaligned; > + > + aligned = s; > + goto start_loop; > + } > + } > + > + /* A is aligned. */ > + mask = _mm_loadu_si128 ((__m128i *) a); > + > + /* Find where the NULL terminator is. */ > + maskz = _mm_cmpeq_epi8 (mask, zero); > + maskz_bits = _mm_movemask_epi8 (maskz); > + if (maskz_bits == 0) > + { > + /* There is no NULL terminator. Don't use SSE4.2 if the length > + of A > 16. */ > + if (a[16] != 0) > + return __strspn_generic (s, a); > + } > + aligned = s; > + offset = (unsigned int) ((size_t) s & 15); > + > + if (offset != 0) > + { > + start_unaligned: > + /* Check partial string. */ > + aligned = (const char *) ((size_t) s & -16L); > + __m128i value = _mm_load_si128 ((__m128i *) aligned); > + __m128i adj_value = __m128i_shift_right (value, offset); > + > + unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); > + /* No need to check CFlag since it is always 1. */ > + if (length < 16 - offset) > + return length; > + /* Find where the NULL terminator is. 
*/ > + maskz = _mm_cmpeq_epi8 (value, zero); > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > + if (maskz_bits != 0) > + return length; > + aligned += 16; > + } > + > +start_loop: > + while (1) > + { > + __m128i value = _mm_load_si128 ((__m128i *) aligned); > + unsigned int index = _mm_cmpistri (mask, value, 0x12); > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); > + if (cflag) > + return (size_t) (aligned + index - s); > + aligned += 16; > + } > +} > diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c > index 6124033ceb..6b50c36432 100644 > --- a/sysdeps/x86_64/multiarch/strspn-c.c > +++ b/sysdeps/x86_64/multiarch/strspn-c.c > @@ -1,5 +1,5 @@ > -/* strspn with SSE4.2 intrinsics > - Copyright (C) 2009-2022 Free Software Foundation, Inc. > +/* strspn. > + Copyright (C) 2017-2022 Free Software Foundation, Inc. > This file is part of the GNU C Library. > > The GNU C Library is free software; you can redistribute it and/or > @@ -16,121 +16,13 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#include <nmmintrin.h> > -#include <string.h> > -#include "varshift.h" > +#if IS_IN (libc) > > -/* We use 0x12: > - _SIDD_SBYTE_OPS > - | _SIDD_CMP_EQUAL_ANY > - | _SIDD_NEGATIVE_POLARITY > - | _SIDD_LEAST_SIGNIFICANT > - on pcmpistri to compare xmm/mem128 > +# include <sysdep.h> > +# define STRSPN __strspn_generic > > - 0 1 2 3 4 5 6 7 8 9 A B C D E F > - X X X X X X X X X X X X X X X X > +# undef libc_hidden_builtin_def > +# define libc_hidden_builtin_def(STRSPN) > +#endif > > - against xmm > - > - 0 1 2 3 4 5 6 7 8 9 A B C D E F > - A A A A A A A A A A A A A A A A > - > - to find out if the first 16byte data element has any non-A byte and > - the offset of the first byte. There are 2 cases: > - > - 1. The first 16byte data element has the non-A byte, including > - EOS, at the offset X. > - 2. The first 16byte data element is valid and doesn't have the non-A > - byte. 
> - > - Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: > - > - case ECX CFlag ZFlag SFlag > - 1 X 1 0/1 0 > - 2 16 0 0 0 > - > - We exit from the loop for case 1. */ > - > -extern size_t __strspn_sse2 (const char *, const char *) attribute_hidden; > - > - > -size_t > -__attribute__ ((section (".text.sse4.2"))) > -__strspn_sse42 (const char *s, const char *a) > -{ > - if (*a == 0) > - return 0; > - > - const char *aligned; > - __m128i mask, maskz, zero; > - unsigned int maskz_bits; > - unsigned int offset = (int) ((size_t) a & 15); > - zero = _mm_set1_epi8 (0); > - if (offset != 0) > - { > - /* Load masks. */ > - aligned = (const char *) ((size_t) a & -16L); > - __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); > - maskz = _mm_cmpeq_epi8 (mask0, zero); > - > - /* Find where the NULL terminator is. */ > - maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > - if (maskz_bits != 0) > - { > - mask = __m128i_shift_right (mask0, offset); > - offset = (unsigned int) ((size_t) s & 15); > - if (offset) > - goto start_unaligned; > - > - aligned = s; > - goto start_loop; > - } > - } > - > - /* A is aligned. */ > - mask = _mm_loadu_si128 ((__m128i *) a); > - > - /* Find where the NULL terminator is. */ > - maskz = _mm_cmpeq_epi8 (mask, zero); > - maskz_bits = _mm_movemask_epi8 (maskz); > - if (maskz_bits == 0) > - { > - /* There is no NULL terminator. Don't use SSE4.2 if the length > - of A > 16. */ > - if (a[16] != 0) > - return __strspn_sse2 (s, a); > - } > - aligned = s; > - offset = (unsigned int) ((size_t) s & 15); > - > - if (offset != 0) > - { > - start_unaligned: > - /* Check partial string. */ > - aligned = (const char *) ((size_t) s & -16L); > - __m128i value = _mm_load_si128 ((__m128i *) aligned); > - __m128i adj_value = __m128i_shift_right (value, offset); > - > - unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); > - /* No need to check CFlag since it is always 1. 
*/ > - if (length < 16 - offset) > - return length; > - /* Find where the NULL terminator is. */ > - maskz = _mm_cmpeq_epi8 (value, zero); > - maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > - if (maskz_bits != 0) > - return length; > - aligned += 16; > - } > - > -start_loop: > - while (1) > - { > - __m128i value = _mm_load_si128 ((__m128i *) aligned); > - unsigned int index = _mm_cmpistri (mask, value, 0x12); > - unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); > - if (cflag) > - return (size_t) (aligned + index - s); > - aligned += 16; > - } > -} > +#include <string/strspn.c> > diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c > index 26d6984e9b..fa38dd898d 100644 > --- a/sysdeps/x86_64/multiarch/wcscpy-c.c > +++ b/sysdeps/x86_64/multiarch/wcscpy-c.c > @@ -1,5 +1,5 @@ > #if IS_IN (libc) > -# define WCSCPY __wcscpy_sse2 > +# define WCSCPY __wcscpy_generic > #endif > > #include <wcsmbs/wcscpy.c> > diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c > index 6a2d1421d9..53c3228dc2 100644 > --- a/sysdeps/x86_64/multiarch/wcscpy.c > +++ b/sysdeps/x86_64/multiarch/wcscpy.c > @@ -26,7 +26,7 @@ > # define SYMBOL_NAME wcscpy > # include <init-arch.h> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > > static inline void * > @@ -37,7 +37,7 @@ IFUNC_SELECTOR (void) > if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > return OPTIMIZE (ssse3); > > - return OPTIMIZE (sse2); > + return OPTIMIZE (generic); > } > > libc_ifunc_redirected (__redirect_wcscpy, __wcscpy, IFUNC_SELECTOR ()); > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-c.c b/sysdeps/x86_64/multiarch/wcsnlen-c.c > index e1ec7cfbb5..1c9c04241a 100644 > --- a/sysdeps/x86_64/multiarch/wcsnlen-c.c > +++ b/sysdeps/x86_64/multiarch/wcsnlen-c.c > @@ -1,9 +1,9 @@ > #if IS_IN (libc) > # 
include <wchar.h> > > -# define WCSNLEN __wcsnlen_sse2 > +# define WCSNLEN __wcsnlen_generic > > -extern __typeof (wcsnlen) __wcsnlen_sse2; > +extern __typeof (wcsnlen) __wcsnlen_generic; > #endif > > #include "wcsmbs/wcsnlen.c" > diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c > index baa26666a8..05b7a211de 100644 > --- a/sysdeps/x86_64/multiarch/wcsnlen.c > +++ b/sysdeps/x86_64/multiarch/wcsnlen.c > @@ -24,6 +24,7 @@ > # undef __wcsnlen > > # define SYMBOL_NAME wcsnlen > +# define GENERIC generic > # include "ifunc-wcslen.h" > > libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ()); > -- > 2.34.1 >
On Thu, Jun 9, 2022 at 6:20 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Thu, Jun 9, 2022 at 5:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > No functions are changed. It just renames generic implementations from > > '{func}_sse2' to '{func}_generic'. This is just because the postfix > > "_sse2" was overloaded and was used for files that had hand-optimized > > sse2 assembly implementations and files that just redirected back > > to the generic implementation. > > This change isn't small and its benefit is very small. Can it be part of > a big change to support building glibc with > > -march=x86-64-vN? OK. > > > Full xcheck passed on x86_64. > > --- > > sysdeps/x86_64/multiarch/Makefile | 15 +- > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > > sysdeps/x86_64/multiarch/ifunc-sse4_2.h | 4 +- > > sysdeps/x86_64/multiarch/ifunc-strcpy.h | 8 +- > > sysdeps/x86_64/multiarch/ifunc-wcslen.h | 8 +- > > sysdeps/x86_64/multiarch/stpncpy-c.c | 2 +- > > sysdeps/x86_64/multiarch/stpncpy.c | 1 + > > sysdeps/x86_64/multiarch/strcspn-c-sse4.c | 163 ++++++++++++++++++ > > sysdeps/x86_64/multiarch/strcspn-c.c | 151 +--------------- > > sysdeps/x86_64/multiarch/strcspn-sse2.c | 28 --- > > sysdeps/x86_64/multiarch/strncat-c.c | 2 +- > > sysdeps/x86_64/multiarch/strncat.c | 1 + > > sysdeps/x86_64/multiarch/strncpy-c.c | 2 +- > > sysdeps/x86_64/multiarch/strncpy.c | 1 + > > .../{strspn-sse2.c => strpbrk-c-sse4.c} | 18 +- > > sysdeps/x86_64/multiarch/strpbrk-c.c | 18 +- > > sysdeps/x86_64/multiarch/strpbrk-sse2.c | 28 --- > > sysdeps/x86_64/multiarch/strspn-c-sse4.c | 136 +++++++++++++++ > > sysdeps/x86_64/multiarch/strspn-c.c | 126 +------------- > > sysdeps/x86_64/multiarch/wcscpy-c.c | 2 +- > > sysdeps/x86_64/multiarch/wcscpy.c | 4 +- > > sysdeps/x86_64/multiarch/wcsnlen-c.c | 4 +- > > sysdeps/x86_64/multiarch/wcsnlen.c | 1 + > > 23 files changed, 376 insertions(+), 363 deletions(-) > > create mode 100644 sysdeps/x86_64/multiarch/strcspn-c-sse4.c > > delete 
mode 100644 sysdeps/x86_64/multiarch/strcspn-sse2.c > > rename sysdeps/x86_64/multiarch/{strspn-sse2.c => strpbrk-c-sse4.c} (74%) > > delete mode 100644 sysdeps/x86_64/multiarch/strpbrk-sse2.c > > create mode 100644 sysdeps/x86_64/multiarch/strspn-c-sse4.c > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > index 3d153cac35..86c6ecdfc1 100644 > > --- a/sysdeps/x86_64/multiarch/Makefile > > +++ b/sysdeps/x86_64/multiarch/Makefile > > @@ -77,7 +77,7 @@ sysdep_routines += \ > > strcpy-sse2 \ > > strcpy-sse2-unaligned \ > > strcspn-c \ > > - strcspn-sse2 \ > > + strcspn-c-sse4 \ > > strlen-avx2 \ > > strlen-avx2-rtm \ > > strlen-evex \ > > @@ -109,21 +109,22 @@ sysdep_routines += \ > > strnlen-evex512 \ > > strnlen-sse2 \ > > strpbrk-c \ > > - strpbrk-sse2 \ > > + strpbrk-c-sse4 \ > > strrchr-avx2 \ > > strrchr-avx2-rtm \ > > strrchr-evex \ > > strrchr-sse2 \ > > strspn-c \ > > - strspn-sse2 \ > > + strspn-c-sse4 \ > > strstr-avx512 \ > > strstr-sse2-unaligned \ > > varshift \ > > # sysdep_routines > > -CFLAGS-varshift.c += -msse4 > > -CFLAGS-strcspn-c.c += -msse4 > > -CFLAGS-strpbrk-c.c += -msse4 > > -CFLAGS-strspn-c.c += -msse4 > > + > > +CFLAGS-strcspn-c-sse4.c += -msse4 > > +CFLAGS-strpbrk-c-sse4.c += -msse4 > > +CFLAGS-strspn-c-sse4.c += -msse4 > > + > > CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3 > > endif > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index 58f3ec8306..4cbd200d39 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -372,7 +372,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > __stpncpy_evex) > > IFUNC_IMPL_ADD (array, i, stpncpy, 1, > > __stpncpy_sse2_unaligned) > > - IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) > > + IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic)) > > > > /* 
Support sysdeps/x86_64/multiarch/stpcpy.c. */ > > IFUNC_IMPL (i, name, stpcpy, > > @@ -531,7 +531,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > IFUNC_IMPL (i, name, strcspn, > > IFUNC_IMPL_ADD (array, i, strcspn, CPU_FEATURE_USABLE (SSE4_2), > > __strcspn_sse42) > > - IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2)) > > + IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_generic)) > > > > /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ > > IFUNC_IMPL (i, name, strncasecmp, > > @@ -585,7 +585,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > __strncat_evex) > > IFUNC_IMPL_ADD (array, i, strncat, 1, > > __strncat_sse2_unaligned) > > - IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2)) > > + IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic)) > > > > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ > > IFUNC_IMPL (i, name, strncpy, > > @@ -601,20 +601,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > __strncpy_evex) > > IFUNC_IMPL_ADD (array, i, strncpy, 1, > > __strncpy_sse2_unaligned) > > - IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) > > + IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic)) > > > > /* Support sysdeps/x86_64/multiarch/strpbrk.c. */ > > IFUNC_IMPL (i, name, strpbrk, > > IFUNC_IMPL_ADD (array, i, strpbrk, CPU_FEATURE_USABLE (SSE4_2), > > __strpbrk_sse42) > > - IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2)) > > + IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_generic)) > > > > > > /* Support sysdeps/x86_64/multiarch/strspn.c. */ > > IFUNC_IMPL (i, name, strspn, > > IFUNC_IMPL_ADD (array, i, strspn, CPU_FEATURE_USABLE (SSE4_2), > > __strspn_sse42) > > - IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2)) > > + IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_generic)) > > > > /* Support sysdeps/x86_64/multiarch/strstr.c. 
*/ > > IFUNC_IMPL (i, name, strstr, > > @@ -697,7 +697,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > IFUNC_IMPL (i, name, wcscpy, > > IFUNC_IMPL_ADD (array, i, wcscpy, CPU_FEATURE_USABLE (SSSE3), > > __wcscpy_ssse3) > > - IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2)) > > + IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_generic)) > > > > /* Support sysdeps/x86_64/multiarch/wcslen.c. */ > > IFUNC_IMPL (i, name, wcslen, > > @@ -749,7 +749,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > CPU_FEATURE_USABLE (SSE4_1), > > __wcsnlen_sse4_1) > > - IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2)) > > + IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_generic)) > > > > /* Support sysdeps/x86_64/multiarch/wmemchr.c. */ > > IFUNC_IMPL (i, name, wmemchr, > > diff --git a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h > > index b555ff2fac..ee36525bcf 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h > > +++ b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h > > @@ -19,7 +19,7 @@ > > > > #include <init-arch.h> > > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; > > > > static inline void * > > @@ -30,5 +30,5 @@ IFUNC_SELECTOR (void) > > if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)) > > return OPTIMIZE (sse42); > > > > - return OPTIMIZE (sse2); > > + return OPTIMIZE (generic); > > } > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h > > index a15afa44e9..80529458d1 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h > > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h > > @@ -20,7 +20,11 @@ > > > > #include <init-arch.h> > > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > > +#ifndef GENERIC 
> > +# define GENERIC sse2 > > +#endif > > + > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) > > attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > > @@ -49,5 +53,5 @@ IFUNC_SELECTOR (void) > > if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) > > return OPTIMIZE (sse2_unaligned); > > > > - return OPTIMIZE (sse2); > > + return OPTIMIZE (GENERIC); > > } > > diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h > > index 2b29e7608a..88c1c502af 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-wcslen.h > > +++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h > > @@ -19,7 +19,11 @@ > > > > #include <init-arch.h> > > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > > +#ifndef GENERIC > > +# define GENERIC sse2 > > +#endif > > + > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > > @@ -48,5 +52,5 @@ IFUNC_SELECTOR (void) > > if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) > > return OPTIMIZE (sse4_1); > > > > - return OPTIMIZE (sse2); > > + return OPTIMIZE (GENERIC); > > } > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c > > index b016e487e1..eb62fcf388 100644 > > --- a/sysdeps/x86_64/multiarch/stpncpy-c.c > > +++ b/sysdeps/x86_64/multiarch/stpncpy-c.c > > @@ -1,4 +1,4 @@ > > -#define STPNCPY __stpncpy_sse2 > > +#define STPNCPY __stpncpy_generic > > #undef weak_alias > > #define weak_alias(ignored1, ignored2) > > #undef libc_hidden_def > > diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c > > index 82fa53957d..879bc83f0b 100644 > > --- 
a/sysdeps/x86_64/multiarch/stpncpy.c > > +++ b/sysdeps/x86_64/multiarch/stpncpy.c > > @@ -25,6 +25,7 @@ > > # undef stpncpy > > # undef __stpncpy > > > > +# define GENERIC generic > > # define SYMBOL_NAME stpncpy > > # include "ifunc-strcpy.h" > > > > diff --git a/sysdeps/x86_64/multiarch/strcspn-c-sse4.c b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c > > new file mode 100644 > > index 0000000000..59f64f9fe8 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c > > @@ -0,0 +1,163 @@ > > +/* strcspn with SSE4.2 intrinsics > > + Copyright (C) 2009-2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include <nmmintrin.h> > > +#include <string.h> > > +#include "varshift.h" > > + > > +/* We use 0x2: > > + _SIDD_SBYTE_OPS > > + | _SIDD_CMP_EQUAL_ANY > > + | _SIDD_POSITIVE_POLARITY > > + | _SIDD_LEAST_SIGNIFICANT > > + on pcmpistri to compare xmm/mem128 > > + > > + 0 1 2 3 4 5 6 7 8 9 A B C D E F > > + X X X X X X X X X X X X X X X X > > + > > + against xmm > > + > > + 0 1 2 3 4 5 6 7 8 9 A B C D E F > > + A A A A A A A A A A A A A A A A > > + > > + to find out if the first 16byte data element has any byte A and > > + the offset of the first byte. There are 3 cases: > > + > > + 1. 
The first 16byte data element has the byte A at the offset X. > > + 2. The first 16byte data element has EOS and doesn't have the byte A. > > + 3. The first 16byte data element is valid and doesn't have the byte A. > > + > > + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: > > + > > + 1 X 1 0/1 0 > > + 2 16 0 1 0 > > + 3 16 0 0 0 > > + > > + We exit from the loop for cases 1 and 2 with jbe which branches > > + when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset > > + X for case 1. */ > > + > > +#ifndef STRCSPN_GENERIC > > +# define STRCSPN_GENERIC __strcspn_generic > > +# define STRCSPN_SSE42 __strcspn_sse42 > > +#endif > > + > > +#ifdef USE_AS_STRPBRK > > +# define RETURN(val1, val2) return val1 > > +#else > > +# define RETURN(val1, val2) return val2 > > +#endif > > + > > +extern > > +#ifdef USE_AS_STRPBRK > > +char * > > +#else > > +size_t > > +#endif > > +STRCSPN_GENERIC (const char *, const char *) attribute_hidden; > > + > > + > > +#ifdef USE_AS_STRPBRK > > +char * > > +#else > > +size_t > > +#endif > > +__attribute__ ((section (".text.sse4.2"))) > > +STRCSPN_SSE42 (const char *s, const char *a) > > +{ > > + if (*a == 0) > > + RETURN (NULL, strlen (s)); > > + > > + const char *aligned; > > + __m128i mask, maskz, zero; > > + unsigned int maskz_bits; > > + unsigned int offset = (unsigned int) ((size_t) a & 15); > > + zero = _mm_set1_epi8 (0); > > + if (offset != 0) > > + { > > + /* Load masks. */ > > + aligned = (const char *) ((size_t) a & -16L); > > + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); > > + maskz = _mm_cmpeq_epi8 (mask0, zero); > > + > > + /* Find where the NULL terminator is. */ > > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > > + if (maskz_bits != 0) > > + { > > + mask = __m128i_shift_right (mask0, offset); > > + offset = (unsigned int) ((size_t) s & 15); > > + if (offset) > > + goto start_unaligned; > > + > > + aligned = s; > > + goto start_loop; > > + } > > + } > > + > > + /* A is aligned. 
*/ > > + mask = _mm_loadu_si128 ((__m128i *) a); > > + /* Find where the NULL terminator is. */ > > + maskz = _mm_cmpeq_epi8 (mask, zero); > > + maskz_bits = _mm_movemask_epi8 (maskz); > > + if (maskz_bits == 0) > > + { > > + /* There is no NULL terminator. Don't use SSE4.2 if the length > > + of A > 16. */ > > + if (a[16] != 0) > > + return STRCSPN_GENERIC (s, a); > > + } > > + > > + aligned = s; > > + offset = (unsigned int) ((size_t) s & 15); > > + if (offset != 0) > > + { > > + start_unaligned: > > + /* Check partial string. */ > > + aligned = (const char *) ((size_t) s & -16L); > > + __m128i value = _mm_load_si128 ((__m128i *) aligned); > > + > > + value = __m128i_shift_right (value, offset); > > + > > + unsigned int length = _mm_cmpistri (mask, value, 0x2); > > + /* No need to check ZFlag since ZFlag is always 1. */ > > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); > > + if (cflag) > > + RETURN ((char *) (s + length), length); > > + /* Find where the NULL terminator is. */ > > + unsigned int index = _mm_cmpistri (value, value, 0x3a); > > + if (index < 16 - offset) > > + RETURN (NULL, index); > > + aligned += 16; > > + } > > + > > +start_loop: > > + while (1) > > + { > > + __m128i value = _mm_load_si128 ((__m128i *) aligned); > > + unsigned int index = _mm_cmpistri (mask, value, 0x2); > > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); > > + unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); > > + if (cflag) > > + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); > > + if (zflag) > > + RETURN (NULL, > > + /* Find where the NULL terminator is. 
*/ > > + (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); > > + aligned += 16; > > + } > > +} > > diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c > > index c312fab8b1..423de2e2b2 100644 > > --- a/sysdeps/x86_64/multiarch/strcspn-c.c > > +++ b/sysdeps/x86_64/multiarch/strcspn-c.c > > @@ -1,5 +1,5 @@ > > -/* strcspn with SSE4.2 intrinsics > > - Copyright (C) 2009-2022 Free Software Foundation, Inc. > > +/* strcspn. > > + Copyright (C) 2017-2022 Free Software Foundation, Inc. > > This file is part of the GNU C Library. > > > > The GNU C Library is free software; you can redistribute it and/or > > @@ -16,148 +16,13 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#include <nmmintrin.h> > > -#include <string.h> > > -#include "varshift.h" > > +#if IS_IN (libc) > > > > -/* We use 0x2: > > - _SIDD_SBYTE_OPS > > - | _SIDD_CMP_EQUAL_ANY > > - | _SIDD_POSITIVE_POLARITY > > - | _SIDD_LEAST_SIGNIFICANT > > - on pcmpistri to compare xmm/mem128 > > +# include <sysdep.h> > > +# define STRCSPN __strcspn_generic > > > > - 0 1 2 3 4 5 6 7 8 9 A B C D E F > > - X X X X X X X X X X X X X X X X > > - > > - against xmm > > - > > - 0 1 2 3 4 5 6 7 8 9 A B C D E F > > - A A A A A A A A A A A A A A A A > > - > > - to find out if the first 16byte data element has any byte A and > > - the offset of the first byte. There are 3 cases: > > - > > - 1. The first 16byte data element has the byte A at the offset X. > > - 2. The first 16byte data element has EOS and doesn't have the byte A. > > - 3. The first 16byte data element is valid and doesn't have the byte A. > > - > > - Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: > > - > > - 1 X 1 0/1 0 > > - 2 16 0 1 0 > > - 3 16 0 0 0 > > - > > - We exit from the loop for cases 1 and 2 with jbe which branches > > - when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset > > - X for case 1. 
*/ > > - > > -#ifndef STRCSPN_SSE2 > > -# define STRCSPN_SSE2 __strcspn_sse2 > > -# define STRCSPN_SSE42 __strcspn_sse42 > > -#endif > > - > > -#ifdef USE_AS_STRPBRK > > -# define RETURN(val1, val2) return val1 > > -#else > > -# define RETURN(val1, val2) return val2 > > -#endif > > - > > -extern > > -#ifdef USE_AS_STRPBRK > > -char * > > -#else > > -size_t > > -#endif > > -STRCSPN_SSE2 (const char *, const char *) attribute_hidden; > > - > > - > > -#ifdef USE_AS_STRPBRK > > -char * > > -#else > > -size_t > > +# undef libc_hidden_builtin_def > > +# define libc_hidden_builtin_def(STRCSPN) > > #endif > > -__attribute__ ((section (".text.sse4.2"))) > > -STRCSPN_SSE42 (const char *s, const char *a) > > -{ > > - if (*a == 0) > > - RETURN (NULL, strlen (s)); > > - > > - const char *aligned; > > - __m128i mask, maskz, zero; > > - unsigned int maskz_bits; > > - unsigned int offset = (unsigned int) ((size_t) a & 15); > > - zero = _mm_set1_epi8 (0); > > - if (offset != 0) > > - { > > - /* Load masks. */ > > - aligned = (const char *) ((size_t) a & -16L); > > - __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); > > - maskz = _mm_cmpeq_epi8 (mask0, zero); > > - > > - /* Find where the NULL terminator is. */ > > - maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > > - if (maskz_bits != 0) > > - { > > - mask = __m128i_shift_right (mask0, offset); > > - offset = (unsigned int) ((size_t) s & 15); > > - if (offset) > > - goto start_unaligned; > > - > > - aligned = s; > > - goto start_loop; > > - } > > - } > > - > > - /* A is aligned. */ > > - mask = _mm_loadu_si128 ((__m128i *) a); > > - /* Find where the NULL terminator is. */ > > - maskz = _mm_cmpeq_epi8 (mask, zero); > > - maskz_bits = _mm_movemask_epi8 (maskz); > > - if (maskz_bits == 0) > > - { > > - /* There is no NULL terminator. Don't use SSE4.2 if the length > > - of A > 16. 
*/ > > - if (a[16] != 0) > > - return STRCSPN_SSE2 (s, a); > > - } > > - > > - aligned = s; > > - offset = (unsigned int) ((size_t) s & 15); > > - if (offset != 0) > > - { > > - start_unaligned: > > - /* Check partial string. */ > > - aligned = (const char *) ((size_t) s & -16L); > > - __m128i value = _mm_load_si128 ((__m128i *) aligned); > > - > > - value = __m128i_shift_right (value, offset); > > - > > - unsigned int length = _mm_cmpistri (mask, value, 0x2); > > - /* No need to check ZFlag since ZFlag is always 1. */ > > - unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); > > - if (cflag) > > - RETURN ((char *) (s + length), length); > > - /* Find where the NULL terminator is. */ > > - unsigned int index = _mm_cmpistri (value, value, 0x3a); > > - if (index < 16 - offset) > > - RETURN (NULL, index); > > - aligned += 16; > > - } > > > > -start_loop: > > - while (1) > > - { > > - __m128i value = _mm_load_si128 ((__m128i *) aligned); > > - unsigned int index = _mm_cmpistri (mask, value, 0x2); > > - unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); > > - unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); > > - if (cflag) > > - RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); > > - if (zflag) > > - RETURN (NULL, > > - /* Find where the NULL terminator is. */ > > - (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); > > - aligned += 16; > > - } > > -} > > +#include <string/strcspn.c> > > diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.c b/sysdeps/x86_64/multiarch/strcspn-sse2.c > > deleted file mode 100644 > > index 3a04bb39fc..0000000000 > > --- a/sysdeps/x86_64/multiarch/strcspn-sse2.c > > +++ /dev/null > > @@ -1,28 +0,0 @@ > > -/* strcspn. > > - Copyright (C) 2017-2022 Free Software Foundation, Inc. > > - This file is part of the GNU C Library. 
> > - > > - The GNU C Library is free software; you can redistribute it and/or > > - modify it under the terms of the GNU Lesser General Public > > - License as published by the Free Software Foundation; either > > - version 2.1 of the License, or (at your option) any later version. > > - > > - The GNU C Library is distributed in the hope that it will be useful, > > - but WITHOUT ANY WARRANTY; without even the implied warranty of > > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > - Lesser General Public License for more details. > > - > > - You should have received a copy of the GNU Lesser General Public > > - License along with the GNU C Library; if not, see > > - <https://www.gnu.org/licenses/>. */ > > - > > -#if IS_IN (libc) > > - > > -# include <sysdep.h> > > -# define STRCSPN __strcspn_sse2 > > - > > -# undef libc_hidden_builtin_def > > -# define libc_hidden_builtin_def(STRCSPN) > > -#endif > > - > > -#include <string/strcspn.c> > > diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c > > index 93a7fab7ea..b729c033d9 100644 > > --- a/sysdeps/x86_64/multiarch/strncat-c.c > > +++ b/sysdeps/x86_64/multiarch/strncat-c.c > > @@ -1,2 +1,2 @@ > > -#define STRNCAT __strncat_sse2 > > +#define STRNCAT __strncat_generic > > #include <string/strncat.c> > > diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c > > index b649343a97..50fba8a41f 100644 > > --- a/sysdeps/x86_64/multiarch/strncat.c > > +++ b/sysdeps/x86_64/multiarch/strncat.c > > @@ -24,6 +24,7 @@ > > # undef strncat > > > > # define SYMBOL_NAME strncat > > +# define GENERIC generic > > # include "ifunc-strcpy.h" > > > > libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ()); > > diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c > > index 57c45ac7ab..183b0b8e0f 100644 > > --- a/sysdeps/x86_64/multiarch/strncpy-c.c > > +++ b/sysdeps/x86_64/multiarch/strncpy-c.c > > @@ 
-1,4 +1,4 @@ > > -#define STRNCPY __strncpy_sse2 > > +#define STRNCPY __strncpy_generic > > #undef libc_hidden_builtin_def > > #define libc_hidden_builtin_def(strncpy) > > > > diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c > > index 2a780a7e16..7fc7d72ec5 100644 > > --- a/sysdeps/x86_64/multiarch/strncpy.c > > +++ b/sysdeps/x86_64/multiarch/strncpy.c > > @@ -24,6 +24,7 @@ > > # undef strncpy > > > > # define SYMBOL_NAME strncpy > > +# define GENERIC generic > > # include "ifunc-strcpy.h" > > > > libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ()); > > diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c > > similarity index 74% > > rename from sysdeps/x86_64/multiarch/strspn-sse2.c > > rename to sysdeps/x86_64/multiarch/strpbrk-c-sse4.c > > index 61cc6cb0a5..8700276773 100644 > > --- a/sysdeps/x86_64/multiarch/strspn-sse2.c > > +++ b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c > > @@ -1,5 +1,5 @@ > > -/* strspn. > > - Copyright (C) 2017-2022 Free Software Foundation, Inc. > > +/* strpbrk with SSE4.2 intrinsics > > + Copyright (C) 2022 Free Software Foundation, Inc. > > This file is part of the GNU C Library. > > > > The GNU C Library is free software; you can redistribute it and/or > > @@ -16,13 +16,7 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. 
*/ > > > > -#if IS_IN (libc) > > - > > -# include <sysdep.h> > > -# define STRSPN __strspn_sse2 > > - > > -# undef libc_hidden_builtin_def > > -# define libc_hidden_builtin_def(STRSPN) > > -#endif > > - > > -#include <string/strspn.c> > > +#define USE_AS_STRPBRK > > +#define STRCSPN_GENERIC __strpbrk_generic > > +#define STRCSPN_SSE42 __strpbrk_sse42 > > +#include "strcspn-c-sse4.c" > > diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c > > index abf4ff7f1a..d31acfe495 100644 > > --- a/sysdeps/x86_64/multiarch/strpbrk-c.c > > +++ b/sysdeps/x86_64/multiarch/strpbrk-c.c > > @@ -1,5 +1,5 @@ > > -/* strpbrk with SSE4.2 intrinsics > > - Copyright (C) 2022 Free Software Foundation, Inc. > > +/* strpbrk. > > + Copyright (C) 2017-2022 Free Software Foundation, Inc. > > This file is part of the GNU C Library. > > > > The GNU C Library is free software; you can redistribute it and/or > > @@ -16,7 +16,13 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#define USE_AS_STRPBRK > > -#define STRCSPN_SSE2 __strpbrk_sse2 > > -#define STRCSPN_SSE42 __strpbrk_sse42 > > -#include "strcspn-c.c" > > +#if IS_IN (libc) > > + > > +# include <sysdep.h> > > +# define STRPBRK __strpbrk_generic > > + > > +# undef libc_hidden_builtin_def > > +# define libc_hidden_builtin_def(STRPBRK) > > +#endif > > + > > +#include <string/strpbrk.c> > > diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-sse2.c > > deleted file mode 100644 > > index d03214c4fb..0000000000 > > --- a/sysdeps/x86_64/multiarch/strpbrk-sse2.c > > +++ /dev/null > > @@ -1,28 +0,0 @@ > > -/* strpbrk. > > - Copyright (C) 2017-2022 Free Software Foundation, Inc. > > - This file is part of the GNU C Library. 
> > - > > - The GNU C Library is free software; you can redistribute it and/or > > - modify it under the terms of the GNU Lesser General Public > > - License as published by the Free Software Foundation; either > > - version 2.1 of the License, or (at your option) any later version. > > - > > - The GNU C Library is distributed in the hope that it will be useful, > > - but WITHOUT ANY WARRANTY; without even the implied warranty of > > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > - Lesser General Public License for more details. > > - > > - You should have received a copy of the GNU Lesser General Public > > - License along with the GNU C Library; if not, see > > - <https://www.gnu.org/licenses/>. */ > > - > > -#if IS_IN (libc) > > - > > -# include <sysdep.h> > > -# define STRPBRK __strpbrk_sse2 > > - > > -# undef libc_hidden_builtin_def > > -# define libc_hidden_builtin_def(STRPBRK) > > -#endif > > - > > -#include <string/strpbrk.c> > > diff --git a/sysdeps/x86_64/multiarch/strspn-c-sse4.c b/sysdeps/x86_64/multiarch/strspn-c-sse4.c > > new file mode 100644 > > index 0000000000..d044916688 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strspn-c-sse4.c > > @@ -0,0 +1,136 @@ > > +/* strspn with SSE4.2 intrinsics > > + Copyright (C) 2009-2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. 
> > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include <nmmintrin.h> > > +#include <string.h> > > +#include "varshift.h" > > + > > +/* We use 0x12: > > + _SIDD_SBYTE_OPS > > + | _SIDD_CMP_EQUAL_ANY > > + | _SIDD_NEGATIVE_POLARITY > > + | _SIDD_LEAST_SIGNIFICANT > > + on pcmpistri to compare xmm/mem128 > > + > > + 0 1 2 3 4 5 6 7 8 9 A B C D E F > > + X X X X X X X X X X X X X X X X > > + > > + against xmm > > + > > + 0 1 2 3 4 5 6 7 8 9 A B C D E F > > + A A A A A A A A A A A A A A A A > > + > > + to find out if the first 16byte data element has any non-A byte and > > + the offset of the first byte. There are 2 cases: > > + > > + 1. The first 16byte data element has the non-A byte, including > > + EOS, at the offset X. > > + 2. The first 16byte data element is valid and doesn't have the non-A > > + byte. > > + > > + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: > > + > > + case ECX CFlag ZFlag SFlag > > + 1 X 1 0/1 0 > > + 2 16 0 0 0 > > + > > + We exit from the loop for case 1. */ > > + > > +extern size_t __strspn_generic (const char *, const char *) attribute_hidden; > > + > > + > > +size_t > > +__attribute__ ((section (".text.sse4.2"))) > > +__strspn_sse42 (const char *s, const char *a) > > +{ > > + if (*a == 0) > > + return 0; > > + > > + const char *aligned; > > + __m128i mask, maskz, zero; > > + unsigned int maskz_bits; > > + unsigned int offset = (int) ((size_t) a & 15); > > + zero = _mm_set1_epi8 (0); > > + if (offset != 0) > > + { > > + /* Load masks. */ > > + aligned = (const char *) ((size_t) a & -16L); > > + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); > > + maskz = _mm_cmpeq_epi8 (mask0, zero); > > + > > + /* Find where the NULL terminator is. 
*/ > > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > > + if (maskz_bits != 0) > > + { > > + mask = __m128i_shift_right (mask0, offset); > > + offset = (unsigned int) ((size_t) s & 15); > > + if (offset) > > + goto start_unaligned; > > + > > + aligned = s; > > + goto start_loop; > > + } > > + } > > + > > + /* A is aligned. */ > > + mask = _mm_loadu_si128 ((__m128i *) a); > > + > > + /* Find where the NULL terminator is. */ > > + maskz = _mm_cmpeq_epi8 (mask, zero); > > + maskz_bits = _mm_movemask_epi8 (maskz); > > + if (maskz_bits == 0) > > + { > > + /* There is no NULL terminator. Don't use SSE4.2 if the length > > + of A > 16. */ > > + if (a[16] != 0) > > + return __strspn_generic (s, a); > > + } > > + aligned = s; > > + offset = (unsigned int) ((size_t) s & 15); > > + > > + if (offset != 0) > > + { > > + start_unaligned: > > + /* Check partial string. */ > > + aligned = (const char *) ((size_t) s & -16L); > > + __m128i value = _mm_load_si128 ((__m128i *) aligned); > > + __m128i adj_value = __m128i_shift_right (value, offset); > > + > > + unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); > > + /* No need to check CFlag since it is always 1. */ > > + if (length < 16 - offset) > > + return length; > > + /* Find where the NULL terminator is. 
*/ > > + maskz = _mm_cmpeq_epi8 (value, zero); > > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > > + if (maskz_bits != 0) > > + return length; > > + aligned += 16; > > + } > > + > > +start_loop: > > + while (1) > > + { > > + __m128i value = _mm_load_si128 ((__m128i *) aligned); > > + unsigned int index = _mm_cmpistri (mask, value, 0x12); > > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); > > + if (cflag) > > + return (size_t) (aligned + index - s); > > + aligned += 16; > > + } > > +} > > diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c > > index 6124033ceb..6b50c36432 100644 > > --- a/sysdeps/x86_64/multiarch/strspn-c.c > > +++ b/sysdeps/x86_64/multiarch/strspn-c.c > > @@ -1,5 +1,5 @@ > > -/* strspn with SSE4.2 intrinsics > > - Copyright (C) 2009-2022 Free Software Foundation, Inc. > > +/* strspn. > > + Copyright (C) 2017-2022 Free Software Foundation, Inc. > > This file is part of the GNU C Library. > > > > The GNU C Library is free software; you can redistribute it and/or > > @@ -16,121 +16,13 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#include <nmmintrin.h> > > -#include <string.h> > > -#include "varshift.h" > > +#if IS_IN (libc) > > > > -/* We use 0x12: > > - _SIDD_SBYTE_OPS > > - | _SIDD_CMP_EQUAL_ANY > > - | _SIDD_NEGATIVE_POLARITY > > - | _SIDD_LEAST_SIGNIFICANT > > - on pcmpistri to compare xmm/mem128 > > +# include <sysdep.h> > > +# define STRSPN __strspn_generic > > > > - 0 1 2 3 4 5 6 7 8 9 A B C D E F > > - X X X X X X X X X X X X X X X X > > +# undef libc_hidden_builtin_def > > +# define libc_hidden_builtin_def(STRSPN) > > +#endif > > > > - against xmm > > - > > - 0 1 2 3 4 5 6 7 8 9 A B C D E F > > - A A A A A A A A A A A A A A A A > > - > > - to find out if the first 16byte data element has any non-A byte and > > - the offset of the first byte. There are 2 cases: > > - > > - 1. 
The first 16byte data element has the non-A byte, including > > - EOS, at the offset X. > > - 2. The first 16byte data element is valid and doesn't have the non-A > > - byte. > > - > > - Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: > > - > > - case ECX CFlag ZFlag SFlag > > - 1 X 1 0/1 0 > > - 2 16 0 0 0 > > - > > - We exit from the loop for case 1. */ > > - > > -extern size_t __strspn_sse2 (const char *, const char *) attribute_hidden; > > - > > - > > -size_t > > -__attribute__ ((section (".text.sse4.2"))) > > -__strspn_sse42 (const char *s, const char *a) > > -{ > > - if (*a == 0) > > - return 0; > > - > > - const char *aligned; > > - __m128i mask, maskz, zero; > > - unsigned int maskz_bits; > > - unsigned int offset = (int) ((size_t) a & 15); > > - zero = _mm_set1_epi8 (0); > > - if (offset != 0) > > - { > > - /* Load masks. */ > > - aligned = (const char *) ((size_t) a & -16L); > > - __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); > > - maskz = _mm_cmpeq_epi8 (mask0, zero); > > - > > - /* Find where the NULL terminator is. */ > > - maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > > - if (maskz_bits != 0) > > - { > > - mask = __m128i_shift_right (mask0, offset); > > - offset = (unsigned int) ((size_t) s & 15); > > - if (offset) > > - goto start_unaligned; > > - > > - aligned = s; > > - goto start_loop; > > - } > > - } > > - > > - /* A is aligned. */ > > - mask = _mm_loadu_si128 ((__m128i *) a); > > - > > - /* Find where the NULL terminator is. */ > > - maskz = _mm_cmpeq_epi8 (mask, zero); > > - maskz_bits = _mm_movemask_epi8 (maskz); > > - if (maskz_bits == 0) > > - { > > - /* There is no NULL terminator. Don't use SSE4.2 if the length > > - of A > 16. */ > > - if (a[16] != 0) > > - return __strspn_sse2 (s, a); > > - } > > - aligned = s; > > - offset = (unsigned int) ((size_t) s & 15); > > - > > - if (offset != 0) > > - { > > - start_unaligned: > > - /* Check partial string. 
*/ > > - aligned = (const char *) ((size_t) s & -16L); > > - __m128i value = _mm_load_si128 ((__m128i *) aligned); > > - __m128i adj_value = __m128i_shift_right (value, offset); > > - > > - unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); > > - /* No need to check CFlag since it is always 1. */ > > - if (length < 16 - offset) > > - return length; > > - /* Find where the NULL terminator is. */ > > - maskz = _mm_cmpeq_epi8 (value, zero); > > - maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > > - if (maskz_bits != 0) > > - return length; > > - aligned += 16; > > - } > > - > > -start_loop: > > - while (1) > > - { > > - __m128i value = _mm_load_si128 ((__m128i *) aligned); > > - unsigned int index = _mm_cmpistri (mask, value, 0x12); > > - unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); > > - if (cflag) > > - return (size_t) (aligned + index - s); > > - aligned += 16; > > - } > > -} > > +#include <string/strspn.c> > > diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c > > index 26d6984e9b..fa38dd898d 100644 > > --- a/sysdeps/x86_64/multiarch/wcscpy-c.c > > +++ b/sysdeps/x86_64/multiarch/wcscpy-c.c > > @@ -1,5 +1,5 @@ > > #if IS_IN (libc) > > -# define WCSCPY __wcscpy_sse2 > > +# define WCSCPY __wcscpy_generic > > #endif > > > > #include <wcsmbs/wcscpy.c> > > diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c > > index 6a2d1421d9..53c3228dc2 100644 > > --- a/sysdeps/x86_64/multiarch/wcscpy.c > > +++ b/sysdeps/x86_64/multiarch/wcscpy.c > > @@ -26,7 +26,7 @@ > > # define SYMBOL_NAME wcscpy > > # include <init-arch.h> > > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > > > > static inline void * > > @@ -37,7 +37,7 @@ IFUNC_SELECTOR (void) > > if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > > return OPTIMIZE (ssse3); > 
> > > - return OPTIMIZE (sse2); > > + return OPTIMIZE (generic); > > } > > > > libc_ifunc_redirected (__redirect_wcscpy, __wcscpy, IFUNC_SELECTOR ()); > > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-c.c b/sysdeps/x86_64/multiarch/wcsnlen-c.c > > index e1ec7cfbb5..1c9c04241a 100644 > > --- a/sysdeps/x86_64/multiarch/wcsnlen-c.c > > +++ b/sysdeps/x86_64/multiarch/wcsnlen-c.c > > @@ -1,9 +1,9 @@ > > #if IS_IN (libc) > > # include <wchar.h> > > > > -# define WCSNLEN __wcsnlen_sse2 > > +# define WCSNLEN __wcsnlen_generic > > > > -extern __typeof (wcsnlen) __wcsnlen_sse2; > > +extern __typeof (wcsnlen) __wcsnlen_generic; > > #endif > > > > #include "wcsmbs/wcsnlen.c" > > diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c > > index baa26666a8..05b7a211de 100644 > > --- a/sysdeps/x86_64/multiarch/wcsnlen.c > > +++ b/sysdeps/x86_64/multiarch/wcsnlen.c > > @@ -24,6 +24,7 @@ > > # undef __wcsnlen > > > > # define SYMBOL_NAME wcsnlen > > +# define GENERIC generic > > # include "ifunc-wcslen.h" > > > > libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ()); > > -- > > 2.34.1 > > > > > -- > H.J.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 3d153cac35..86c6ecdfc1 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -77,7 +77,7 @@ sysdep_routines += \ strcpy-sse2 \ strcpy-sse2-unaligned \ strcspn-c \ - strcspn-sse2 \ + strcspn-c-sse4 \ strlen-avx2 \ strlen-avx2-rtm \ strlen-evex \ @@ -109,21 +109,22 @@ sysdep_routines += \ strnlen-evex512 \ strnlen-sse2 \ strpbrk-c \ - strpbrk-sse2 \ + strpbrk-c-sse4 \ strrchr-avx2 \ strrchr-avx2-rtm \ strrchr-evex \ strrchr-sse2 \ strspn-c \ - strspn-sse2 \ + strspn-c-sse4 \ strstr-avx512 \ strstr-sse2-unaligned \ varshift \ # sysdep_routines -CFLAGS-varshift.c += -msse4 -CFLAGS-strcspn-c.c += -msse4 -CFLAGS-strpbrk-c.c += -msse4 -CFLAGS-strspn-c.c += -msse4 + +CFLAGS-strcspn-c-sse4.c += -msse4 +CFLAGS-strpbrk-c-sse4.c += -msse4 +CFLAGS-strspn-c-sse4.c += -msse4 + CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3 endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 58f3ec8306..4cbd200d39 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -372,7 +372,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __stpncpy_evex) IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2_unaligned) - IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_generic)) /* Support sysdeps/x86_64/multiarch/stpcpy.c. */ IFUNC_IMPL (i, name, stpcpy, @@ -531,7 +531,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcspn, IFUNC_IMPL_ADD (array, i, strcspn, CPU_FEATURE_USABLE (SSE4_2), __strcspn_sse42) - IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2)) + IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_generic)) /* Support sysdeps/x86_64/multiarch/strncase_l.c. 
*/ IFUNC_IMPL (i, name, strncasecmp, @@ -585,7 +585,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strncat_evex) IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2_unaligned) - IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2)) + IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_generic)) /* Support sysdeps/x86_64/multiarch/strncpy.c. */ IFUNC_IMPL (i, name, strncpy, @@ -601,20 +601,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strncpy_evex) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2_unaligned) - IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) + IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_generic)) /* Support sysdeps/x86_64/multiarch/strpbrk.c. */ IFUNC_IMPL (i, name, strpbrk, IFUNC_IMPL_ADD (array, i, strpbrk, CPU_FEATURE_USABLE (SSE4_2), __strpbrk_sse42) - IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2)) + IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_generic)) /* Support sysdeps/x86_64/multiarch/strspn.c. */ IFUNC_IMPL (i, name, strspn, IFUNC_IMPL_ADD (array, i, strspn, CPU_FEATURE_USABLE (SSE4_2), __strspn_sse42) - IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2)) + IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_generic)) /* Support sysdeps/x86_64/multiarch/strstr.c. */ IFUNC_IMPL (i, name, strstr, @@ -697,7 +697,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, wcscpy, IFUNC_IMPL_ADD (array, i, wcscpy, CPU_FEATURE_USABLE (SSSE3), __wcscpy_ssse3) - IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2)) + IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_generic)) /* Support sysdeps/x86_64/multiarch/wcslen.c. 
*/ IFUNC_IMPL (i, name, wcslen, @@ -749,7 +749,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcsnlen, CPU_FEATURE_USABLE (SSE4_1), __wcsnlen_sse4_1) - IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2)) + IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_generic)) /* Support sysdeps/x86_64/multiarch/wmemchr.c. */ IFUNC_IMPL (i, name, wmemchr, diff --git a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h index b555ff2fac..ee36525bcf 100644 --- a/sysdeps/x86_64/multiarch/ifunc-sse4_2.h +++ b/sysdeps/x86_64/multiarch/ifunc-sse4_2.h @@ -19,7 +19,7 @@ #include <init-arch.h> -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; static inline void * @@ -30,5 +30,5 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)) return OPTIMIZE (sse42); - return OPTIMIZE (sse2); + return OPTIMIZE (generic); } diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h index a15afa44e9..80529458d1 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h @@ -20,7 +20,11 @@ #include <init-arch.h> -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +#ifndef GENERIC +# define GENERIC sse2 +#endif + +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; @@ -49,5 +53,5 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); - return OPTIMIZE (sse2); + return OPTIMIZE (GENERIC); } diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h index 2b29e7608a..88c1c502af 100644 --- 
a/sysdeps/x86_64/multiarch/ifunc-wcslen.h +++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h @@ -19,7 +19,11 @@ #include <init-arch.h> -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +#ifndef GENERIC +# define GENERIC sse2 +#endif + +extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; @@ -48,5 +52,5 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) return OPTIMIZE (sse4_1); - return OPTIMIZE (sse2); + return OPTIMIZE (GENERIC); } diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c index b016e487e1..eb62fcf388 100644 --- a/sysdeps/x86_64/multiarch/stpncpy-c.c +++ b/sysdeps/x86_64/multiarch/stpncpy-c.c @@ -1,4 +1,4 @@ -#define STPNCPY __stpncpy_sse2 +#define STPNCPY __stpncpy_generic #undef weak_alias #define weak_alias(ignored1, ignored2) #undef libc_hidden_def diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c index 82fa53957d..879bc83f0b 100644 --- a/sysdeps/x86_64/multiarch/stpncpy.c +++ b/sysdeps/x86_64/multiarch/stpncpy.c @@ -25,6 +25,7 @@ # undef stpncpy # undef __stpncpy +# define GENERIC generic # define SYMBOL_NAME stpncpy # include "ifunc-strcpy.h" diff --git a/sysdeps/x86_64/multiarch/strcspn-c-sse4.c b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c new file mode 100644 index 0000000000..59f64f9fe8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcspn-c-sse4.c @@ -0,0 +1,163 @@ +/* strcspn with SSE4.2 intrinsics + Copyright (C) 2009-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <nmmintrin.h> +#include <string.h> +#include "varshift.h" + +/* We use 0x2: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_POSITIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any byte A and + the offset of the first byte. There are 3 cases: + + 1. The first 16byte data element has the byte A at the offset X. + 2. The first 16byte data element has EOS and doesn't have the byte A. + 3. The first 16byte data element is valid and doesn't have the byte A. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases: + + 1 X 1 0/1 0 + 2 16 0 1 0 + 3 16 0 0 0 + + We exit from the loop for cases 1 and 2 with jbe which branches + when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset + X for case 1.
*/ + +#ifndef STRCSPN_GENERIC +# define STRCSPN_GENERIC __strcspn_generic +# define STRCSPN_SSE42 __strcspn_sse42 +#endif + +#ifdef USE_AS_STRPBRK +# define RETURN(val1, val2) return val1 +#else +# define RETURN(val1, val2) return val2 +#endif + +extern +#ifdef USE_AS_STRPBRK +char * +#else +size_t +#endif +STRCSPN_GENERIC (const char *, const char *) attribute_hidden; + + +#ifdef USE_AS_STRPBRK +char * +#else +size_t +#endif +__attribute__ ((section (".text.sse4.2"))) +STRCSPN_SSE42 (const char *s, const char *a) +{ + if (*a == 0) + RETURN (NULL, strlen (s)); + + const char *aligned; + __m128i mask, maskz, zero; + unsigned int maskz_bits; + unsigned int offset = (unsigned int) ((size_t) a & 15); + zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; + if (maskz_bits != 0) + { + mask = __m128i_shift_right (mask0, offset); + offset = (unsigned int) ((size_t) s & 15); + if (offset) + goto start_unaligned; + + aligned = s; + goto start_loop; + } + } + + /* A is aligned. */ + mask = _mm_loadu_si128 ((__m128i *) a); + /* Find where the NULL terminator is. */ + maskz = _mm_cmpeq_epi8 (mask, zero); + maskz_bits = _mm_movemask_epi8 (maskz); + if (maskz_bits == 0) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return STRCSPN_GENERIC (s, a); + } + + aligned = s; + offset = (unsigned int) ((size_t) s & 15); + if (offset != 0) + { + start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + value = __m128i_shift_right (value, offset); + + unsigned int length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. 
*/ + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); + if (cflag) + RETURN ((char *) (s + length), length); + /* Find where the NULL terminator is. */ + unsigned int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } + +start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + unsigned int index = _mm_cmpistri (mask, value, 0x2); + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); + unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) + RETURN (NULL, + /* Find where the NULL terminator is. */ + (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); + aligned += 16; + } +} diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c index c312fab8b1..423de2e2b2 100644 --- a/sysdeps/x86_64/multiarch/strcspn-c.c +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -1,5 +1,5 @@ -/* strcspn with SSE4.2 intrinsics - Copyright (C) 2009-2022 Free Software Foundation, Inc. +/* strcspn. + Copyright (C) 2017-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,148 +16,13 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include <nmmintrin.h> -#include <string.h> -#include "varshift.h" +#if IS_IN (libc) -/* We use 0x2: - _SIDD_SBYTE_OPS - | _SIDD_CMP_EQUAL_ANY - | _SIDD_POSITIVE_POLARITY - | _SIDD_LEAST_SIGNIFICANT - on pcmpistri to compare xmm/mem128 +# include <sysdep.h> +# define STRCSPN __strcspn_generic - 0 1 2 3 4 5 6 7 8 9 A B C D E F - X X X X X X X X X X X X X X X X - - against xmm - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - A A A A A A A A A A A A A A A A - - to find out if the first 16byte data element has any byte A and - the offset of the first byte. There are 3 cases: - - 1. 
The first 16byte data element has the byte A at the offset X. - 2. The first 16byte data element has EOS and doesn't have the byte A. - 3. The first 16byte data element is valid and doesn't have the byte A. - - Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: - - 1 X 1 0/1 0 - 2 16 0 1 0 - 3 16 0 0 0 - - We exit from the loop for cases 1 and 2 with jbe which branches - when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset - X for case 1. */ - -#ifndef STRCSPN_SSE2 -# define STRCSPN_SSE2 __strcspn_sse2 -# define STRCSPN_SSE42 __strcspn_sse42 -#endif - -#ifdef USE_AS_STRPBRK -# define RETURN(val1, val2) return val1 -#else -# define RETURN(val1, val2) return val2 -#endif - -extern -#ifdef USE_AS_STRPBRK -char * -#else -size_t -#endif -STRCSPN_SSE2 (const char *, const char *) attribute_hidden; - - -#ifdef USE_AS_STRPBRK -char * -#else -size_t +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(STRCSPN) #endif -__attribute__ ((section (".text.sse4.2"))) -STRCSPN_SSE42 (const char *s, const char *a) -{ - if (*a == 0) - RETURN (NULL, strlen (s)); - - const char *aligned; - __m128i mask, maskz, zero; - unsigned int maskz_bits; - unsigned int offset = (unsigned int) ((size_t) a & 15); - zero = _mm_set1_epi8 (0); - if (offset != 0) - { - /* Load masks. */ - aligned = (const char *) ((size_t) a & -16L); - __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - maskz = _mm_cmpeq_epi8 (mask0, zero); - - /* Find where the NULL terminator is. */ - maskz_bits = _mm_movemask_epi8 (maskz) >> offset; - if (maskz_bits != 0) - { - mask = __m128i_shift_right (mask0, offset); - offset = (unsigned int) ((size_t) s & 15); - if (offset) - goto start_unaligned; - - aligned = s; - goto start_loop; - } - } - - /* A is aligned. */ - mask = _mm_loadu_si128 ((__m128i *) a); - /* Find where the NULL terminator is. */ - maskz = _mm_cmpeq_epi8 (mask, zero); - maskz_bits = _mm_movemask_epi8 (maskz); - if (maskz_bits == 0) - { - /* There is no NULL terminator. 
Don't use SSE4.2 if the length - of A > 16. */ - if (a[16] != 0) - return STRCSPN_SSE2 (s, a); - } - - aligned = s; - offset = (unsigned int) ((size_t) s & 15); - if (offset != 0) - { - start_unaligned: - /* Check partial string. */ - aligned = (const char *) ((size_t) s & -16L); - __m128i value = _mm_load_si128 ((__m128i *) aligned); - - value = __m128i_shift_right (value, offset); - - unsigned int length = _mm_cmpistri (mask, value, 0x2); - /* No need to check ZFlag since ZFlag is always 1. */ - unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); - if (cflag) - RETURN ((char *) (s + length), length); - /* Find where the NULL terminator is. */ - unsigned int index = _mm_cmpistri (value, value, 0x3a); - if (index < 16 - offset) - RETURN (NULL, index); - aligned += 16; - } -start_loop: - while (1) - { - __m128i value = _mm_load_si128 ((__m128i *) aligned); - unsigned int index = _mm_cmpistri (mask, value, 0x2); - unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); - unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); - if (cflag) - RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); - if (zflag) - RETURN (NULL, - /* Find where the NULL terminator is. */ - (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); - aligned += 16; - } -} +#include <string/strcspn.c> diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.c b/sysdeps/x86_64/multiarch/strcspn-sse2.c deleted file mode 100644 index 3a04bb39fc..0000000000 --- a/sysdeps/x86_64/multiarch/strcspn-sse2.c +++ /dev/null @@ -1,28 +0,0 @@ -/* strcspn. - Copyright (C) 2017-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# include <sysdep.h> -# define STRCSPN __strcspn_sse2 - -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(STRCSPN) -#endif - -#include <string/strcspn.c> diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c index 93a7fab7ea..b729c033d9 100644 --- a/sysdeps/x86_64/multiarch/strncat-c.c +++ b/sysdeps/x86_64/multiarch/strncat-c.c @@ -1,2 +1,2 @@ -#define STRNCAT __strncat_sse2 +#define STRNCAT __strncat_generic #include <string/strncat.c> diff --git a/sysdeps/x86_64/multiarch/strncat.c b/sysdeps/x86_64/multiarch/strncat.c index b649343a97..50fba8a41f 100644 --- a/sysdeps/x86_64/multiarch/strncat.c +++ b/sysdeps/x86_64/multiarch/strncat.c @@ -24,6 +24,7 @@ # undef strncat # define SYMBOL_NAME strncat +# define GENERIC generic # include "ifunc-strcpy.h" libc_ifunc_redirected (__redirect_strncat, strncat, IFUNC_SELECTOR ()); diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c index 57c45ac7ab..183b0b8e0f 100644 --- a/sysdeps/x86_64/multiarch/strncpy-c.c +++ b/sysdeps/x86_64/multiarch/strncpy-c.c @@ -1,4 +1,4 @@ -#define STRNCPY __strncpy_sse2 +#define STRNCPY __strncpy_generic #undef libc_hidden_builtin_def #define libc_hidden_builtin_def(strncpy) diff --git a/sysdeps/x86_64/multiarch/strncpy.c b/sysdeps/x86_64/multiarch/strncpy.c index 2a780a7e16..7fc7d72ec5 100644 --- a/sysdeps/x86_64/multiarch/strncpy.c +++ b/sysdeps/x86_64/multiarch/strncpy.c @@ -24,6 +24,7 @@ # undef strncpy # define SYMBOL_NAME strncpy +# define GENERIC generic # include 
"ifunc-strcpy.h" libc_ifunc_redirected (__redirect_strncpy, strncpy, IFUNC_SELECTOR ()); diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c similarity index 74% rename from sysdeps/x86_64/multiarch/strspn-sse2.c rename to sysdeps/x86_64/multiarch/strpbrk-c-sse4.c index 61cc6cb0a5..8700276773 100644 --- a/sysdeps/x86_64/multiarch/strspn-sse2.c +++ b/sysdeps/x86_64/multiarch/strpbrk-c-sse4.c @@ -1,5 +1,5 @@ -/* strspn. - Copyright (C) 2017-2022 Free Software Foundation, Inc. +/* strpbrk with SSE4.2 intrinsics + Copyright (C) 2022 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,13 +16,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) - -# include <sysdep.h> -# define STRSPN __strspn_sse2 - -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(STRSPN) -#endif - -#include <string/strspn.c> +#define USE_AS_STRPBRK +#define STRCSPN_GENERIC __strpbrk_generic +#define STRCSPN_SSE42 __strpbrk_sse42 +#include "strcspn-c-sse4.c" diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c index abf4ff7f1a..d31acfe495 100644 --- a/sysdeps/x86_64/multiarch/strpbrk-c.c +++ b/sysdeps/x86_64/multiarch/strpbrk-c.c @@ -1,5 +1,5 @@ -/* strpbrk with SSE4.2 intrinsics - Copyright (C) 2022 Free Software Foundation, Inc. +/* strpbrk. + Copyright (C) 2017-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,7 +16,13 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#define USE_AS_STRPBRK -#define STRCSPN_SSE2 __strpbrk_sse2 -#define STRCSPN_SSE42 __strpbrk_sse42 -#include "strcspn-c.c" +#if IS_IN (libc) + +# include <sysdep.h> +# define STRPBRK __strpbrk_generic + +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(STRPBRK) +#endif + +#include <string/strpbrk.c> diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-sse2.c deleted file mode 100644 index d03214c4fb..0000000000 --- a/sysdeps/x86_64/multiarch/strpbrk-sse2.c +++ /dev/null @@ -1,28 +0,0 @@ -/* strpbrk. - Copyright (C) 2017-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# include <sysdep.h> -# define STRPBRK __strpbrk_sse2 - -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(STRPBRK) -#endif - -#include <string/strpbrk.c> diff --git a/sysdeps/x86_64/multiarch/strspn-c-sse4.c b/sysdeps/x86_64/multiarch/strspn-c-sse4.c new file mode 100644 index 0000000000..d044916688 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn-c-sse4.c @@ -0,0 +1,136 @@ +/* strspn with SSE4.2 intrinsics + Copyright (C) 2009-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <nmmintrin.h> +#include <string.h> +#include "varshift.h" + +/* We use 0x12: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any non-A byte and + the offset of the first byte. There are 2 cases: + + 1. The first 16byte data element has the non-A byte, including + EOS, at the offset X. + 2. The first 16byte data element is valid and doesn't have the non-A + byte. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + case ECX CFlag ZFlag SFlag + 1 X 1 0/1 0 + 2 16 0 0 0 + + We exit from the loop for case 1. */ + +extern size_t __strspn_generic (const char *, const char *) attribute_hidden; + + +size_t +__attribute__ ((section (".text.sse4.2"))) +__strspn_sse42 (const char *s, const char *a) +{ + if (*a == 0) + return 0; + + const char *aligned; + __m128i mask, maskz, zero; + unsigned int maskz_bits; + unsigned int offset = (int) ((size_t) a & 15); + zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. 
*/ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; + if (maskz_bits != 0) + { + mask = __m128i_shift_right (mask0, offset); + offset = (unsigned int) ((size_t) s & 15); + if (offset) + goto start_unaligned; + + aligned = s; + goto start_loop; + } + } + + /* A is aligned. */ + mask = _mm_loadu_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + maskz = _mm_cmpeq_epi8 (mask, zero); + maskz_bits = _mm_movemask_epi8 (maskz); + if (maskz_bits == 0) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return __strspn_generic (s, a); + } + aligned = s; + offset = (unsigned int) ((size_t) s & 15); + + if (offset != 0) + { + start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + __m128i adj_value = __m128i_shift_right (value, offset); + + unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. 
*/ + maskz = _mm_cmpeq_epi8 (value, zero); + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; + if (maskz_bits != 0) + return length; + aligned += 16; + } + +start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + unsigned int index = _mm_cmpistri (mask, value, 0x12); + unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; + } +} diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c index 6124033ceb..6b50c36432 100644 --- a/sysdeps/x86_64/multiarch/strspn-c.c +++ b/sysdeps/x86_64/multiarch/strspn-c.c @@ -1,5 +1,5 @@ -/* strspn with SSE4.2 intrinsics - Copyright (C) 2009-2022 Free Software Foundation, Inc. +/* strspn. + Copyright (C) 2017-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,121 +16,13 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include <nmmintrin.h> -#include <string.h> -#include "varshift.h" +#if IS_IN (libc) -/* We use 0x12: - _SIDD_SBYTE_OPS - | _SIDD_CMP_EQUAL_ANY - | _SIDD_NEGATIVE_POLARITY - | _SIDD_LEAST_SIGNIFICANT - on pcmpistri to compare xmm/mem128 +# include <sysdep.h> +# define STRSPN __strspn_generic - 0 1 2 3 4 5 6 7 8 9 A B C D E F - X X X X X X X X X X X X X X X X +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(STRSPN) +#endif - against xmm - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - A A A A A A A A A A A A A A A A - - to find out if the first 16byte data element has any non-A byte and - the offset of the first byte. There are 2 cases: - - 1. The first 16byte data element has the non-A byte, including - EOS, at the offset X. - 2. The first 16byte data element is valid and doesn't have the non-A - byte. 
- - Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: - - case ECX CFlag ZFlag SFlag - 1 X 1 0/1 0 - 2 16 0 0 0 - - We exit from the loop for case 1. */ - -extern size_t __strspn_sse2 (const char *, const char *) attribute_hidden; - - -size_t -__attribute__ ((section (".text.sse4.2"))) -__strspn_sse42 (const char *s, const char *a) -{ - if (*a == 0) - return 0; - - const char *aligned; - __m128i mask, maskz, zero; - unsigned int maskz_bits; - unsigned int offset = (int) ((size_t) a & 15); - zero = _mm_set1_epi8 (0); - if (offset != 0) - { - /* Load masks. */ - aligned = (const char *) ((size_t) a & -16L); - __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - maskz = _mm_cmpeq_epi8 (mask0, zero); - - /* Find where the NULL terminator is. */ - maskz_bits = _mm_movemask_epi8 (maskz) >> offset; - if (maskz_bits != 0) - { - mask = __m128i_shift_right (mask0, offset); - offset = (unsigned int) ((size_t) s & 15); - if (offset) - goto start_unaligned; - - aligned = s; - goto start_loop; - } - } - - /* A is aligned. */ - mask = _mm_loadu_si128 ((__m128i *) a); - - /* Find where the NULL terminator is. */ - maskz = _mm_cmpeq_epi8 (mask, zero); - maskz_bits = _mm_movemask_epi8 (maskz); - if (maskz_bits == 0) - { - /* There is no NULL terminator. Don't use SSE4.2 if the length - of A > 16. */ - if (a[16] != 0) - return __strspn_sse2 (s, a); - } - aligned = s; - offset = (unsigned int) ((size_t) s & 15); - - if (offset != 0) - { - start_unaligned: - /* Check partial string. */ - aligned = (const char *) ((size_t) s & -16L); - __m128i value = _mm_load_si128 ((__m128i *) aligned); - __m128i adj_value = __m128i_shift_right (value, offset); - - unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); - /* No need to check CFlag since it is always 1. */ - if (length < 16 - offset) - return length; - /* Find where the NULL terminator is. 
*/ - maskz = _mm_cmpeq_epi8 (value, zero); - maskz_bits = _mm_movemask_epi8 (maskz) >> offset; - if (maskz_bits != 0) - return length; - aligned += 16; - } - -start_loop: - while (1) - { - __m128i value = _mm_load_si128 ((__m128i *) aligned); - unsigned int index = _mm_cmpistri (mask, value, 0x12); - unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); - if (cflag) - return (size_t) (aligned + index - s); - aligned += 16; - } -} +#include <string/strspn.c> diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c index 26d6984e9b..fa38dd898d 100644 --- a/sysdeps/x86_64/multiarch/wcscpy-c.c +++ b/sysdeps/x86_64/multiarch/wcscpy-c.c @@ -1,5 +1,5 @@ #if IS_IN (libc) -# define WCSCPY __wcscpy_sse2 +# define WCSCPY __wcscpy_generic #endif #include <wcsmbs/wcscpy.c> diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c index 6a2d1421d9..53c3228dc2 100644 --- a/sysdeps/x86_64/multiarch/wcscpy.c +++ b/sysdeps/x86_64/multiarch/wcscpy.c @@ -26,7 +26,7 @@ # define SYMBOL_NAME wcscpy # include <init-arch.h> -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; static inline void * @@ -37,7 +37,7 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); + return OPTIMIZE (generic); } libc_ifunc_redirected (__redirect_wcscpy, __wcscpy, IFUNC_SELECTOR ()); diff --git a/sysdeps/x86_64/multiarch/wcsnlen-c.c b/sysdeps/x86_64/multiarch/wcsnlen-c.c index e1ec7cfbb5..1c9c04241a 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen-c.c +++ b/sysdeps/x86_64/multiarch/wcsnlen-c.c @@ -1,9 +1,9 @@ #if IS_IN (libc) # include <wchar.h> -# define WCSNLEN __wcsnlen_sse2 +# define WCSNLEN __wcsnlen_generic -extern __typeof (wcsnlen) __wcsnlen_sse2; +extern __typeof (wcsnlen) __wcsnlen_generic; #endif #include "wcsmbs/wcsnlen.c" diff 
--git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c index baa26666a8..05b7a211de 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen.c +++ b/sysdeps/x86_64/multiarch/wcsnlen.c @@ -24,6 +24,7 @@ # undef __wcsnlen # define SYMBOL_NAME wcsnlen +# define GENERIC generic # include "ifunc-wcslen.h" libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());