Message ID | 20220622171200.1738965-2-goldstein.w.n@gmail.com
---|---
State | New
Series | [v8,1/2] x86: Add defines / utilities for making ISA specific x86 builds
On Wed, Jun 22, 2022 at 10:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > 1. Refactor files so that all implementations are in the multiarch > directory. > - Essentially moved the sse2 {raw|w}memchr.S implementation to > multiarch/{raw|w}memchr-sse2.S > > - The non-multiarch {raw|w}memchr.S file now only includes one of > the implementations in the multiarch directory based on the > compiled ISA level (only used for non-multiarch builds; > otherwise we go through the ifunc selector). > > 2. Add ISA level build guards to the different implementations. > - E.g. memchr-avx2.S, which is ISA level 3, will only build if the > compiled ISA level is <= 3. Otherwise there is no reason to include > it, as we will always use one of the ISA level 4 > implementations (memchr-evex{-rtm}.S). > > 3. Add new multiarch/rtld-{raw}memchr.S files that just include the > non-multiarch {raw}memchr.S, which will in turn select the best > implementation based on the compiled ISA level. > > 4. Refactor the ifunc selector and ifunc implementation list to use > the ISA level aware wrapper macros that allow functions below the > compiled ISA level (with a guaranteed replacement) to be skipped. > - Guaranteed replacement essentially means that for any ISA level > build there must be a function that the baseline of the ISA > supports. So for {raw|w}memchr.S, since there is no ISA level 2 > function, the ISA level 2 build still includes the ISA level > 1 (sse2) function. Once we reach the ISA level 3 build, however, > {raw|w}memchr-avx2{-rtm}.S will always be sufficient, so the ISA > level 1 implementation ({raw|w}memchr-sse2.S) will not be built. > > Tested with and without multiarch on x86_64 for ISA levels: > {generic, x86-64-v2, x86-64-v3, x86-64-v4} > --- > sysdeps/x86_64/memchr.S | 355 +---------------- > sysdeps/x86_64/multiarch/ifunc-evex.h | 31 +- > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 72 ++-- > sysdeps/x86_64/multiarch/memchr-avx2.S | 10 +- > sysdeps/x86_64/multiarch/memchr-evex.S | 10 +- > sysdeps/x86_64/multiarch/memchr-sse2.S | 368 +++++++++++++++++- > sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S | 7 +- > sysdeps/x86_64/multiarch/rawmemchr-avx2.S | 7 +- > sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 8 +- > sysdeps/x86_64/multiarch/rawmemchr-evex.S | 7 +- > sysdeps/x86_64/multiarch/rawmemchr-sse2.S | 203 +++++++++- > sysdeps/x86_64/multiarch/rtld-memchr.S | 18 + > sysdeps/x86_64/multiarch/rtld-rawmemchr.S | 18 + > sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S | 7 +- > sysdeps/x86_64/multiarch/wmemchr-avx2.S | 7 +- > sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S | 8 +- > sysdeps/x86_64/multiarch/wmemchr-evex.S | 7 +- > sysdeps/x86_64/multiarch/wmemchr-sse2.S | 27 +- > sysdeps/x86_64/rawmemchr.S | 186 +-------- > sysdeps/x86_64/wmemchr.S | 24 ++ > 20 files changed, 773 insertions(+), 607 deletions(-) > create mode 100644 sysdeps/x86_64/multiarch/rtld-memchr.S > create mode 100644 sysdeps/x86_64/multiarch/rtld-rawmemchr.S > create mode 100644 sysdeps/x86_64/wmemchr.S > > diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S > index a160fd9b00..018bb06f04 100644 > --- a/sysdeps/x86_64/memchr.S > +++ b/sysdeps/x86_64/memchr.S > @@ -15,358 +15,13 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>.
*/ > > -#include <sysdep.h> > +#define MEMCHR memchr > > -#ifdef USE_AS_WMEMCHR > -# define MEMCHR wmemchr > -# define PCMPEQ pcmpeqd > -# define CHAR_PER_VEC 4 > -#else > -# define MEMCHR memchr > -# define PCMPEQ pcmpeqb > -# define CHAR_PER_VEC 16 > -#endif > +#define DEFAULT_IMPL_V1 "multiarch/memchr-sse2.S" > +#define DEFAULT_IMPL_V3 "multiarch/memchr-avx2.S" > +#define DEFAULT_IMPL_V4 "multiarch/memchr-evex.S" > > -/* fast SSE2 version with using pmaxub and 64 byte loop */ > +#include "isa-default-impl.h" > > - .text > -ENTRY(MEMCHR) > - movd %esi, %xmm1 > - mov %edi, %ecx > - > -#ifdef __ILP32__ > - /* Clear the upper 32 bits. */ > - movl %edx, %edx > -#endif > -#ifdef USE_AS_WMEMCHR > - test %RDX_LP, %RDX_LP > - jz L(return_null) > -#else > - punpcklbw %xmm1, %xmm1 > - test %RDX_LP, %RDX_LP > - jz L(return_null) > - punpcklbw %xmm1, %xmm1 > -#endif > - > - and $63, %ecx > - pshufd $0, %xmm1, %xmm1 > - > - cmp $48, %ecx > - ja L(crosscache) > - > - movdqu (%rdi), %xmm0 > - PCMPEQ %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - > - jnz L(matches_1) > - sub $CHAR_PER_VEC, %rdx > - jbe L(return_null) > - add $16, %rdi > - and $15, %ecx > - and $-16, %rdi > -#ifdef USE_AS_WMEMCHR > - shr $2, %ecx > -#endif > - add %rcx, %rdx > - sub $(CHAR_PER_VEC * 4), %rdx > - jbe L(exit_loop) > - jmp L(loop_prolog) > - > - .p2align 4 > -L(crosscache): > - and $15, %ecx > - and $-16, %rdi > - movdqa (%rdi), %xmm0 > - > - PCMPEQ %xmm1, %xmm0 > - /* Check if there is a match. */ > - pmovmskb %xmm0, %eax > - /* Remove the leading bytes. */ > - sar %cl, %eax > - test %eax, %eax > - je L(unaligned_no_match) > - /* Check which byte is a match. */ > - bsf %eax, %eax > -#ifdef USE_AS_WMEMCHR > - mov %eax, %esi > - shr $2, %esi > - sub %rsi, %rdx > -#else > - sub %rax, %rdx > -#endif > - jbe L(return_null) > - add %rdi, %rax > - add %rcx, %rax > - ret > - > - .p2align 4 > -L(unaligned_no_match): > - /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using > - "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void > - possible addition overflow. 
*/ > - neg %rcx > - add $16, %rcx > -#ifdef USE_AS_WMEMCHR > - shr $2, %ecx > -#endif > - sub %rcx, %rdx > - jbe L(return_null) > - add $16, %rdi > - sub $(CHAR_PER_VEC * 4), %rdx > - jbe L(exit_loop) > - > - .p2align 4 > -L(loop_prolog): > - movdqa (%rdi), %xmm0 > - PCMPEQ %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - movdqa 16(%rdi), %xmm2 > - PCMPEQ %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - PCMPEQ %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 48(%rdi), %xmm4 > - PCMPEQ %xmm1, %xmm4 > - add $64, %rdi > - pmovmskb %xmm4, %eax > - test %eax, %eax > - jnz L(matches0) > - > - test $0x3f, %rdi > - jz L(align64_loop) > - > - sub $(CHAR_PER_VEC * 4), %rdx > - jbe L(exit_loop) > - > - movdqa (%rdi), %xmm0 > - PCMPEQ %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - movdqa 16(%rdi), %xmm2 > - PCMPEQ %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - PCMPEQ %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 48(%rdi), %xmm3 > - PCMPEQ %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - > - add $64, %rdi > - test %eax, %eax > - jnz L(matches0) > - > - mov %rdi, %rcx > - and $-64, %rdi > - and $63, %ecx > -#ifdef USE_AS_WMEMCHR > - shr $2, %ecx > -#endif > - add %rcx, %rdx > - > - .p2align 4 > -L(align64_loop): > - sub $(CHAR_PER_VEC * 4), %rdx > - jbe L(exit_loop) > - movdqa (%rdi), %xmm0 > - movdqa 16(%rdi), %xmm2 > - movdqa 32(%rdi), %xmm3 > - movdqa 48(%rdi), %xmm4 > - > - PCMPEQ %xmm1, %xmm0 > - PCMPEQ %xmm1, %xmm2 > - PCMPEQ %xmm1, %xmm3 > - PCMPEQ %xmm1, %xmm4 > - > - pmaxub %xmm0, %xmm3 > - pmaxub %xmm2, %xmm4 > - pmaxub %xmm3, %xmm4 > - pmovmskb %xmm4, %eax > - > - add $64, %rdi > - > - test %eax, %eax > - jz L(align64_loop) > - > - sub $64, %rdi > - > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - PCMPEQ %xmm1, %xmm3 > - > - PCMPEQ 48(%rdi), %xmm1 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - pmovmskb %xmm1, %eax > - bsf %eax, %eax > - lea 48(%rdi, %rax), %rax > - ret > - > - .p2align 4 > -L(exit_loop): > - add $(CHAR_PER_VEC * 2), %edx > - jle L(exit_loop_32) > - > - movdqa (%rdi), %xmm0 > - PCMPEQ %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - movdqa 16(%rdi), %xmm2 > - PCMPEQ %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - PCMPEQ %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32_1) > - sub $CHAR_PER_VEC, %edx > - jle L(return_null) > - > - PCMPEQ 48(%rdi), %xmm1 > - pmovmskb %xmm1, %eax > - test %eax, %eax > - jnz L(matches48_1) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(exit_loop_32): > - add $(CHAR_PER_VEC * 2), %edx > - movdqa (%rdi), %xmm0 > - PCMPEQ %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches_1) > - sub $CHAR_PER_VEC, %edx > - jbe L(return_null) > - > - PCMPEQ 16(%rdi), %xmm1 > - pmovmskb %xmm1, %eax > - test %eax, %eax > - jnz L(matches16_1) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(matches0): > - bsf %eax, %eax > - lea -16(%rax, %rdi), %rax > - ret > - > - .p2align 4 > -L(matches): > - bsf %eax, %eax > - add %rdi, %rax > - ret > - > - .p2align 4 > -L(matches16): > - bsf 
%eax, %eax > - lea 16(%rax, %rdi), %rax > - ret > - > - .p2align 4 > -L(matches32): > - bsf %eax, %eax > - lea 32(%rax, %rdi), %rax > - ret > - > - .p2align 4 > -L(matches_1): > - bsf %eax, %eax > -#ifdef USE_AS_WMEMCHR > - mov %eax, %esi > - shr $2, %esi > - sub %rsi, %rdx > -#else > - sub %rax, %rdx > -#endif > - jbe L(return_null) > - add %rdi, %rax > - ret > - > - .p2align 4 > -L(matches16_1): > - bsf %eax, %eax > -#ifdef USE_AS_WMEMCHR > - mov %eax, %esi > - shr $2, %esi > - sub %rsi, %rdx > -#else > - sub %rax, %rdx > -#endif > - jbe L(return_null) > - lea 16(%rdi, %rax), %rax > - ret > - > - .p2align 4 > -L(matches32_1): > - bsf %eax, %eax > -#ifdef USE_AS_WMEMCHR > - mov %eax, %esi > - shr $2, %esi > - sub %rsi, %rdx > -#else > - sub %rax, %rdx > -#endif > - jbe L(return_null) > - lea 32(%rdi, %rax), %rax > - ret > - > - .p2align 4 > -L(matches48_1): > - bsf %eax, %eax > -#ifdef USE_AS_WMEMCHR > - mov %eax, %esi > - shr $2, %esi > - sub %rsi, %rdx > -#else > - sub %rax, %rdx > -#endif > - jbe L(return_null) > - lea 48(%rdi, %rax), %rax > - ret > - > - .p2align 4 > -L(return_null): > - xor %eax, %eax > - ret > -END(MEMCHR) > - > -#ifndef USE_AS_WMEMCHR > strong_alias (memchr, __memchr) > libc_hidden_builtin_def(memchr) > -#endif > diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h > index b8f7a12ea2..a2f854b98d 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-evex.h > +++ b/sysdeps/x86_64/multiarch/ifunc-evex.h > @@ -19,24 +19,28 @@ > > #include <init-arch.h> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden; > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; Unrelated changes. > +/* TODO: Look into using the ISA build level to remove some/all of the > + feature checks. */ This comment should be removed. 
> static inline void * > IFUNC_SELECTOR (void) > { > - const struct cpu_features* cpu_features = __get_cpu_features (); > + const struct cpu_features *cpu_features = __get_cpu_features (); > > - if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) > - && CPU_FEATURE_USABLE_P (cpu_features, BMI2) > - && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2) > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2) > + && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > + AVX_Fast_Unaligned_Load)) > { > - if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > - && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) > { > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > return OPTIMIZE (evex_rtm); > @@ -44,12 +48,19 @@ IFUNC_SELECTOR (void) > return OPTIMIZE (evex); > } > > + X86_ERROR_IF_REACHABLE_V4 (); > + > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > return OPTIMIZE (avx2_rtm); > > - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) > + if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > + Prefer_No_VZEROUPPER)) > return OPTIMIZE (avx2); > } > > + X86_ERROR_IF_REACHABLE_V3 (); > + > + /* This is unreachable (compile time checked) if ISA level >= 3 > + so no need for a robust fallback here. */ > return OPTIMIZE (sse2); > } > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 883362f63d..bf52cf96d0 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -25,7 +25,8 @@ > > /* Fill ARRAY of MAX elements with IFUNC implementations for function > NAME supported on target machine and return the number of valid > - entries. */ > + entries. Each set of implementations for a given function is sorted in > + descending order by ISA level. */ > > size_t > __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > @@ -53,24 +54,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/memchr.c. */ > IFUNC_IMPL (i, name, memchr, > - IFUNC_IMPL_ADD (array, i, memchr, > - CPU_FEATURE_USABLE (AVX2), > - __memchr_avx2) > - IFUNC_IMPL_ADD (array, i, memchr, > - (CPU_FEATURE_USABLE (AVX2) > - && CPU_FEATURE_USABLE (RTM)), > - __memchr_avx2_rtm) > - IFUNC_IMPL_ADD (array, i, memchr, > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __memchr_evex) > - IFUNC_IMPL_ADD (array, i, memchr, > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __memchr_evex_rtm) > - IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) > + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, > + CPU_FEATURE_USABLE (AVX2), > + __memchr_avx2) > + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (RTM)), > + __memchr_avx2_rtm) > + /* Can be lowered to V1 if a V2 implementation is added. */ > + X86_IFUNC_IMPL_ADD_V2 (array, i, memchr, > + 1, > + __memchr_sse2)) > > /* Support sysdeps/x86_64/multiarch/memcmp.c. */ > IFUNC_IMPL (i, name, memcmp, > @@ -288,24 +292,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/rawmemchr.c. 
*/ > IFUNC_IMPL (i, name, rawmemchr, > - IFUNC_IMPL_ADD (array, i, rawmemchr, > - CPU_FEATURE_USABLE (AVX2), > - __rawmemchr_avx2) > - IFUNC_IMPL_ADD (array, i, rawmemchr, > - (CPU_FEATURE_USABLE (AVX2) > - && CPU_FEATURE_USABLE (RTM)), > - __rawmemchr_avx2_rtm) > - IFUNC_IMPL_ADD (array, i, rawmemchr, > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __rawmemchr_evex) > - IFUNC_IMPL_ADD (array, i, rawmemchr, > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __rawmemchr_evex_rtm) > - IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) > + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, > + CPU_FEATURE_USABLE (AVX2), > + __rawmemchr_avx2) > + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (RTM)), > + __rawmemchr_avx2_rtm) > + /* Can be lowered to V1 if a V2 implementation is added. */ > + X86_IFUNC_IMPL_ADD_V2 (array, i, rawmemchr, > + 1, > + __rawmemchr_sse2)) > > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > IFUNC_IMPL (i, name, strlen, > @@ -748,24 +755,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/wmemchr.c. */ > IFUNC_IMPL (i, name, wmemchr, > - IFUNC_IMPL_ADD (array, i, wmemchr, > - CPU_FEATURE_USABLE (AVX2), > - __wmemchr_avx2) > - IFUNC_IMPL_ADD (array, i, wmemchr, > - (CPU_FEATURE_USABLE (AVX2) > - && CPU_FEATURE_USABLE (RTM)), > - __wmemchr_avx2_rtm) > - IFUNC_IMPL_ADD (array, i, wmemchr, > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __wmemchr_evex) > - IFUNC_IMPL_ADD (array, i, wmemchr, > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __wmemchr_evex_rtm) > - IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) > + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, > + CPU_FEATURE_USABLE (AVX2), > + __wmemchr_avx2) > + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (RTM)), > + __wmemchr_avx2_rtm) > + /* Can be lowered to V1 if a V2 implementation is added. */ > + X86_IFUNC_IMPL_ADD_V2 (array, i, wmemchr, > + 1, > + __wmemchr_sse2)) > > /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ > IFUNC_IMPL (i, name, wmemcmp, > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S > index c5a256eb37..691662f0fb 100644 > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S > @@ -16,7 +16,15 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#if IS_IN (libc) > +#include <isa-level.h> > + > +#if defined IS_MULTIARCH && defined ISA_DEFAULT_IMPL Where is IS_MULTIARCH defined? > +# error "Multiarch build should never default include!" > +#endif > + > +#if (MINIMUM_X86_ISA_LEVEL <= 3 && IS_IN (libc)) \ > + || defined ISA_DEFAULT_IMPL > + > > # include <sysdep.h> > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S > index 0fd11b7632..10ed0434ae 100644 > --- a/sysdeps/x86_64/multiarch/memchr-evex.S > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S > @@ -16,7 +16,15 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. 
*/ > > -#if IS_IN (libc) > +#include <isa-level.h> > + > +#if defined IS_MULTIARCH && defined ISA_DEFAULT_IMPL > +# error "Multiarch build should never default include!" > +#endif > + > +#if (MINIMUM_X86_ISA_LEVEL <= 4 && IS_IN (libc)) \ > + || defined ISA_DEFAULT_IMPL > + > > # include <sysdep.h> > > diff --git a/sysdeps/x86_64/multiarch/memchr-sse2.S b/sysdeps/x86_64/multiarch/memchr-sse2.S > index 2c6fdd41d6..acd5c15e22 100644 > --- a/sysdeps/x86_64/multiarch/memchr-sse2.S > +++ b/sysdeps/x86_64/multiarch/memchr-sse2.S > @@ -16,13 +16,367 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#if IS_IN (libc) > -# define memchr __memchr_sse2 > +#include <isa-level.h> > > -# undef strong_alias > -# define strong_alias(memchr, __memchr) > -# undef libc_hidden_builtin_def > -# define libc_hidden_builtin_def(memchr) > +#if defined IS_MULTIARCH && defined ISA_DEFAULT_IMPL > +# error "Multiarch build should never default include!" > #endif > > -#include "../memchr.S" > +/* __X86_ISA_LEVEL <= 2 because there is no V2 implementation so we > + need this to build for ISA V2 builds. */ > +#if (MINIMUM_X86_ISA_LEVEL <= 2 && IS_IN (libc)) \ > + || defined ISA_DEFAULT_IMPL > + > + > +# include <sysdep.h> > + > +# ifndef MEMCHR > +# define MEMCHR __memchr_sse2 > +# endif > +# ifdef USE_AS_WMEMCHR > +# define PCMPEQ pcmpeqd > +# define CHAR_PER_VEC 4 > +# else > +# define PCMPEQ pcmpeqb > +# define CHAR_PER_VEC 16 > +# endif > + > +/* fast SSE2 version with using pmaxub and 64 byte loop */ > + > + .text > +ENTRY(MEMCHR) > + movd %esi, %xmm1 > + mov %edi, %ecx > + > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > +# ifdef USE_AS_WMEMCHR > + test %RDX_LP, %RDX_LP > + jz L(return_null) > +# else > + punpcklbw %xmm1, %xmm1 > + test %RDX_LP, %RDX_LP > + jz L(return_null) > + punpcklbw %xmm1, %xmm1 > +# endif > + > + and $63, %ecx > + pshufd $0, %xmm1, %xmm1 > + > + cmp $48, %ecx > + ja L(crosscache) > + > + movdqu (%rdi), %xmm0 > + PCMPEQ %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + > + jnz L(matches_1) > + sub $CHAR_PER_VEC, %rdx > + jbe L(return_null) > + add $16, %rdi > + and $15, %ecx > + and $-16, %rdi > +# ifdef USE_AS_WMEMCHR > + shr $2, %ecx > +# endif > + add %rcx, %rdx > + sub $(CHAR_PER_VEC * 4), %rdx > + jbe L(exit_loop) > + jmp L(loop_prolog) > + > + .p2align 4 > +L(crosscache): > + and $15, %ecx > + and $-16, %rdi > + movdqa (%rdi), %xmm0 > + > + PCMPEQ %xmm1, %xmm0 > + /* Check if there is a match. */ > + pmovmskb %xmm0, %eax > + /* Remove the leading bytes. */ > + sar %cl, %eax > + test %eax, %eax > + je L(unaligned_no_match) > + /* Check which byte is a match. */ > + bsf %eax, %eax > +# ifdef USE_AS_WMEMCHR > + mov %eax, %esi > + shr $2, %esi > + sub %rsi, %rdx > +# else > + sub %rax, %rdx > +# endif > + jbe L(return_null) > + add %rdi, %rax > + add %rcx, %rax > + ret > + > + .p2align 4 > +L(unaligned_no_match): > + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using > + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void > + possible addition overflow. 
*/ > + neg %rcx > + add $16, %rcx > +# ifdef USE_AS_WMEMCHR > + shr $2, %ecx > +# endif > + sub %rcx, %rdx > + jbe L(return_null) > + add $16, %rdi > + sub $(CHAR_PER_VEC * 4), %rdx > + jbe L(exit_loop) > + > + .p2align 4 > +L(loop_prolog): > + movdqa (%rdi), %xmm0 > + PCMPEQ %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + movdqa 16(%rdi), %xmm2 > + PCMPEQ %xmm1, %xmm2 > + pmovmskb %xmm2, %eax > + test %eax, %eax > + jnz L(matches16) > + > + movdqa 32(%rdi), %xmm3 > + PCMPEQ %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32) > + > + movdqa 48(%rdi), %xmm4 > + PCMPEQ %xmm1, %xmm4 > + add $64, %rdi > + pmovmskb %xmm4, %eax > + test %eax, %eax > + jnz L(matches0) > + > + test $0x3f, %rdi > + jz L(align64_loop) > + > + sub $(CHAR_PER_VEC * 4), %rdx > + jbe L(exit_loop) > + > + movdqa (%rdi), %xmm0 > + PCMPEQ %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + movdqa 16(%rdi), %xmm2 > + PCMPEQ %xmm1, %xmm2 > + pmovmskb %xmm2, %eax > + test %eax, %eax > + jnz L(matches16) > + > + movdqa 32(%rdi), %xmm3 > + PCMPEQ %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32) > + > + movdqa 48(%rdi), %xmm3 > + PCMPEQ %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + > + add $64, %rdi > + test %eax, %eax > + jnz L(matches0) > + > + mov %rdi, %rcx > + and $-64, %rdi > + and $63, %ecx > +# ifdef USE_AS_WMEMCHR > + shr $2, %ecx > +# endif > + add %rcx, %rdx > + > + .p2align 4 > +L(align64_loop): > + sub $(CHAR_PER_VEC * 4), %rdx > + jbe L(exit_loop) > + movdqa (%rdi), %xmm0 > + movdqa 16(%rdi), %xmm2 > + movdqa 32(%rdi), %xmm3 > + movdqa 48(%rdi), %xmm4 > + > + PCMPEQ %xmm1, %xmm0 > + PCMPEQ %xmm1, %xmm2 > + PCMPEQ %xmm1, %xmm3 > + PCMPEQ %xmm1, %xmm4 > + > + pmaxub %xmm0, %xmm3 > + pmaxub %xmm2, %xmm4 > + pmaxub %xmm3, %xmm4 > + pmovmskb %xmm4, %eax > + > + add $64, %rdi > + > + test %eax, %eax > + jz L(align64_loop) > + > + sub $64, %rdi > + > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + pmovmskb %xmm2, %eax > + test %eax, %eax > + jnz L(matches16) > + > + movdqa 32(%rdi), %xmm3 > + PCMPEQ %xmm1, %xmm3 > + > + PCMPEQ 48(%rdi), %xmm1 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32) > + > + pmovmskb %xmm1, %eax > + bsf %eax, %eax > + lea 48(%rdi, %rax), %rax > + ret > + > + .p2align 4 > +L(exit_loop): > + add $(CHAR_PER_VEC * 2), %edx > + jle L(exit_loop_32) > + > + movdqa (%rdi), %xmm0 > + PCMPEQ %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + movdqa 16(%rdi), %xmm2 > + PCMPEQ %xmm1, %xmm2 > + pmovmskb %xmm2, %eax > + test %eax, %eax > + jnz L(matches16) > + > + movdqa 32(%rdi), %xmm3 > + PCMPEQ %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32_1) > + sub $CHAR_PER_VEC, %edx > + jle L(return_null) > + > + PCMPEQ 48(%rdi), %xmm1 > + pmovmskb %xmm1, %eax > + test %eax, %eax > + jnz L(matches48_1) > + xor %eax, %eax > + ret > + > + .p2align 4 > +L(exit_loop_32): > + add $(CHAR_PER_VEC * 2), %edx > + movdqa (%rdi), %xmm0 > + PCMPEQ %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches_1) > + sub $CHAR_PER_VEC, %edx > + jbe L(return_null) > + > + PCMPEQ 16(%rdi), %xmm1 > + pmovmskb %xmm1, %eax > + test %eax, %eax > + jnz L(matches16_1) > + xor %eax, %eax > + ret > + > + .p2align 4 > +L(matches0): > + bsf %eax, %eax > + lea -16(%rax, %rdi), %rax > + ret > + > + .p2align 4 > +L(matches): > + bsf %eax, %eax > + add %rdi, %rax > + ret > + > + .p2align 4 > +L(matches16): > + bsf 
%eax, %eax > + lea 16(%rax, %rdi), %rax > + ret > + > + .p2align 4 > +L(matches32): > + bsf %eax, %eax > + lea 32(%rax, %rdi), %rax > + ret > + > + .p2align 4 > +L(matches_1): > + bsf %eax, %eax > +# ifdef USE_AS_WMEMCHR > + mov %eax, %esi > + shr $2, %esi > + sub %rsi, %rdx > +# else > + sub %rax, %rdx > +# endif > + jbe L(return_null) > + add %rdi, %rax > + ret > + > + .p2align 4 > +L(matches16_1): > + bsf %eax, %eax > +# ifdef USE_AS_WMEMCHR > + mov %eax, %esi > + shr $2, %esi > + sub %rsi, %rdx > +# else > + sub %rax, %rdx > +# endif > + jbe L(return_null) > + lea 16(%rdi, %rax), %rax > + ret > + > + .p2align 4 > +L(matches32_1): > + bsf %eax, %eax > +# ifdef USE_AS_WMEMCHR > + mov %eax, %esi > + shr $2, %esi > + sub %rsi, %rdx > +# else > + sub %rax, %rdx > +# endif > + jbe L(return_null) > + lea 32(%rdi, %rax), %rax > + ret > + > + .p2align 4 > +L(matches48_1): > + bsf %eax, %eax > +# ifdef USE_AS_WMEMCHR > + mov %eax, %esi > + shr $2, %esi > + sub %rsi, %rdx > +# else > + sub %rax, %rdx > +# endif > + jbe L(return_null) > + lea 48(%rdi, %rax), %rax > + ret > + > + .p2align 4 > +L(return_null): > + xor %eax, %eax > + ret > +END(MEMCHR) > +#endif > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > index acc5f6e2fb..5c1dcd3ca7 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > @@ -1,4 +1,7 @@ > -#define MEMCHR __rawmemchr_avx2_rtm > -#define USE_AS_RAWMEMCHR 1 > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_avx2_rtm > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > > #include "memchr-avx2-rtm.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > index 128f9ea637..d6bff28757 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > @@ -1,4 +1,7 @@ > -#define MEMCHR __rawmemchr_avx2 > -#define USE_AS_RAWMEMCHR 1 > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_avx2 > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > > #include "memchr-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > index deda1ca395..8ff7f27c9c 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > @@ -1,3 +1,7 @@ > -#define MEMCHR __rawmemchr_evex_rtm > -#define USE_AS_RAWMEMCHR 1 > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex_rtm > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > + > #include "memchr-evex-rtm.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > index ec942b77ba..dc1c450699 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > @@ -1,4 +1,7 @@ > -#define MEMCHR __rawmemchr_evex > -#define USE_AS_RAWMEMCHR 1 > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > > #include "memchr-evex.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > index 3841c14c34..73f4fa9589 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > @@ -16,14 +16,199 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. 
*/ > > -/* Define multiple versions only for the definition in libc. */ > -#if IS_IN (libc) > -# define __rawmemchr __rawmemchr_sse2 > - > -# undef weak_alias > -# define weak_alias(__rawmemchr, rawmemchr) > -# undef libc_hidden_def > -# define libc_hidden_def(__rawmemchr) > +#include <isa-level.h> > + > +#if defined IS_MULTIARCH && defined ISA_DEFAULT_IMPL > +# error "Multiarch build should never default include!" > #endif > > -#include "../rawmemchr.S" > +/* __X86_ISA_LEVEL <= 2 because there is no V2 implementation so we > + need this to build for ISA V2 builds. */ > +#if (MINIMUM_X86_ISA_LEVEL <= 2 && IS_IN (libc)) \ > + || defined ISA_DEFAULT_IMPL > + > + > +# include <sysdep.h> > + > +# ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_sse2 > +# endif > + > + .text > +ENTRY (RAWMEMCHR) > + movd %rsi, %xmm1 > + mov %rdi, %rcx > + > + punpcklbw %xmm1, %xmm1 > + punpcklbw %xmm1, %xmm1 > + > + and $63, %rcx > + pshufd $0, %xmm1, %xmm1 > + > + cmp $48, %rcx > + ja L(crosscache) > + > + movdqu (%rdi), %xmm0 > + pcmpeqb %xmm1, %xmm0 > +/* Check if there is a match. */ > + pmovmskb %xmm0, %eax > + test %eax, %eax > + > + jnz L(matches) > + add $16, %rdi > + and $-16, %rdi > + jmp L(loop_prolog) > + > + .p2align 4 > +L(crosscache): > + and $15, %rcx > + and $-16, %rdi > + movdqa (%rdi), %xmm0 > + > + pcmpeqb %xmm1, %xmm0 > +/* Check if there is a match. */ > + pmovmskb %xmm0, %eax > +/* Remove the leading bytes. */ > + sar %cl, %eax > + test %eax, %eax > + je L(unaligned_no_match) > +/* Check which byte is a match. */ > + bsf %eax, %eax > + > + add %rdi, %rax > + add %rcx, %rax > + ret > + > + .p2align 4 > +L(unaligned_no_match): > + add $16, %rdi > + > + .p2align 4 > +L(loop_prolog): > + movdqa (%rdi), %xmm0 > + pcmpeqb %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + movdqa 16(%rdi), %xmm2 > + pcmpeqb %xmm1, %xmm2 > + pmovmskb %xmm2, %eax > + test %eax, %eax > + jnz L(matches16) > + > + movdqa 32(%rdi), %xmm3 > + pcmpeqb %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32) > + > + movdqa 48(%rdi), %xmm4 > + pcmpeqb %xmm1, %xmm4 > + add $64, %rdi > + pmovmskb %xmm4, %eax > + test %eax, %eax > + jnz L(matches0) > + > + test $0x3f, %rdi > + jz L(align64_loop) > + > + movdqa (%rdi), %xmm0 > + pcmpeqb %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + movdqa 16(%rdi), %xmm2 > + pcmpeqb %xmm1, %xmm2 > + pmovmskb %xmm2, %eax > + test %eax, %eax > + jnz L(matches16) > + > + movdqa 32(%rdi), %xmm3 > + pcmpeqb %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32) > + > + movdqa 48(%rdi), %xmm3 > + pcmpeqb %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + > + add $64, %rdi > + test %eax, %eax > + jnz L(matches0) > + > + and $-64, %rdi > + > + .p2align 4 > +L(align64_loop): > + movdqa (%rdi), %xmm0 > + movdqa 16(%rdi), %xmm2 > + movdqa 32(%rdi), %xmm3 > + movdqa 48(%rdi), %xmm4 > + > + pcmpeqb %xmm1, %xmm0 > + pcmpeqb %xmm1, %xmm2 > + pcmpeqb %xmm1, %xmm3 > + pcmpeqb %xmm1, %xmm4 > + > + pmaxub %xmm0, %xmm3 > + pmaxub %xmm2, %xmm4 > + pmaxub %xmm3, %xmm4 > + pmovmskb %xmm4, %eax > + > + add $64, %rdi > + > + test %eax, %eax > + jz L(align64_loop) > + > + sub $64, %rdi > + > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + pmovmskb %xmm2, %eax > + test %eax, %eax > + jnz L(matches16) > + > + movdqa 32(%rdi), %xmm3 > + pcmpeqb %xmm1, %xmm3 > + > + pcmpeqb 48(%rdi), %xmm1 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32) > + > + pmovmskb %xmm1, %eax > + bsf 
%eax, %eax > + lea 48(%rdi, %rax), %rax > + ret > + > + .p2align 4 > +L(matches0): > + bsf %eax, %eax > + lea -16(%rax, %rdi), %rax > + ret > + > + .p2align 4 > +L(matches): > + bsf %eax, %eax > + add %rdi, %rax > + ret > + > + .p2align 4 > +L(matches16): > + bsf %eax, %eax > + lea 16(%rax, %rdi), %rax > + ret > + > + .p2align 4 > +L(matches32): > + bsf %eax, %eax > + lea 32(%rax, %rdi), %rax > + ret > + > +END (RAWMEMCHR) > +#endif > diff --git a/sysdeps/x86_64/multiarch/rtld-memchr.S b/sysdeps/x86_64/multiarch/rtld-memchr.S > new file mode 100644 > index 0000000000..a14b192bed > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/rtld-memchr.S > @@ -0,0 +1,18 @@ > +/* Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include "../memchr.S" > diff --git a/sysdeps/x86_64/multiarch/rtld-rawmemchr.S b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S > new file mode 100644 > index 0000000000..5d4110a052 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S > @@ -0,0 +1,18 @@ > +/* Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#include "../rawmemchr.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > index 58ed21db01..2a1cff5b05 100644 > --- a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > @@ -1,4 +1,7 @@ > -#define MEMCHR __wmemchr_avx2_rtm > -#define USE_AS_WMEMCHR 1 > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_avx2_rtm > +#endif > +#define USE_AS_WMEMCHR 1 > +#define MEMCHR WMEMCHR > > #include "memchr-avx2-rtm.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S > index 282854f1a1..2bf93fd84b 100644 > --- a/sysdeps/x86_64/multiarch/wmemchr-avx2.S > +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S > @@ -1,4 +1,7 @@ > -#define MEMCHR __wmemchr_avx2 > -#define USE_AS_WMEMCHR 1 > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_avx2 > +#endif > +#define USE_AS_WMEMCHR 1 > +#define MEMCHR WMEMCHR > > #include "memchr-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > index a346cd35a1..c67309e8a1 100644 > --- a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > @@ -1,3 +1,7 @@ > -#define MEMCHR __wmemchr_evex_rtm > -#define USE_AS_WMEMCHR 1 > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_evex_rtm > +#endif > +#define USE_AS_WMEMCHR 1 > +#define MEMCHR WMEMCHR > + > #include "memchr-evex-rtm.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S > index 06cd0f9f5a..5512d5cdc3 100644 > --- a/sysdeps/x86_64/multiarch/wmemchr-evex.S > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S > @@ -1,4 +1,7 @@ > -#define MEMCHR __wmemchr_evex > -#define USE_AS_WMEMCHR 1 > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_evex > +#endif > +#define USE_AS_WMEMCHR 1 > +#define MEMCHR WMEMCHR > > #include "memchr-evex.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-sse2.S b/sysdeps/x86_64/multiarch/wmemchr-sse2.S > index 70a965d552..3081fb6821 100644 > --- a/sysdeps/x86_64/multiarch/wmemchr-sse2.S > +++ b/sysdeps/x86_64/multiarch/wmemchr-sse2.S > @@ -1,4 +1,25 @@ > -#define USE_AS_WMEMCHR 1 > -#define wmemchr __wmemchr_sse2 > +/* wmemchr optimized with SSE2 > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > > -#include "../memchr.S" > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_sse2 > +#endif > +#define USE_AS_WMEMCHR 1 > +#define MEMCHR WMEMCHR > + > +#include "memchr-sse2.S" > diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S > index 4c1a3383b9..e401a2ac53 100644 > --- a/sysdeps/x86_64/rawmemchr.S > +++ b/sysdeps/x86_64/rawmemchr.S > @@ -17,185 +17,13 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. 
*/ > > -#include <sysdep.h> > +#define RAWMEMCHR rawmemchr > > - .text > -ENTRY (__rawmemchr) > - movd %rsi, %xmm1 > - mov %rdi, %rcx > +#define DEFAULT_IMPL_V1 "multiarch/rawmemchr-sse2.S" > +#define DEFAULT_IMPL_V3 "multiarch/rawmemchr-avx2.S" > +#define DEFAULT_IMPL_V4 "multiarch/rawmemchr-evex.S" > > - punpcklbw %xmm1, %xmm1 > - punpcklbw %xmm1, %xmm1 > +#include "isa-default-impl.h" > > - and $63, %rcx > - pshufd $0, %xmm1, %xmm1 > - > - cmp $48, %rcx > - ja L(crosscache) > - > - movdqu (%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > -/* Check if there is a match. */ > - pmovmskb %xmm0, %eax > - test %eax, %eax > - > - jnz L(matches) > - add $16, %rdi > - and $-16, %rdi > - jmp L(loop_prolog) > - > - .p2align 4 > -L(crosscache): > - and $15, %rcx > - and $-16, %rdi > - movdqa (%rdi), %xmm0 > - > - pcmpeqb %xmm1, %xmm0 > -/* Check if there is a match. */ > - pmovmskb %xmm0, %eax > -/* Remove the leading bytes. */ > - sar %cl, %eax > - test %eax, %eax > - je L(unaligned_no_match) > -/* Check which byte is a match. */ > - bsf %eax, %eax > - > - add %rdi, %rax > - add %rcx, %rax > - ret > - > - .p2align 4 > -L(unaligned_no_match): > - add $16, %rdi > - > - .p2align 4 > -L(loop_prolog): > - movdqa (%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - movdqa 16(%rdi), %xmm2 > - pcmpeqb %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 48(%rdi), %xmm4 > - pcmpeqb %xmm1, %xmm4 > - add $64, %rdi > - pmovmskb %xmm4, %eax > - test %eax, %eax > - jnz L(matches0) > - > - test $0x3f, %rdi > - jz L(align64_loop) > - > - movdqa (%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - movdqa 16(%rdi), %xmm2 > - pcmpeqb %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 48(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - > - add $64, %rdi > - test %eax, %eax > - jnz L(matches0) > - > - and $-64, %rdi > - > - .p2align 4 > -L(align64_loop): > - movdqa (%rdi), %xmm0 > - movdqa 16(%rdi), %xmm2 > - movdqa 32(%rdi), %xmm3 > - movdqa 48(%rdi), %xmm4 > - > - pcmpeqb %xmm1, %xmm0 > - pcmpeqb %xmm1, %xmm2 > - pcmpeqb %xmm1, %xmm3 > - pcmpeqb %xmm1, %xmm4 > - > - pmaxub %xmm0, %xmm3 > - pmaxub %xmm2, %xmm4 > - pmaxub %xmm3, %xmm4 > - pmovmskb %xmm4, %eax > - > - add $64, %rdi > - > - test %eax, %eax > - jz L(align64_loop) > - > - sub $64, %rdi > - > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - > - pcmpeqb 48(%rdi), %xmm1 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - pmovmskb %xmm1, %eax > - bsf %eax, %eax > - lea 48(%rdi, %rax), %rax > - ret > - > - .p2align 4 > -L(matches0): > - bsf %eax, %eax > - lea -16(%rax, %rdi), %rax > - ret > - > - .p2align 4 > -L(matches): > - bsf %eax, %eax > - add %rdi, %rax > - ret > - > - .p2align 4 > -L(matches16): > - bsf %eax, %eax > - lea 16(%rax, %rdi), %rax > - ret > - > - .p2align 4 > -L(matches32): > - bsf %eax, %eax > - lea 32(%rax, %rdi), %rax > - ret > - > -END (__rawmemchr) > - > -weak_alias (__rawmemchr, rawmemchr) > -libc_hidden_builtin_def (__rawmemchr) 
> +strong_alias (rawmemchr, __rawmemchr) > +libc_hidden_builtin_def (rawmemchr) > diff --git a/sysdeps/x86_64/wmemchr.S b/sysdeps/x86_64/wmemchr.S > new file mode 100644 > index 0000000000..dd0490f86b > --- /dev/null > +++ b/sysdeps/x86_64/wmemchr.S > @@ -0,0 +1,24 @@ > +/* Copyright (C) 2011-2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define WMEMCHR wmemchr > + > +#define DEFAULT_IMPL_V1 "multiarch/wmemchr-sse2.S" > +#define DEFAULT_IMPL_V3 "multiarch/wmemchr-avx2.S" > +#define DEFAULT_IMPL_V4 "multiarch/wmemchr-evex.S" > + > +#include "isa-default-impl.h" > -- > 2.34.1 > -- H.J.
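For readers who do not have patch 1/2 of this series at hand: the DEFAULT_IMPL_V{1,3,4} defines above feed into isa-default-impl.h, which that patch introduces and which is not shown here. The following is a minimal sketch of the selection that header plausibly performs; the ISA_DEFAULT_IMPL name is taken from the guards in this patch, while the exact fallback logic below is an assumption:

/* Sketch of isa-default-impl.h: include the best implementation that
   the compiled ISA level guarantees.  DEFAULT_IMPL_V2 is optional;
   when a function has no V2 file (as for {raw|w}memchr), a v2 build
   falls back to the V1 file, matching the "guaranteed replacement"
   rule in point 4 of the commit message.  */
#if MINIMUM_X86_ISA_LEVEL == 1
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V1
#elif MINIMUM_X86_ISA_LEVEL == 2
# ifdef DEFAULT_IMPL_V2
#  define ISA_DEFAULT_IMPL DEFAULT_IMPL_V2
# else
#  define ISA_DEFAULT_IMPL DEFAULT_IMPL_V1
# endif
#elif MINIMUM_X86_ISA_LEVEL == 3
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V3
#elif MINIMUM_X86_ISA_LEVEL == 4
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V4
#else
# error "Unsupported ISA level!"
#endif

#include ISA_DEFAULT_IMPL

Defining ISA_DEFAULT_IMPL is also what the per-implementation guards key on: the "#if defined IS_MULTIARCH && defined ISA_DEFAULT_IMPL" errors in the patch enforce that a multiarch build never reaches an implementation through this default-include path.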
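Similarly, X86_ISA_CPU_FEATURE_USABLE_P, X86_ISA_CPU_FEATURES_ARCH_P, and the X86_IFUNC_IMPL_ADD_V{2,3,4} wrappers used in the selector and the implementation list come from patch 1/2. A rough sketch of the idea behind the feature-check wrapper, where X86_ISA_LEVEL_OF is a hypothetical helper standing in for whatever feature-to-ISA-level mapping that patch actually uses:

/* Assumed shape of the wrapper: the check becomes a compile-time
   constant true when the minimum build ISA level already implies the
   feature (X86_ISA_LEVEL_OF is hypothetical), so the compiler can
   fold the runtime test away and eliminate the lower-level fallback
   paths entirely.  */
#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)		\
  (X86_ISA_LEVEL_OF (name) <= MINIMUM_X86_ISA_LEVEL	\
   || CPU_FEATURE_USABLE_P (ptr, name))

Once the lower-level paths are provably dead, X86_ERROR_IF_REACHABLE_V3 () and X86_ERROR_IF_REACHABLE_V4 () can expand to a compile-time failure when any code past them survives, which is what the "This is unreachable (compile time checked)" comment in the selector relies on.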
On Wed, Jun 22, 2022 at 10:50 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Wed, Jun 22, 2022 at 10:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > 1. Refactor files so that all implementations are in the multiarch > > directory. > > - Essentially moved the sse2 {raw|w}memchr.S implementation to > > multiarch/{raw|w}memchr-sse2.S > > > > - The non-multiarch {raw|w}memchr.S file now only includes one of > > the implementations in the multiarch directory based on the > > compiled ISA level (only used for non-multiarch builds; > > otherwise we go through the ifunc selector). > > > > 2. Add ISA level build guards to the different implementations. > > - E.g. memchr-avx2.S, which is ISA level 3, will only build if the > > compiled ISA level is <= 3. Otherwise there is no reason to include > > it, as we will always use one of the ISA level 4 > > implementations (memchr-evex{-rtm}.S). > > > > 3. Add new multiarch/rtld-{raw}memchr.S files that just include the > > non-multiarch {raw}memchr.S, which will in turn select the best > > implementation based on the compiled ISA level. > > > > 4. Refactor the ifunc selector and ifunc implementation list to use > > the ISA level aware wrapper macros that allow functions below the > > compiled ISA level (with a guaranteed replacement) to be skipped. > > - Guaranteed replacement essentially means that for any ISA level > > build there must be a function that the baseline of the ISA > > supports. So for {raw|w}memchr.S, since there is no ISA level 2 > > function, the ISA level 2 build still includes the ISA level > > 1 (sse2) function. Once we reach the ISA level 3 build, however, > > {raw|w}memchr-avx2{-rtm}.S will always be sufficient, so the ISA > > level 1 implementation ({raw|w}memchr-sse2.S) will not be built. > > > > Tested with and without multiarch on x86_64 for ISA levels: > > {generic, x86-64-v2, x86-64-v3, x86-64-v4} > > --- > > sysdeps/x86_64/memchr.S | 355 +---------------- > > sysdeps/x86_64/multiarch/ifunc-evex.h | 31 +- > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 72 ++-- > > sysdeps/x86_64/multiarch/memchr-avx2.S | 10 +- > > sysdeps/x86_64/multiarch/memchr-evex.S | 10 +- > > sysdeps/x86_64/multiarch/memchr-sse2.S | 368 +++++++++++++++++- > > sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S | 7 +- > > sysdeps/x86_64/multiarch/rawmemchr-avx2.S | 7 +- > > sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 8 +- > > sysdeps/x86_64/multiarch/rawmemchr-evex.S | 7 +- > > sysdeps/x86_64/multiarch/rawmemchr-sse2.S | 203 +++++++++- > > sysdeps/x86_64/multiarch/rtld-memchr.S | 18 + > > sysdeps/x86_64/multiarch/rtld-rawmemchr.S | 18 + > > sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S | 7 +- > > sysdeps/x86_64/multiarch/wmemchr-avx2.S | 7 +- > > sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S | 8 +- > > sysdeps/x86_64/multiarch/wmemchr-evex.S | 7 +- > > sysdeps/x86_64/multiarch/wmemchr-sse2.S | 27 +- > > sysdeps/x86_64/rawmemchr.S | 186 +-------- > > sysdeps/x86_64/wmemchr.S | 24 ++ > > 20 files changed, 773 insertions(+), 607 deletions(-) > > create mode 100644 sysdeps/x86_64/multiarch/rtld-memchr.S > > create mode 100644 sysdeps/x86_64/multiarch/rtld-rawmemchr.S > > create mode 100644 sysdeps/x86_64/wmemchr.S > > > > diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S > > index a160fd9b00..018bb06f04 100644 > > --- a/sysdeps/x86_64/memchr.S > > +++ b/sysdeps/x86_64/memchr.S > > @@ -15,358 +15,13 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>.
*/ > > > > -#include <sysdep.h> > > +#define MEMCHR memchr > > > > -#ifdef USE_AS_WMEMCHR > > -# define MEMCHR wmemchr > > -# define PCMPEQ pcmpeqd > > -# define CHAR_PER_VEC 4 > > -#else > > -# define MEMCHR memchr > > -# define PCMPEQ pcmpeqb > > -# define CHAR_PER_VEC 16 > > -#endif > > +#define DEFAULT_IMPL_V1 "multiarch/memchr-sse2.S" > > +#define DEFAULT_IMPL_V3 "multiarch/memchr-avx2.S" > > +#define DEFAULT_IMPL_V4 "multiarch/memchr-evex.S" > > > > -/* fast SSE2 version with using pmaxub and 64 byte loop */ > > +#include "isa-default-impl.h" > > > > - .text > > -ENTRY(MEMCHR) > > - movd %esi, %xmm1 > > - mov %edi, %ecx > > - > > -#ifdef __ILP32__ > > - /* Clear the upper 32 bits. */ > > - movl %edx, %edx > > -#endif > > -#ifdef USE_AS_WMEMCHR > > - test %RDX_LP, %RDX_LP > > - jz L(return_null) > > -#else > > - punpcklbw %xmm1, %xmm1 > > - test %RDX_LP, %RDX_LP > > - jz L(return_null) > > - punpcklbw %xmm1, %xmm1 > > -#endif > > - > > - and $63, %ecx > > - pshufd $0, %xmm1, %xmm1 > > - > > - cmp $48, %ecx > > - ja L(crosscache) > > - > > - movdqu (%rdi), %xmm0 > > - PCMPEQ %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - > > - jnz L(matches_1) > > - sub $CHAR_PER_VEC, %rdx > > - jbe L(return_null) > > - add $16, %rdi > > - and $15, %ecx > > - and $-16, %rdi > > -#ifdef USE_AS_WMEMCHR > > - shr $2, %ecx > > -#endif > > - add %rcx, %rdx > > - sub $(CHAR_PER_VEC * 4), %rdx > > - jbe L(exit_loop) > > - jmp L(loop_prolog) > > - > > - .p2align 4 > > -L(crosscache): > > - and $15, %ecx > > - and $-16, %rdi > > - movdqa (%rdi), %xmm0 > > - > > - PCMPEQ %xmm1, %xmm0 > > - /* Check if there is a match. */ > > - pmovmskb %xmm0, %eax > > - /* Remove the leading bytes. */ > > - sar %cl, %eax > > - test %eax, %eax > > - je L(unaligned_no_match) > > - /* Check which byte is a match. */ > > - bsf %eax, %eax > > -#ifdef USE_AS_WMEMCHR > > - mov %eax, %esi > > - shr $2, %esi > > - sub %rsi, %rdx > > -#else > > - sub %rax, %rdx > > -#endif > > - jbe L(return_null) > > - add %rdi, %rax > > - add %rcx, %rax > > - ret > > - > > - .p2align 4 > > -L(unaligned_no_match): > > - /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using > > - "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void > > - possible addition overflow. 
*/ > > - neg %rcx > > - add $16, %rcx > > -#ifdef USE_AS_WMEMCHR > > - shr $2, %ecx > > -#endif > > - sub %rcx, %rdx > > - jbe L(return_null) > > - add $16, %rdi > > - sub $(CHAR_PER_VEC * 4), %rdx > > - jbe L(exit_loop) > > - > > - .p2align 4 > > -L(loop_prolog): > > - movdqa (%rdi), %xmm0 > > - PCMPEQ %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - movdqa 16(%rdi), %xmm2 > > - PCMPEQ %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - PCMPEQ %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 48(%rdi), %xmm4 > > - PCMPEQ %xmm1, %xmm4 > > - add $64, %rdi > > - pmovmskb %xmm4, %eax > > - test %eax, %eax > > - jnz L(matches0) > > - > > - test $0x3f, %rdi > > - jz L(align64_loop) > > - > > - sub $(CHAR_PER_VEC * 4), %rdx > > - jbe L(exit_loop) > > - > > - movdqa (%rdi), %xmm0 > > - PCMPEQ %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - movdqa 16(%rdi), %xmm2 > > - PCMPEQ %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - PCMPEQ %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 48(%rdi), %xmm3 > > - PCMPEQ %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - > > - add $64, %rdi > > - test %eax, %eax > > - jnz L(matches0) > > - > > - mov %rdi, %rcx > > - and $-64, %rdi > > - and $63, %ecx > > -#ifdef USE_AS_WMEMCHR > > - shr $2, %ecx > > -#endif > > - add %rcx, %rdx > > - > > - .p2align 4 > > -L(align64_loop): > > - sub $(CHAR_PER_VEC * 4), %rdx > > - jbe L(exit_loop) > > - movdqa (%rdi), %xmm0 > > - movdqa 16(%rdi), %xmm2 > > - movdqa 32(%rdi), %xmm3 > > - movdqa 48(%rdi), %xmm4 > > - > > - PCMPEQ %xmm1, %xmm0 > > - PCMPEQ %xmm1, %xmm2 > > - PCMPEQ %xmm1, %xmm3 > > - PCMPEQ %xmm1, %xmm4 > > - > > - pmaxub %xmm0, %xmm3 > > - pmaxub %xmm2, %xmm4 > > - pmaxub %xmm3, %xmm4 > > - pmovmskb %xmm4, %eax > > - > > - add $64, %rdi > > - > > - test %eax, %eax > > - jz L(align64_loop) > > - > > - sub $64, %rdi > > - > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - PCMPEQ %xmm1, %xmm3 > > - > > - PCMPEQ 48(%rdi), %xmm1 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - pmovmskb %xmm1, %eax > > - bsf %eax, %eax > > - lea 48(%rdi, %rax), %rax > > - ret > > - > > - .p2align 4 > > -L(exit_loop): > > - add $(CHAR_PER_VEC * 2), %edx > > - jle L(exit_loop_32) > > - > > - movdqa (%rdi), %xmm0 > > - PCMPEQ %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - movdqa 16(%rdi), %xmm2 > > - PCMPEQ %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - PCMPEQ %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32_1) > > - sub $CHAR_PER_VEC, %edx > > - jle L(return_null) > > - > > - PCMPEQ 48(%rdi), %xmm1 > > - pmovmskb %xmm1, %eax > > - test %eax, %eax > > - jnz L(matches48_1) > > - xor %eax, %eax > > - ret > > - > > - .p2align 4 > > -L(exit_loop_32): > > - add $(CHAR_PER_VEC * 2), %edx > > - movdqa (%rdi), %xmm0 > > - PCMPEQ %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches_1) > > - sub $CHAR_PER_VEC, %edx > > - jbe L(return_null) > > - > > - 
PCMPEQ 16(%rdi), %xmm1 > > - pmovmskb %xmm1, %eax > > - test %eax, %eax > > - jnz L(matches16_1) > > - xor %eax, %eax > > - ret > > - > > - .p2align 4 > > -L(matches0): > > - bsf %eax, %eax > > - lea -16(%rax, %rdi), %rax > > - ret > > - > > - .p2align 4 > > -L(matches): > > - bsf %eax, %eax > > - add %rdi, %rax > > - ret > > - > > - .p2align 4 > > -L(matches16): > > - bsf %eax, %eax > > - lea 16(%rax, %rdi), %rax > > - ret > > - > > - .p2align 4 > > -L(matches32): > > - bsf %eax, %eax > > - lea 32(%rax, %rdi), %rax > > - ret > > - > > - .p2align 4 > > -L(matches_1): > > - bsf %eax, %eax > > -#ifdef USE_AS_WMEMCHR > > - mov %eax, %esi > > - shr $2, %esi > > - sub %rsi, %rdx > > -#else > > - sub %rax, %rdx > > -#endif > > - jbe L(return_null) > > - add %rdi, %rax > > - ret > > - > > - .p2align 4 > > -L(matches16_1): > > - bsf %eax, %eax > > -#ifdef USE_AS_WMEMCHR > > - mov %eax, %esi > > - shr $2, %esi > > - sub %rsi, %rdx > > -#else > > - sub %rax, %rdx > > -#endif > > - jbe L(return_null) > > - lea 16(%rdi, %rax), %rax > > - ret > > - > > - .p2align 4 > > -L(matches32_1): > > - bsf %eax, %eax > > -#ifdef USE_AS_WMEMCHR > > - mov %eax, %esi > > - shr $2, %esi > > - sub %rsi, %rdx > > -#else > > - sub %rax, %rdx > > -#endif > > - jbe L(return_null) > > - lea 32(%rdi, %rax), %rax > > - ret > > - > > - .p2align 4 > > -L(matches48_1): > > - bsf %eax, %eax > > -#ifdef USE_AS_WMEMCHR > > - mov %eax, %esi > > - shr $2, %esi > > - sub %rsi, %rdx > > -#else > > - sub %rax, %rdx > > -#endif > > - jbe L(return_null) > > - lea 48(%rdi, %rax), %rax > > - ret > > - > > - .p2align 4 > > -L(return_null): > > - xor %eax, %eax > > - ret > > -END(MEMCHR) > > - > > -#ifndef USE_AS_WMEMCHR > > strong_alias (memchr, __memchr) > > libc_hidden_builtin_def(memchr) > > -#endif > > diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h > > index b8f7a12ea2..a2f854b98d 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-evex.h > > +++ b/sysdeps/x86_64/multiarch/ifunc-evex.h > > @@ -19,24 +19,28 @@ > > > > #include <init-arch.h> > > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden; > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > > Unrelated changes. They are now sorted / separated by ISA level. Makes it clearer. > > > +/* TODO: Look into using the ISA build level to remove some/all of the > > + feature checks. */ > > This comment should be removed.
> > > static inline void * > > IFUNC_SELECTOR (void) > > { > > - const struct cpu_features* cpu_features = __get_cpu_features (); > > + const struct cpu_features *cpu_features = __get_cpu_features (); > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) > > - && CPU_FEATURE_USABLE_P (cpu_features, BMI2) > > - && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) > > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2) > > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2) > > + && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > + AVX_Fast_Unaligned_Load)) > > { > > - if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > > - && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) > > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) > > { > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > return OPTIMIZE (evex_rtm); > > @@ -44,12 +48,19 @@ IFUNC_SELECTOR (void) > > return OPTIMIZE (evex); > > } > > > > + X86_ERROR_IF_REACHABLE_V4 (); > > + > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > return OPTIMIZE (avx2_rtm); > > > > - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) > > + if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > + Prefer_No_VZEROUPPER)) > > return OPTIMIZE (avx2); > > } > > > > + X86_ERROR_IF_REACHABLE_V3 (); > > + > > + /* This is unreachable (compile time checked) if ISA level >= 3 > > + so no need for a robust fallback here. */ > > return OPTIMIZE (sse2); > > } > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index 883362f63d..bf52cf96d0 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -25,7 +25,8 @@ > > > > /* Fill ARRAY of MAX elements with IFUNC implementations for function > > NAME supported on target machine and return the number of valid > > - entries. */ > > + entries. Each set of implementations for a given function is sorted in > > + descending order by ISA level. */ > > > > size_t > > __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > @@ -53,24 +54,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > /* Support sysdeps/x86_64/multiarch/memchr.c. */ > > IFUNC_IMPL (i, name, memchr, > > - IFUNC_IMPL_ADD (array, i, memchr, > > - CPU_FEATURE_USABLE (AVX2), > > - __memchr_avx2) > > - IFUNC_IMPL_ADD (array, i, memchr, > > - (CPU_FEATURE_USABLE (AVX2) > > - && CPU_FEATURE_USABLE (RTM)), > > - __memchr_avx2_rtm) > > - IFUNC_IMPL_ADD (array, i, memchr, > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __memchr_evex) > > - IFUNC_IMPL_ADD (array, i, memchr, > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __memchr_evex_rtm) > > - IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) > > + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, > > + CPU_FEATURE_USABLE (AVX2), > > + __memchr_avx2) > > + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (RTM)), > > + __memchr_avx2_rtm) > > + /* Can be lowered to V1 if a V2 implementation is added. */ > > + X86_IFUNC_IMPL_ADD_V2 (array, i, memchr, > > + 1, > > + __memchr_sse2)) > > > > /* Support sysdeps/x86_64/multiarch/memcmp.c. 
*/ > > IFUNC_IMPL (i, name, memcmp, > > @@ -288,24 +292,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ > > IFUNC_IMPL (i, name, rawmemchr, > > - IFUNC_IMPL_ADD (array, i, rawmemchr, > > - CPU_FEATURE_USABLE (AVX2), > > - __rawmemchr_avx2) > > - IFUNC_IMPL_ADD (array, i, rawmemchr, > > - (CPU_FEATURE_USABLE (AVX2) > > - && CPU_FEATURE_USABLE (RTM)), > > - __rawmemchr_avx2_rtm) > > - IFUNC_IMPL_ADD (array, i, rawmemchr, > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __rawmemchr_evex) > > - IFUNC_IMPL_ADD (array, i, rawmemchr, > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __rawmemchr_evex_rtm) > > - IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) > > + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, > > + CPU_FEATURE_USABLE (AVX2), > > + __rawmemchr_avx2) > > + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (RTM)), > > + __rawmemchr_avx2_rtm) > > + /* Can be lowered to V1 if a V2 implementation is added. */ > > + X86_IFUNC_IMPL_ADD_V2 (array, i, rawmemchr, > > + 1, > > + __rawmemchr_sse2)) > > > > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > > IFUNC_IMPL (i, name, strlen, > > @@ -748,24 +755,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > /* Support sysdeps/x86_64/multiarch/wmemchr.c. */ > > IFUNC_IMPL (i, name, wmemchr, > > - IFUNC_IMPL_ADD (array, i, wmemchr, > > - CPU_FEATURE_USABLE (AVX2), > > - __wmemchr_avx2) > > - IFUNC_IMPL_ADD (array, i, wmemchr, > > - (CPU_FEATURE_USABLE (AVX2) > > - && CPU_FEATURE_USABLE (RTM)), > > - __wmemchr_avx2_rtm) > > - IFUNC_IMPL_ADD (array, i, wmemchr, > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __wmemchr_evex) > > - IFUNC_IMPL_ADD (array, i, wmemchr, > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __wmemchr_evex_rtm) > > - IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) > > + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, > > + CPU_FEATURE_USABLE (AVX2), > > + __wmemchr_avx2) > > + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (RTM)), > > + __wmemchr_avx2_rtm) > > + /* Can be lowered to V1 if a V2 implementation is added. */ > > + X86_IFUNC_IMPL_ADD_V2 (array, i, wmemchr, > > + 1, > > + __wmemchr_sse2)) > > > > /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ > > IFUNC_IMPL (i, name, wmemcmp, > > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S > > index c5a256eb37..691662f0fb 100644 > > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S > > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S > > @@ -16,7 +16,15 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#if IS_IN (libc) > > +#include <isa-level.h> > > + > > +#if defined IS_MULTIARCH && defined ISA_DEFAULT_IMPL > > Where is IS_MULTIARCH defined? > > > > +# error "Multiarch build should never default include!" 
> > +#endif > > + > > +#if (MINIMUM_X86_ISA_LEVEL <= 3 && IS_IN (libc)) \ > > + || defined ISA_DEFAULT_IMPL > > + > > > > # include <sysdep.h> > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S > > index 0fd11b7632..10ed0434ae 100644 > > --- a/sysdeps/x86_64/multiarch/memchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S > > @@ -16,7 +16,15 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#if IS_IN (libc) > > +#include <isa-level.h> > > + > > +#if defined IS_MULTIARCH && defined ISA_DEFAULT_IMPL > > +# error "Multiarch build should never default include!" > > +#endif > > + > > +#if (MINIMUM_X86_ISA_LEVEL <= 4 && IS_IN (libc)) \ > > + || defined ISA_DEFAULT_IMPL > > + > > > > # include <sysdep.h> > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-sse2.S b/sysdeps/x86_64/multiarch/memchr-sse2.S > > index 2c6fdd41d6..acd5c15e22 100644 > > --- a/sysdeps/x86_64/multiarch/memchr-sse2.S > > +++ b/sysdeps/x86_64/multiarch/memchr-sse2.S > > @@ -16,13 +16,367 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#if IS_IN (libc) > > -# define memchr __memchr_sse2 > > +#include <isa-level.h> > > > > -# undef strong_alias > > -# define strong_alias(memchr, __memchr) > > -# undef libc_hidden_builtin_def > > -# define libc_hidden_builtin_def(memchr) > > +#if defined IS_MULTIARCH && defined ISA_DEFAULT_IMPL > > +# error "Multiarch build should never default include!" > > #endif > > > > -#include "../memchr.S" > > +/* __X86_ISA_LEVEL <= 2 because there is no V2 implementation so we > > + need this to build for ISA V2 builds. */ > > +#if (MINIMUM_X86_ISA_LEVEL <= 2 && IS_IN (libc)) \ > > + || defined ISA_DEFAULT_IMPL > > + > > + > > +# include <sysdep.h> > > + > > +# ifndef MEMCHR > > +# define MEMCHR __memchr_sse2 > > +# endif > > +# ifdef USE_AS_WMEMCHR > > +# define PCMPEQ pcmpeqd > > +# define CHAR_PER_VEC 4 > > +# else > > +# define PCMPEQ pcmpeqb > > +# define CHAR_PER_VEC 16 > > +# endif > > + > > +/* fast SSE2 version with using pmaxub and 64 byte loop */ > > + > > + .text > > +ENTRY(MEMCHR) > > + movd %esi, %xmm1 > > + mov %edi, %ecx > > + > > +# ifdef __ILP32__ > > + /* Clear the upper 32 bits. */ > > + movl %edx, %edx > > +# endif > > +# ifdef USE_AS_WMEMCHR > > + test %RDX_LP, %RDX_LP > > + jz L(return_null) > > +# else > > + punpcklbw %xmm1, %xmm1 > > + test %RDX_LP, %RDX_LP > > + jz L(return_null) > > + punpcklbw %xmm1, %xmm1 > > +# endif > > + > > + and $63, %ecx > > + pshufd $0, %xmm1, %xmm1 > > + > > + cmp $48, %ecx > > + ja L(crosscache) > > + > > + movdqu (%rdi), %xmm0 > > + PCMPEQ %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + > > + jnz L(matches_1) > > + sub $CHAR_PER_VEC, %rdx > > + jbe L(return_null) > > + add $16, %rdi > > + and $15, %ecx > > + and $-16, %rdi > > +# ifdef USE_AS_WMEMCHR > > + shr $2, %ecx > > +# endif > > + add %rcx, %rdx > > + sub $(CHAR_PER_VEC * 4), %rdx > > + jbe L(exit_loop) > > + jmp L(loop_prolog) > > + > > + .p2align 4 > > +L(crosscache): > > + and $15, %ecx > > + and $-16, %rdi > > + movdqa (%rdi), %xmm0 > > + > > + PCMPEQ %xmm1, %xmm0 > > + /* Check if there is a match. */ > > + pmovmskb %xmm0, %eax > > + /* Remove the leading bytes. */ > > + sar %cl, %eax > > + test %eax, %eax > > + je L(unaligned_no_match) > > + /* Check which byte is a match. 
*/ > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + add %rdi, %rax > > + add %rcx, %rax > > + ret > > + > > + .p2align 4 > > +L(unaligned_no_match): > > + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using > > + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void > > + possible addition overflow. */ > > + neg %rcx > > + add $16, %rcx > > +# ifdef USE_AS_WMEMCHR > > + shr $2, %ecx > > +# endif > > + sub %rcx, %rdx > > + jbe L(return_null) > > + add $16, %rdi > > + sub $(CHAR_PER_VEC * 4), %rdx > > + jbe L(exit_loop) > > + > > + .p2align 4 > > +L(loop_prolog): > > + movdqa (%rdi), %xmm0 > > + PCMPEQ %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + PCMPEQ %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + movdqa 48(%rdi), %xmm4 > > + PCMPEQ %xmm1, %xmm4 > > + add $64, %rdi > > + pmovmskb %xmm4, %eax > > + test %eax, %eax > > + jnz L(matches0) > > + > > + test $0x3f, %rdi > > + jz L(align64_loop) > > + > > + sub $(CHAR_PER_VEC * 4), %rdx > > + jbe L(exit_loop) > > + > > + movdqa (%rdi), %xmm0 > > + PCMPEQ %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + PCMPEQ %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + movdqa 48(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + > > + add $64, %rdi > > + test %eax, %eax > > + jnz L(matches0) > > + > > + mov %rdi, %rcx > > + and $-64, %rdi > > + and $63, %ecx > > +# ifdef USE_AS_WMEMCHR > > + shr $2, %ecx > > +# endif > > + add %rcx, %rdx > > + > > + .p2align 4 > > +L(align64_loop): > > + sub $(CHAR_PER_VEC * 4), %rdx > > + jbe L(exit_loop) > > + movdqa (%rdi), %xmm0 > > + movdqa 16(%rdi), %xmm2 > > + movdqa 32(%rdi), %xmm3 > > + movdqa 48(%rdi), %xmm4 > > + > > + PCMPEQ %xmm1, %xmm0 > > + PCMPEQ %xmm1, %xmm2 > > + PCMPEQ %xmm1, %xmm3 > > + PCMPEQ %xmm1, %xmm4 > > + > > + pmaxub %xmm0, %xmm3 > > + pmaxub %xmm2, %xmm4 > > + pmaxub %xmm3, %xmm4 > > + pmovmskb %xmm4, %eax > > + > > + add $64, %rdi > > + > > + test %eax, %eax > > + jz L(align64_loop) > > + > > + sub $64, %rdi > > + > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + > > + PCMPEQ 48(%rdi), %xmm1 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + pmovmskb %xmm1, %eax > > + bsf %eax, %eax > > + lea 48(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(exit_loop): > > + add $(CHAR_PER_VEC * 2), %edx > > + jle L(exit_loop_32) > > + > > + movdqa (%rdi), %xmm0 > > + PCMPEQ %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + PCMPEQ %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32_1) > > + sub 
$CHAR_PER_VEC, %edx > > + jle L(return_null) > > + > > + PCMPEQ 48(%rdi), %xmm1 > > + pmovmskb %xmm1, %eax > > + test %eax, %eax > > + jnz L(matches48_1) > > + xor %eax, %eax > > + ret > > + > > + .p2align 4 > > +L(exit_loop_32): > > + add $(CHAR_PER_VEC * 2), %edx > > + movdqa (%rdi), %xmm0 > > + PCMPEQ %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches_1) > > + sub $CHAR_PER_VEC, %edx > > + jbe L(return_null) > > + > > + PCMPEQ 16(%rdi), %xmm1 > > + pmovmskb %xmm1, %eax > > + test %eax, %eax > > + jnz L(matches16_1) > > + xor %eax, %eax > > + ret > > + > > + .p2align 4 > > +L(matches0): > > + bsf %eax, %eax > > + lea -16(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches): > > + bsf %eax, %eax > > + add %rdi, %rax > > + ret > > + > > + .p2align 4 > > +L(matches16): > > + bsf %eax, %eax > > + lea 16(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches32): > > + bsf %eax, %eax > > + lea 32(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches_1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + add %rdi, %rax > > + ret > > + > > + .p2align 4 > > +L(matches16_1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + lea 16(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(matches32_1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + lea 32(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(matches48_1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + lea 48(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(return_null): > > + xor %eax, %eax > > + ret > > +END(MEMCHR) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > > index acc5f6e2fb..5c1dcd3ca7 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __rawmemchr_avx2_rtm > > -#define USE_AS_RAWMEMCHR 1 > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_avx2_rtm > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > > > #include "memchr-avx2-rtm.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > > index 128f9ea637..d6bff28757 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __rawmemchr_avx2 > > -#define USE_AS_RAWMEMCHR 1 > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_avx2 > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > > > #include "memchr-avx2.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > > index deda1ca395..8ff7f27c9c 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > > @@ -1,3 +1,7 @@ > > -#define MEMCHR __rawmemchr_evex_rtm > > -#define USE_AS_RAWMEMCHR 1 > > 
+#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_evex_rtm > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > + > > #include "memchr-evex-rtm.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > > index ec942b77ba..dc1c450699 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __rawmemchr_evex > > -#define USE_AS_RAWMEMCHR 1 > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_evex > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > > > #include "memchr-evex.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > > index 3841c14c34..73f4fa9589 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > > @@ -16,14 +16,199 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -/* Define multiple versions only for the definition in libc. */ > > -#if IS_IN (libc) > > -# define __rawmemchr __rawmemchr_sse2 > > - > > -# undef weak_alias > > -# define weak_alias(__rawmemchr, rawmemchr) > > -# undef libc_hidden_def > > -# define libc_hidden_def(__rawmemchr) > > +#include <isa-level.h> > > + > > +#if defined IS_MULTIARCH && defined ISA_DEFAULT_IMPL > > +# error "Multiarch build should never default include!" > > #endif > > > > -#include "../rawmemchr.S" > > +/* __X86_ISA_LEVEL <= 2 because there is no V2 implementation so we > > + need this to build for ISA V2 builds. */ > > +#if (MINIMUM_X86_ISA_LEVEL <= 2 && IS_IN (libc)) \ > > + || defined ISA_DEFAULT_IMPL > > + > > + > > +# include <sysdep.h> > > + > > +# ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_sse2 > > +# endif > > + > > + .text > > +ENTRY (RAWMEMCHR) > > + movd %rsi, %xmm1 > > + mov %rdi, %rcx > > + > > + punpcklbw %xmm1, %xmm1 > > + punpcklbw %xmm1, %xmm1 > > + > > + and $63, %rcx > > + pshufd $0, %xmm1, %xmm1 > > + > > + cmp $48, %rcx > > + ja L(crosscache) > > + > > + movdqu (%rdi), %xmm0 > > + pcmpeqb %xmm1, %xmm0 > > +/* Check if there is a match. */ > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + > > + jnz L(matches) > > + add $16, %rdi > > + and $-16, %rdi > > + jmp L(loop_prolog) > > + > > + .p2align 4 > > +L(crosscache): > > + and $15, %rcx > > + and $-16, %rdi > > + movdqa (%rdi), %xmm0 > > + > > + pcmpeqb %xmm1, %xmm0 > > +/* Check if there is a match. */ > > + pmovmskb %xmm0, %eax > > +/* Remove the leading bytes. */ > > + sar %cl, %eax > > + test %eax, %eax > > + je L(unaligned_no_match) > > +/* Check which byte is a match. 
*/ > > + bsf %eax, %eax > > + > > + add %rdi, %rax > > + add %rcx, %rax > > + ret > > + > > + .p2align 4 > > +L(unaligned_no_match): > > + add $16, %rdi > > + > > + .p2align 4 > > +L(loop_prolog): > > + movdqa (%rdi), %xmm0 > > + pcmpeqb %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + pcmpeqb %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + pcmpeqb %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + movdqa 48(%rdi), %xmm4 > > + pcmpeqb %xmm1, %xmm4 > > + add $64, %rdi > > + pmovmskb %xmm4, %eax > > + test %eax, %eax > > + jnz L(matches0) > > + > > + test $0x3f, %rdi > > + jz L(align64_loop) > > + > > + movdqa (%rdi), %xmm0 > > + pcmpeqb %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + pcmpeqb %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + pcmpeqb %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + movdqa 48(%rdi), %xmm3 > > + pcmpeqb %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + > > + add $64, %rdi > > + test %eax, %eax > > + jnz L(matches0) > > + > > + and $-64, %rdi > > + > > + .p2align 4 > > +L(align64_loop): > > + movdqa (%rdi), %xmm0 > > + movdqa 16(%rdi), %xmm2 > > + movdqa 32(%rdi), %xmm3 > > + movdqa 48(%rdi), %xmm4 > > + > > + pcmpeqb %xmm1, %xmm0 > > + pcmpeqb %xmm1, %xmm2 > > + pcmpeqb %xmm1, %xmm3 > > + pcmpeqb %xmm1, %xmm4 > > + > > + pmaxub %xmm0, %xmm3 > > + pmaxub %xmm2, %xmm4 > > + pmaxub %xmm3, %xmm4 > > + pmovmskb %xmm4, %eax > > + > > + add $64, %rdi > > + > > + test %eax, %eax > > + jz L(align64_loop) > > + > > + sub $64, %rdi > > + > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + pcmpeqb %xmm1, %xmm3 > > + > > + pcmpeqb 48(%rdi), %xmm1 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + pmovmskb %xmm1, %eax > > + bsf %eax, %eax > > + lea 48(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(matches0): > > + bsf %eax, %eax > > + lea -16(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches): > > + bsf %eax, %eax > > + add %rdi, %rax > > + ret > > + > > + .p2align 4 > > +L(matches16): > > + bsf %eax, %eax > > + lea 16(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches32): > > + bsf %eax, %eax > > + lea 32(%rax, %rdi), %rax > > + ret > > + > > +END (RAWMEMCHR) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/rtld-memchr.S b/sysdeps/x86_64/multiarch/rtld-memchr.S > > new file mode 100644 > > index 0000000000..a14b192bed > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/rtld-memchr.S > > @@ -0,0 +1,18 @@ > > +/* Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include "../memchr.S" > > diff --git a/sysdeps/x86_64/multiarch/rtld-rawmemchr.S b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S > > new file mode 100644 > > index 0000000000..5d4110a052 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S > > @@ -0,0 +1,18 @@ > > +/* Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include "../rawmemchr.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > > index 58ed21db01..2a1cff5b05 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __wmemchr_avx2_rtm > > -#define USE_AS_WMEMCHR 1 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_avx2_rtm > > +#endif > > +#define USE_AS_WMEMCHR 1 > > +#define MEMCHR WMEMCHR > > > > #include "memchr-avx2-rtm.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S > > index 282854f1a1..2bf93fd84b 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-avx2.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __wmemchr_avx2 > > -#define USE_AS_WMEMCHR 1 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_avx2 > > +#endif > > +#define USE_AS_WMEMCHR 1 > > +#define MEMCHR WMEMCHR > > > > #include "memchr-avx2.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > > index a346cd35a1..c67309e8a1 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > > @@ -1,3 +1,7 @@ > > -#define MEMCHR __wmemchr_evex_rtm > > -#define USE_AS_WMEMCHR 1 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_evex_rtm > > +#endif > > +#define USE_AS_WMEMCHR 1 > > +#define MEMCHR WMEMCHR > > + > > #include "memchr-evex-rtm.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S > > index 06cd0f9f5a..5512d5cdc3 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __wmemchr_evex > > -#define USE_AS_WMEMCHR 1 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_evex > > +#endif > > +#define USE_AS_WMEMCHR 1 > > 
+#define MEMCHR WMEMCHR > > > > #include "memchr-evex.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-sse2.S b/sysdeps/x86_64/multiarch/wmemchr-sse2.S > > index 70a965d552..3081fb6821 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-sse2.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-sse2.S > > @@ -1,4 +1,25 @@ > > -#define USE_AS_WMEMCHR 1 > > -#define wmemchr __wmemchr_sse2 > > +/* wmemchr optimized with SSE2 > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > > > -#include "../memchr.S" > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_sse2 > > +#endif > > +#define USE_AS_WMEMCHR 1 > > +#define MEMCHR WMEMCHR > > + > > +#include "memchr-sse2.S" > > diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S > > index 4c1a3383b9..e401a2ac53 100644 > > --- a/sysdeps/x86_64/rawmemchr.S > > +++ b/sysdeps/x86_64/rawmemchr.S > > @@ -17,185 +17,13 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#include <sysdep.h> > > +#define RAWMEMCHR rawmemchr > > > > - .text > > -ENTRY (__rawmemchr) > > - movd %rsi, %xmm1 > > - mov %rdi, %rcx > > +#define DEFAULT_IMPL_V1 "multiarch/rawmemchr-sse2.S" > > +#define DEFAULT_IMPL_V3 "multiarch/rawmemchr-avx2.S" > > +#define DEFAULT_IMPL_V4 "multiarch/rawmemchr-evex.S" > > > > - punpcklbw %xmm1, %xmm1 > > - punpcklbw %xmm1, %xmm1 > > +#include "isa-default-impl.h" > > > > - and $63, %rcx > > - pshufd $0, %xmm1, %xmm1 > > - > > - cmp $48, %rcx > > - ja L(crosscache) > > - > > - movdqu (%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > -/* Check if there is a match. */ > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - > > - jnz L(matches) > > - add $16, %rdi > > - and $-16, %rdi > > - jmp L(loop_prolog) > > - > > - .p2align 4 > > -L(crosscache): > > - and $15, %rcx > > - and $-16, %rdi > > - movdqa (%rdi), %xmm0 > > - > > - pcmpeqb %xmm1, %xmm0 > > -/* Check if there is a match. */ > > - pmovmskb %xmm0, %eax > > -/* Remove the leading bytes. */ > > - sar %cl, %eax > > - test %eax, %eax > > - je L(unaligned_no_match) > > -/* Check which byte is a match. 
*/ > > - bsf %eax, %eax > > - > > - add %rdi, %rax > > - add %rcx, %rax > > - ret > > - > > - .p2align 4 > > -L(unaligned_no_match): > > - add $16, %rdi > > - > > - .p2align 4 > > -L(loop_prolog): > > - movdqa (%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - movdqa 16(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 48(%rdi), %xmm4 > > - pcmpeqb %xmm1, %xmm4 > > - add $64, %rdi > > - pmovmskb %xmm4, %eax > > - test %eax, %eax > > - jnz L(matches0) > > - > > - test $0x3f, %rdi > > - jz L(align64_loop) > > - > > - movdqa (%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - movdqa 16(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 48(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - > > - add $64, %rdi > > - test %eax, %eax > > - jnz L(matches0) > > - > > - and $-64, %rdi > > - > > - .p2align 4 > > -L(align64_loop): > > - movdqa (%rdi), %xmm0 > > - movdqa 16(%rdi), %xmm2 > > - movdqa 32(%rdi), %xmm3 > > - movdqa 48(%rdi), %xmm4 > > - > > - pcmpeqb %xmm1, %xmm0 > > - pcmpeqb %xmm1, %xmm2 > > - pcmpeqb %xmm1, %xmm3 > > - pcmpeqb %xmm1, %xmm4 > > - > > - pmaxub %xmm0, %xmm3 > > - pmaxub %xmm2, %xmm4 > > - pmaxub %xmm3, %xmm4 > > - pmovmskb %xmm4, %eax > > - > > - add $64, %rdi > > - > > - test %eax, %eax > > - jz L(align64_loop) > > - > > - sub $64, %rdi > > - > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - > > - pcmpeqb 48(%rdi), %xmm1 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - pmovmskb %xmm1, %eax > > - bsf %eax, %eax > > - lea 48(%rdi, %rax), %rax > > - ret > > - > > - .p2align 4 > > -L(matches0): > > - bsf %eax, %eax > > - lea -16(%rax, %rdi), %rax > > - ret > > - > > - .p2align 4 > > -L(matches): > > - bsf %eax, %eax > > - add %rdi, %rax > > - ret > > - > > - .p2align 4 > > -L(matches16): > > - bsf %eax, %eax > > - lea 16(%rax, %rdi), %rax > > - ret > > - > > - .p2align 4 > > -L(matches32): > > - bsf %eax, %eax > > - lea 32(%rax, %rdi), %rax > > - ret > > - > > -END (__rawmemchr) > > - > > -weak_alias (__rawmemchr, rawmemchr) > > -libc_hidden_builtin_def (__rawmemchr) > > +strong_alias (rawmemchr, __rawmemchr) > > +libc_hidden_builtin_def (rawmemchr) > > diff --git a/sysdeps/x86_64/wmemchr.S b/sysdeps/x86_64/wmemchr.S > > new file mode 100644 > > index 0000000000..dd0490f86b > > --- /dev/null > > +++ b/sysdeps/x86_64/wmemchr.S > > @@ -0,0 +1,24 @@ > > +/* Copyright (C) 2011-2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#define WMEMCHR wmemchr > > + > > +#define DEFAULT_IMPL_V1 "multiarch/wmemchr-sse2.S" > > +#define DEFAULT_IMPL_V3 "multiarch/wmemchr-avx2.S" > > +#define DEFAULT_IMPL_V4 "multiarch/wmemchr-evex.S" > > + > > +#include "isa-default-impl.h" > > -- > > 2.34.1 > > > > > -- > H.J.
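A closing note on the isa-default-impl.h mechanism that the new memchr.S, rawmemchr.S and wmemchr.S stubs rely on: that header is not part of this patch, but from the guards above its core plausibly reduces to the sketch below. This is an assumption about its shape, not its actual contents; in particular the real header may diagnose a missing DEFAULT_IMPL_V* definition rather than silently falling back a level.

/* Hypothetical reduction of "isa-default-impl.h".  */
#ifndef DEFAULT_IMPL_V2
/* No V2 file exists for {raw|w}memchr, so a -v2 build reuses the V1
   (sse2) source -- which is why the sse2 implementations stay
   buildable while MINIMUM_X86_ISA_LEVEL <= 2.  */
# define DEFAULT_IMPL_V2 DEFAULT_IMPL_V1
#endif

#if MINIMUM_X86_ISA_LEVEL >= 4
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V4
#elif MINIMUM_X86_ISA_LEVEL == 3
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V3
#elif MINIMUM_X86_ISA_LEVEL == 2
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V2
#else
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V1
#endif

/* Exactly one implementation gets pulled in.  ISA_DEFAULT_IMPL being
   defined is also what, together with IS_MULTIARCH, trips the
   "Multiarch build should never default include!" guard in the
   multiarch sources.  */
#include ISA_DEFAULT_IMPL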
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index a160fd9b00..018bb06f04 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -15,358 +15,13 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include <sysdep.h> +#define MEMCHR memchr -#ifdef USE_AS_WMEMCHR -# define MEMCHR wmemchr -# define PCMPEQ pcmpeqd -# define CHAR_PER_VEC 4 -#else -# define MEMCHR memchr -# define PCMPEQ pcmpeqb -# define CHAR_PER_VEC 16 -#endif +#define DEFAULT_IMPL_V1 "multiarch/memchr-sse2.S" +#define DEFAULT_IMPL_V3 "multiarch/memchr-avx2.S" +#define DEFAULT_IMPL_V4 "multiarch/memchr-evex.S" -/* fast SSE2 version with using pmaxub and 64 byte loop */ +#include "isa-default-impl.h" - .text -ENTRY(MEMCHR) - movd %esi, %xmm1 - mov %edi, %ecx - -#ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx -#endif -#ifdef USE_AS_WMEMCHR - test %RDX_LP, %RDX_LP - jz L(return_null) -#else - punpcklbw %xmm1, %xmm1 - test %RDX_LP, %RDX_LP - jz L(return_null) - punpcklbw %xmm1, %xmm1 -#endif - - and $63, %ecx - pshufd $0, %xmm1, %xmm1 - - cmp $48, %ecx - ja L(crosscache) - - movdqu (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - - jnz L(matches_1) - sub $CHAR_PER_VEC, %rdx - jbe L(return_null) - add $16, %rdi - and $15, %ecx - and $-16, %rdi -#ifdef USE_AS_WMEMCHR - shr $2, %ecx -#endif - add %rcx, %rdx - sub $(CHAR_PER_VEC * 4), %rdx - jbe L(exit_loop) - jmp L(loop_prolog) - - .p2align 4 -L(crosscache): - and $15, %ecx - and $-16, %rdi - movdqa (%rdi), %xmm0 - - PCMPEQ %xmm1, %xmm0 - /* Check if there is a match. */ - pmovmskb %xmm0, %eax - /* Remove the leading bytes. */ - sar %cl, %eax - test %eax, %eax - je L(unaligned_no_match) - /* Check which byte is a match. */ - bsf %eax, %eax -#ifdef USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - add %rdi, %rax - add %rcx, %rax - ret - - .p2align 4 -L(unaligned_no_match): - /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using - "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void - possible addition overflow. 
*/ - neg %rcx - add $16, %rcx -#ifdef USE_AS_WMEMCHR - shr $2, %ecx -#endif - sub %rcx, %rdx - jbe L(return_null) - add $16, %rdi - sub $(CHAR_PER_VEC * 4), %rdx - jbe L(exit_loop) - - .p2align 4 -L(loop_prolog): - movdqa (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - PCMPEQ %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm4 - PCMPEQ %xmm1, %xmm4 - add $64, %rdi - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - test $0x3f, %rdi - jz L(align64_loop) - - sub $(CHAR_PER_VEC * 4), %rdx - jbe L(exit_loop) - - movdqa (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - PCMPEQ %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - pmovmskb %xmm3, %eax - - add $64, %rdi - test %eax, %eax - jnz L(matches0) - - mov %rdi, %rcx - and $-64, %rdi - and $63, %ecx -#ifdef USE_AS_WMEMCHR - shr $2, %ecx -#endif - add %rcx, %rdx - - .p2align 4 -L(align64_loop): - sub $(CHAR_PER_VEC * 4), %rdx - jbe L(exit_loop) - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - PCMPEQ %xmm1, %xmm0 - PCMPEQ %xmm1, %xmm2 - PCMPEQ %xmm1, %xmm3 - PCMPEQ %xmm1, %xmm4 - - pmaxub %xmm0, %xmm3 - pmaxub %xmm2, %xmm4 - pmaxub %xmm3, %xmm4 - pmovmskb %xmm4, %eax - - add $64, %rdi - - test %eax, %eax - jz L(align64_loop) - - sub $64, %rdi - - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - - PCMPEQ 48(%rdi), %xmm1 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - pmovmskb %xmm1, %eax - bsf %eax, %eax - lea 48(%rdi, %rax), %rax - ret - - .p2align 4 -L(exit_loop): - add $(CHAR_PER_VEC * 2), %edx - jle L(exit_loop_32) - - movdqa (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - PCMPEQ %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32_1) - sub $CHAR_PER_VEC, %edx - jle L(return_null) - - PCMPEQ 48(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches48_1) - xor %eax, %eax - ret - - .p2align 4 -L(exit_loop_32): - add $(CHAR_PER_VEC * 2), %edx - movdqa (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches_1) - sub $CHAR_PER_VEC, %edx - jbe L(return_null) - - PCMPEQ 16(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches16_1) - xor %eax, %eax - ret - - .p2align 4 -L(matches0): - bsf %eax, %eax - lea -16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches): - bsf %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsf %eax, %eax - lea 16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches32): - bsf %eax, %eax - lea 32(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches_1): - bsf %eax, %eax -#ifdef USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - add %rdi, %rax - ret - - .p2align 4 -L(matches16_1): - bsf %eax, %eax -#ifdef 
USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - lea 16(%rdi, %rax), %rax - ret - - .p2align 4 -L(matches32_1): - bsf %eax, %eax -#ifdef USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - lea 32(%rdi, %rax), %rax - ret - - .p2align 4 -L(matches48_1): - bsf %eax, %eax -#ifdef USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - lea 48(%rdi, %rax), %rax - ret - - .p2align 4 -L(return_null): - xor %eax, %eax - ret -END(MEMCHR) - -#ifndef USE_AS_WMEMCHR strong_alias (memchr, __memchr) libc_hidden_builtin_def(memchr) -#endif diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h index b8f7a12ea2..a2f854b98d 100644 --- a/sysdeps/x86_64/multiarch/ifunc-evex.h +++ b/sysdeps/x86_64/multiarch/ifunc-evex.h @@ -19,24 +19,28 @@ #include <init-arch.h> -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + +/* TODO: Look into using the ISA build level to remove some/all of the + feature checks. */ static inline void * IFUNC_SELECTOR (void) { - const struct cpu_features* cpu_features = __get_cpu_features (); + const struct cpu_features *cpu_features = __get_cpu_features (); - if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) - && CPU_FEATURE_USABLE_P (cpu_features, BMI2) - && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, + AVX_Fast_Unaligned_Load)) { - if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) - && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) { if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) return OPTIMIZE (evex_rtm); @@ -44,12 +48,19 @@ IFUNC_SELECTOR (void) return OPTIMIZE (evex); } + X86_ERROR_IF_REACHABLE_V4 (); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) return OPTIMIZE (avx2_rtm); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, + Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } + X86_ERROR_IF_REACHABLE_V3 (); + + /* This is unreachable (compile time checked) if ISA level >= 3 + so no need for a robust fallback here. */ return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 883362f63d..bf52cf96d0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -25,7 +25,8 @@ /* Fill ARRAY of MAX elements with IFUNC implementations for function NAME supported on target machine and return the number of valid - entries. */ + entries. Each set of implementations for a given function is sorted in + descending order by ISA level. 
*/ size_t __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, @@ -53,24 +54,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/memchr.c. */ IFUNC_IMPL (i, name, memchr, - IFUNC_IMPL_ADD (array, i, memchr, - CPU_FEATURE_USABLE (AVX2), - __memchr_avx2) - IFUNC_IMPL_ADD (array, i, memchr, - (CPU_FEATURE_USABLE (AVX2) - && CPU_FEATURE_USABLE (RTM)), - __memchr_avx2_rtm) - IFUNC_IMPL_ADD (array, i, memchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __memchr_evex) - IFUNC_IMPL_ADD (array, i, memchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __memchr_evex_rtm) - IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, + CPU_FEATURE_USABLE (AVX2), + __memchr_avx2) + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memchr_avx2_rtm) + /* Can be lowered to V1 if a V2 implementation is added. */ + X86_IFUNC_IMPL_ADD_V2 (array, i, memchr, + 1, + __memchr_sse2)) /* Support sysdeps/x86_64/multiarch/memcmp.c. */ IFUNC_IMPL (i, name, memcmp, @@ -288,24 +292,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ IFUNC_IMPL (i, name, rawmemchr, - IFUNC_IMPL_ADD (array, i, rawmemchr, - CPU_FEATURE_USABLE (AVX2), - __rawmemchr_avx2) - IFUNC_IMPL_ADD (array, i, rawmemchr, - (CPU_FEATURE_USABLE (AVX2) - && CPU_FEATURE_USABLE (RTM)), - __rawmemchr_avx2_rtm) - IFUNC_IMPL_ADD (array, i, rawmemchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __rawmemchr_evex) - IFUNC_IMPL_ADD (array, i, rawmemchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __rawmemchr_evex_rtm) - IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, + CPU_FEATURE_USABLE (AVX2), + __rawmemchr_avx2) + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __rawmemchr_avx2_rtm) + /* Can be lowered to V1 if a V2 implementation is added. */ + X86_IFUNC_IMPL_ADD_V2 (array, i, rawmemchr, + 1, + __rawmemchr_sse2)) /* Support sysdeps/x86_64/multiarch/strlen.c. */ IFUNC_IMPL (i, name, strlen, @@ -748,24 +755,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wmemchr.c. 
*/ IFUNC_IMPL (i, name, wmemchr, - IFUNC_IMPL_ADD (array, i, wmemchr, - CPU_FEATURE_USABLE (AVX2), - __wmemchr_avx2) - IFUNC_IMPL_ADD (array, i, wmemchr, - (CPU_FEATURE_USABLE (AVX2) - && CPU_FEATURE_USABLE (RTM)), - __wmemchr_avx2_rtm) - IFUNC_IMPL_ADD (array, i, wmemchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wmemchr_evex) - IFUNC_IMPL_ADD (array, i, wmemchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wmemchr_evex_rtm) - IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, + CPU_FEATURE_USABLE (AVX2), + __wmemchr_avx2) + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wmemchr_avx2_rtm) + /* Can be lowered to V1 if a V2 implementation is added. */ + X86_IFUNC_IMPL_ADD_V2 (array, i, wmemchr, + 1, + __wmemchr_sse2)) /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ IFUNC_IMPL (i, name, wmemcmp, diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index c5a256eb37..691662f0fb 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -16,7 +16,15 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) +#include <isa-level.h> + +#if defined IS_MULTIARCH && defined ISA_DEFAULT_IMPL +# error "Multiarch build should never default include!" +#endif + +#if (MINIMUM_X86_ISA_LEVEL <= 3 && IS_IN (libc)) \ + || defined ISA_DEFAULT_IMPL + # include <sysdep.h> diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S index 0fd11b7632..10ed0434ae 100644 --- a/sysdeps/x86_64/multiarch/memchr-evex.S +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -16,7 +16,15 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) +#include <isa-level.h> + +#if defined IS_MULTIARCH && defined ISA_DEFAULT_IMPL +# error "Multiarch build should never default include!" +#endif + +#if (MINIMUM_X86_ISA_LEVEL <= 4 && IS_IN (libc)) \ + || defined ISA_DEFAULT_IMPL + # include <sysdep.h> diff --git a/sysdeps/x86_64/multiarch/memchr-sse2.S b/sysdeps/x86_64/multiarch/memchr-sse2.S index 2c6fdd41d6..acd5c15e22 100644 --- a/sysdeps/x86_64/multiarch/memchr-sse2.S +++ b/sysdeps/x86_64/multiarch/memchr-sse2.S @@ -16,13 +16,367 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) -# define memchr __memchr_sse2 +#include <isa-level.h> -# undef strong_alias -# define strong_alias(memchr, __memchr) -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(memchr) +#if defined IS_MULTIARCH && defined ISA_DEFAULT_IMPL +# error "Multiarch build should never default include!" #endif -#include "../memchr.S" +/* __X86_ISA_LEVEL <= 2 because there is no V2 implementation so we + need this to build for ISA V2 builds. 
*/ +#if (MINIMUM_X86_ISA_LEVEL <= 2 && IS_IN (libc)) \ + || defined ISA_DEFAULT_IMPL + + +# include <sysdep.h> + +# ifndef MEMCHR +# define MEMCHR __memchr_sse2 +# endif +# ifdef USE_AS_WMEMCHR +# define PCMPEQ pcmpeqd +# define CHAR_PER_VEC 4 +# else +# define PCMPEQ pcmpeqb +# define CHAR_PER_VEC 16 +# endif + +/* fast SSE2 version with using pmaxub and 64 byte loop */ + + .text +ENTRY(MEMCHR) + movd %esi, %xmm1 + mov %edi, %ecx + +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif +# ifdef USE_AS_WMEMCHR + test %RDX_LP, %RDX_LP + jz L(return_null) +# else + punpcklbw %xmm1, %xmm1 + test %RDX_LP, %RDX_LP + jz L(return_null) + punpcklbw %xmm1, %xmm1 +# endif + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %ecx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches_1) + sub $CHAR_PER_VEC, %rdx + jbe L(return_null) + add $16, %rdi + and $15, %ecx + and $-16, %rdi +# ifdef USE_AS_WMEMCHR + shr $2, %ecx +# endif + add %rcx, %rdx + sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %ecx + and $-16, %rdi + movdqa (%rdi), %xmm0 + + PCMPEQ %xmm1, %xmm0 + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) + /* Check which byte is a match. */ + bsf %eax, %eax +# ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void + possible addition overflow. 
*/ + neg %rcx + add $16, %rcx +# ifdef USE_AS_WMEMCHR + shr $2, %ecx +# endif + sub %rcx, %rdx + jbe L(return_null) + add $16, %rdi + sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + PCMPEQ %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + mov %rdi, %rcx + and $-64, %rdi + and $63, %ecx +# ifdef USE_AS_WMEMCHR + shr $2, %ecx +# endif + add %rcx, %rdx + + .p2align 4 +L(align64_loop): + sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + PCMPEQ %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm2 + PCMPEQ %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + + PCMPEQ 48(%rdi), %xmm1 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(exit_loop): + add $(CHAR_PER_VEC * 2), %edx + jle L(exit_loop_32) + + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) + sub $CHAR_PER_VEC, %edx + jle L(return_null) + + PCMPEQ 48(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches48_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + add $(CHAR_PER_VEC * 2), %edx + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) + sub $CHAR_PER_VEC, %edx + jbe L(return_null) + + PCMPEQ 16(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches16_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches_1): + bsf %eax, %eax +# ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + add %rdi, %rax + ret + + .p2align 4 +L(matches16_1): + bsf %eax, %eax +# ifdef 
USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + lea 16(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches32_1): + bsf %eax, %eax +# ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + lea 32(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches48_1): + bsf %eax, %eax +# ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret +END(MEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S index acc5f6e2fb..5c1dcd3ca7 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S @@ -1,4 +1,7 @@ -#define MEMCHR __rawmemchr_avx2_rtm -#define USE_AS_RAWMEMCHR 1 +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_avx2_rtm +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR #include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S index 128f9ea637..d6bff28757 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S @@ -1,4 +1,7 @@ -#define MEMCHR __rawmemchr_avx2 -#define USE_AS_RAWMEMCHR 1 +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_avx2 +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR #include "memchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S index deda1ca395..8ff7f27c9c 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S @@ -1,3 +1,7 @@ -#define MEMCHR __rawmemchr_evex_rtm -#define USE_AS_RAWMEMCHR 1 +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex_rtm +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR + #include "memchr-evex-rtm.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S index ec942b77ba..dc1c450699 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S @@ -1,4 +1,7 @@ -#define MEMCHR __rawmemchr_evex -#define USE_AS_RAWMEMCHR 1 +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR #include "memchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S index 3841c14c34..73f4fa9589 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S @@ -16,14 +16,199 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -/* Define multiple versions only for the definition in libc. */ -#if IS_IN (libc) -# define __rawmemchr __rawmemchr_sse2 - -# undef weak_alias -# define weak_alias(__rawmemchr, rawmemchr) -# undef libc_hidden_def -# define libc_hidden_def(__rawmemchr) +#include <isa-level.h> + +#if defined IS_MULTIARCH && defined ISA_DEFAULT_IMPL +# error "Multiarch build should never default include!" #endif -#include "../rawmemchr.S" +/* __X86_ISA_LEVEL <= 2 because there is no V2 implementation so we + need this to build for ISA V2 builds. 
*/ +#if (MINIMUM_X86_ISA_LEVEL <= 2 && IS_IN (libc)) \ + || defined ISA_DEFAULT_IMPL + + +# include <sysdep.h> + +# ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_sse2 +# endif + + .text +ENTRY (RAWMEMCHR) + movd %rsi, %xmm1 + mov %rdi, %rcx + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + and $63, %rcx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %rcx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches) + add $16, %rdi + and $-16, %rdi + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %rcx + and $-16, %rdi + movdqa (%rdi), %xmm0 + + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax + + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + add $16, %rdi + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + pcmpeqb %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + and $-64, %rdi + + .p2align 4 +L(align64_loop): + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + + pcmpeqb 48(%rdi), %xmm1 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + +END (RAWMEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/rtld-memchr.S b/sysdeps/x86_64/multiarch/rtld-memchr.S new file mode 100644 index 0000000000..a14b192bed --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-memchr.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../memchr.S"
diff --git a/sysdeps/x86_64/multiarch/rtld-rawmemchr.S b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S
new file mode 100644
index 0000000000..5d4110a052
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../rawmemchr.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
index 58ed21db01..2a1cff5b05 100644
--- a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
@@ -1,4 +1,7 @@
-#define MEMCHR __wmemchr_avx2_rtm
-#define USE_AS_WMEMCHR 1
+#ifndef WMEMCHR
+# define WMEMCHR	__wmemchr_avx2_rtm
+#endif
+#define USE_AS_WMEMCHR	1
+#define MEMCHR	WMEMCHR

 #include "memchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S
index 282854f1a1..2bf93fd84b 100644
--- a/sysdeps/x86_64/multiarch/wmemchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S
@@ -1,4 +1,7 @@
-#define MEMCHR __wmemchr_avx2
-#define USE_AS_WMEMCHR 1
+#ifndef WMEMCHR
+# define WMEMCHR	__wmemchr_avx2
+#endif
+#define USE_AS_WMEMCHR	1
+#define MEMCHR	WMEMCHR

 #include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
index a346cd35a1..c67309e8a1 100644
--- a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
@@ -1,3 +1,7 @@
-#define MEMCHR __wmemchr_evex_rtm
-#define USE_AS_WMEMCHR 1
+#ifndef WMEMCHR
+# define WMEMCHR	__wmemchr_evex_rtm
+#endif
+#define USE_AS_WMEMCHR	1
+#define MEMCHR	WMEMCHR
+
 #include "memchr-evex-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S
index 06cd0f9f5a..5512d5cdc3 100644
--- a/sysdeps/x86_64/multiarch/wmemchr-evex.S
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S
@@ -1,4 +1,7 @@
-#define MEMCHR __wmemchr_evex
-#define USE_AS_WMEMCHR 1
+#ifndef WMEMCHR
+# define WMEMCHR	__wmemchr_evex
+#endif
+#define USE_AS_WMEMCHR	1
+#define MEMCHR	WMEMCHR

 #include "memchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-sse2.S b/sysdeps/x86_64/multiarch/wmemchr-sse2.S
index 70a965d552..3081fb6821 100644
--- a/sysdeps/x86_64/multiarch/wmemchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wmemchr-sse2.S
@@ -1,4 +1,25 @@
-#define USE_AS_WMEMCHR 1
-#define wmemchr __wmemchr_sse2
+/* wmemchr optimized with SSE2
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.

-#include "../memchr.S"
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef WMEMCHR
+# define WMEMCHR	__wmemchr_sse2
+#endif
+#define USE_AS_WMEMCHR	1
+#define MEMCHR	WMEMCHR
+
+#include "memchr-sse2.S"
diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S
index 4c1a3383b9..e401a2ac53 100644
--- a/sysdeps/x86_64/rawmemchr.S
+++ b/sysdeps/x86_64/rawmemchr.S
@@ -17,185 +17,13 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */

-#include <sysdep.h>
+#define RAWMEMCHR	rawmemchr

-	.text
-ENTRY (__rawmemchr)
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
+#define DEFAULT_IMPL_V1	"multiarch/rawmemchr-sse2.S"
+#define DEFAULT_IMPL_V3	"multiarch/rawmemchr-avx2.S"
+#define DEFAULT_IMPL_V4	"multiarch/rawmemchr-evex.S"

-	punpcklbw %xmm1, %xmm1
-	punpcklbw %xmm1, %xmm1
+#include "isa-default-impl.h"

-	and	$63, %rcx
-	pshufd	$0, %xmm1, %xmm1
-
-	cmp	$48, %rcx
-	ja	L(crosscache)
-
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-/* Check if there is a match.  */
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-
-	jnz	L(matches)
-	add	$16, %rdi
-	and	$-16, %rdi
-	jmp	L(loop_prolog)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	movdqa	(%rdi), %xmm0
-
-	pcmpeqb	%xmm1, %xmm0
-/* Check if there is a match.  */
-	pmovmskb %xmm0, %eax
-/* Remove the leading bytes.  */
-	sar	%cl, %eax
-	test	%eax, %eax
-	je	L(unaligned_no_match)
-/* Check which byte is a match.  */
-	bsf	%eax, %eax
-
-	add	%rdi, %rax
-	add	%rcx, %rax
-	ret
-
-	.p2align 4
-L(unaligned_no_match):
-	add	$16, %rdi
-
-	.p2align 4
-L(loop_prolog):
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	48(%rdi), %xmm4
-	pcmpeqb	%xmm1, %xmm4
-	add	$64, %rdi
-	pmovmskb %xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	test	$0x3f, %rdi
-	jz	L(align64_loop)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	48(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-
-	add	$64, %rdi
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	and	$-64, %rdi
-
-	.p2align 4
-L(align64_loop):
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	movdqa	48(%rdi), %xmm4
-
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm1, %xmm4
-
-	pmaxub	%xmm0, %xmm3
-	pmaxub	%xmm2, %xmm4
-	pmaxub	%xmm3, %xmm4
-	pmovmskb %xmm4, %eax
-
-	add	$64, %rdi
-
-	test	%eax, %eax
-	jz	L(align64_loop)
-
-	sub	$64, %rdi
-
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-
-	pcmpeqb	48(%rdi), %xmm1
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	pmovmskb %xmm1, %eax
-	bsf	%eax, %eax
-	lea	48(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(matches0):
-	bsf	%eax, %eax
-	lea	-16(%rax, %rdi), %rax
-	ret
-
-	.p2align 4
-L(matches):
-	bsf	%eax, %eax
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16):
-	bsf	%eax, %eax
-	lea	16(%rax, %rdi), %rax
-	ret
-
-	.p2align 4
-L(matches32):
-	bsf	%eax, %eax
-	lea	32(%rax, %rdi), %rax
-	ret
-
-END (__rawmemchr)
-
-weak_alias (__rawmemchr, rawmemchr)
-libc_hidden_builtin_def (__rawmemchr)
+strong_alias (rawmemchr, __rawmemchr)
+libc_hidden_builtin_def (rawmemchr)
diff --git a/sysdeps/x86_64/wmemchr.S b/sysdeps/x86_64/wmemchr.S
new file mode 100644
index 0000000000..dd0490f86b
--- /dev/null
+++ b/sysdeps/x86_64/wmemchr.S
@@ -0,0 +1,24 @@
+/* Copyright (C) 2011-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define WMEMCHR	wmemchr
+
+#define DEFAULT_IMPL_V1	"multiarch/wmemchr-sse2.S"
+#define DEFAULT_IMPL_V3	"multiarch/wmemchr-avx2.S"
+#define DEFAULT_IMPL_V4	"multiarch/wmemchr-evex.S"
+
+#include "isa-default-impl.h"
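
A note on the DEFAULT_IMPL_V{1,3,4} defines used by rawmemchr.S and wmemchr.S
above: isa-default-impl.h itself comes from patch 1/2 of this series, so it is
not visible in this diff. Conceptually it resolves those defines to a single
include. A minimal sketch of that selection, reusing only names this patch
already references (MINIMUM_X86_ISA_LEVEL, DEFAULT_IMPL_V*, ISA_DEFAULT_IMPL);
the fallback chain below is my paraphrase of the mechanism, not the literal
header:

  /* Sketch only: fill empty slots downward so every level has a
     "guaranteed replacement", then include the implementation that
     matches the compiled ISA level.  */
  #ifndef DEFAULT_IMPL_V2
  # define DEFAULT_IMPL_V2 DEFAULT_IMPL_V1
  #endif
  #ifndef DEFAULT_IMPL_V3
  # define DEFAULT_IMPL_V3 DEFAULT_IMPL_V2
  #endif
  #ifndef DEFAULT_IMPL_V4
  # define DEFAULT_IMPL_V4 DEFAULT_IMPL_V3
  #endif

  #if MINIMUM_X86_ISA_LEVEL >= 4
  # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V4
  #elif MINIMUM_X86_ISA_LEVEL == 3
  # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V3
  #elif MINIMUM_X86_ISA_LEVEL == 2
  # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V2
  #else
  # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V1
  #endif

  #include ISA_DEFAULT_IMPL

This is why the top-level files only name the per-level implementations that
actually exist: there is no V2 rawmemchr/wmemchr, so a v2 build simply falls
back to the sse2 (V1) file.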
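The #ifndef RAWMEMCHR / #ifndef WMEMCHR guards in the multiarch wrappers are
what let one file serve both build flavors. In a multiarch build the wrapper is
compiled on its own and falls back to its ISA-suffixed name; in a
non-multiarch build the top-level file pre-defines the public name before the
include chain runs, so the same body is emitted under the public symbol.
Roughly, as a toy illustration of the preprocessor flow (not a real build
step):

  /* Multiarch build: nothing pre-defined, wrapper uses its fallback.  */
  #include "rawmemchr-sse2.S"	/* emits __rawmemchr_sse2 */

  /* Non-multiarch build: rawmemchr.S pre-defines the name first and
     isa-default-impl.h then pulls in the chosen file.  */
  #define RAWMEMCHR rawmemchr
  #include "rawmemchr-sse2.S"	/* emits rawmemchr directly */

The same chain is what keeps the new rtld-{raw}memchr.S files trivial: ld.so
does not go through the ifunc selector for its own internal calls, so those
files just include the non-multiarch {raw}memchr.S and get the compile-time
default for the build's ISA level.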
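Finally, the guard added to rawmemchr-sse2.S is the "guaranteed replacement"
rule from the commit message made concrete. Annotated per compiled ISA level
(the comments are my summary of the guard's effect, quoting the guard as it
appears in the diff above):

  #if (MINIMUM_X86_ISA_LEVEL <= 2 && IS_IN (libc)) \
  	|| defined ISA_DEFAULT_IMPL
  /* level 1: built; the runtime ifunc selector may still pick it.
     level 2: built; no V2 version exists, so the V1 code is the
              guaranteed replacement for a v2 baseline.
     level 3+: skipped in multiarch builds, since the avx2/evex
              versions are always selectable and the sse2 body would
              be dead weight.  */
  #endif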