Message ID | 20220622205813.1923342-2-goldstein.w.n@gmail.com
---|---
State | New
Series | [v10,1/2] x86: Add defines / utilities for making ISA specific x86 builds
On Wed, Jun 22, 2022 at 1:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1. Refactor files so that all implementations are in the multiarch
>    directory.
>     - Essentially moved the sse2 {raw|w}memchr.S implementation to
>       multiarch/{raw|w}memchr-sse2.S
>
>     - The non-multiarch {raw|w}memchr.S file now only includes one of
>       the implementations in the multiarch directory based on the
>       compiled ISA level (only used for non-multiarch builds;
>       otherwise we go through the ifunc selector).
>
> 2. Add ISA level build guards to different implementations.
>     - E.g., memchr-avx2.S, which is ISA level 3, will only build if the
>       compiled ISA level is <= 3. Otherwise there is no reason to include
>       it, as we will always use one of the ISA level 4
>       implementations (memchr-evex{-rtm}.S).
>
> 3. Add new multiarch/rtld-{raw}memchr.S that just include the
>    non-multiarch {raw}memchr.S, which will in turn select the best
>    implementation based on the compiled ISA level.
>
> 4. Refactor the ifunc selector and ifunc implementation list to use
>    the ISA level aware wrapper macros that allow functions below the
>    compiled ISA level (with a guaranteed replacement) to be skipped.
>     - Guaranteed replacement essentially means that for any ISA level
>       build there must be a function that the baseline of the ISA
>       supports. So for {raw|w}memchr.S, since there is no ISA level 2
>       function, the ISA level 2 build still includes the ISA level
>       1 (sse2) function. Once we reach the ISA level 3 build, however,
>       {raw|w}memchr-avx2{-rtm}.S will always be sufficient, so the ISA
>       level 1 implementation ({raw|w}memchr-sse2.S) will not be built.
>
> Tested with and without multiarch on x86_64 for ISA levels:
> {generic, x86-64-v2, x86-64-v3, x86-64-v4}
>
> And m32 with and without multiarch.
> ---
>  sysdeps/x86_64/isa-default-impl.h             |   8 +
>  sysdeps/x86_64/memchr.S                       | 357 +----------------
>  sysdeps/x86_64/multiarch/ifunc-evex.h         |  29 +-
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  72 ++--
>  sysdeps/x86_64/multiarch/memchr-avx2.S        |   5 +-
>  sysdeps/x86_64/multiarch/memchr-evex.S        |   5 +-
>  sysdeps/x86_64/multiarch/memchr-sse2.S        | 363 +++++++++++++++++-
>  sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S |   7 +-
>  sysdeps/x86_64/multiarch/rawmemchr-avx2.S     |   7 +-
>  sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   8 +-
>  sysdeps/x86_64/multiarch/rawmemchr-evex.S     |   7 +-
>  sysdeps/x86_64/multiarch/rawmemchr-sse2.S     | 198 +++++++++-
>  sysdeps/x86_64/multiarch/rtld-memchr.S        |  18 +
>  sysdeps/x86_64/multiarch/rtld-rawmemchr.S     |  18 +
>  sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S   |   7 +-
>  sysdeps/x86_64/multiarch/wmemchr-avx2.S       |   7 +-
>  sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S   |   8 +-
>  sysdeps/x86_64/multiarch/wmemchr-evex.S       |   7 +-
>  sysdeps/x86_64/multiarch/wmemchr-sse2.S       |   9 +-
>  sysdeps/x86_64/rawmemchr.S                    | 184 +--------
>  sysdeps/x86_64/wmemchr.S                      |  28 ++
>  21 files changed, 740 insertions(+), 612 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/rtld-memchr.S
>  create mode 100644 sysdeps/x86_64/multiarch/rtld-rawmemchr.S
>  create mode 100644 sysdeps/x86_64/wmemchr.S
>
> diff --git a/sysdeps/x86_64/isa-default-impl.h b/sysdeps/x86_64/isa-default-impl.h
> index 34634668e5..b374a38b8b 100644
> --- a/sysdeps/x86_64/isa-default-impl.h
> +++ b/sysdeps/x86_64/isa-default-impl.h
> @@ -46,4 +46,12 @@
> # error "Unsupported ISA Level!"
> #endif > > +#if IS_IN(rtld) && !defined USE_MULTIARCH > +# error "RTLD version should only exist in multiarch build" > +#endif > + > +#if defined USE_MULTIARCH && !IS_IN(rtld) > +# error "Multiarch build should not use ISA_DEFAULT_IMPL without RTLD" > +#endif Please do #if IS_IN (rtld) #else #endif > #include ISA_DEFAULT_IMPL > diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S > index a160fd9b00..20b43508c4 100644 > --- a/sysdeps/x86_64/memchr.S > +++ b/sysdeps/x86_64/memchr.S > @@ -15,358 +15,13 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#include <sysdep.h> > +#define MEMCHR __memchr > > -#ifdef USE_AS_WMEMCHR > -# define MEMCHR wmemchr > -# define PCMPEQ pcmpeqd > -# define CHAR_PER_VEC 4 > -#else > -# define MEMCHR memchr > -# define PCMPEQ pcmpeqb > -# define CHAR_PER_VEC 16 > -#endif > +#define DEFAULT_IMPL_V1 "multiarch/memchr-sse2.S" > +#define DEFAULT_IMPL_V3 "multiarch/memchr-avx2.S" > +#define DEFAULT_IMPL_V4 "multiarch/memchr-evex.S" > > -/* fast SSE2 version with using pmaxub and 64 byte loop */ > +#include "isa-default-impl.h" > > - .text > -ENTRY(MEMCHR) > - movd %esi, %xmm1 > - mov %edi, %ecx > - > -#ifdef __ILP32__ > - /* Clear the upper 32 bits. */ > - movl %edx, %edx > -#endif > -#ifdef USE_AS_WMEMCHR > - test %RDX_LP, %RDX_LP > - jz L(return_null) > -#else > - punpcklbw %xmm1, %xmm1 > - test %RDX_LP, %RDX_LP > - jz L(return_null) > - punpcklbw %xmm1, %xmm1 > -#endif > - > - and $63, %ecx > - pshufd $0, %xmm1, %xmm1 > - > - cmp $48, %ecx > - ja L(crosscache) > - > - movdqu (%rdi), %xmm0 > - PCMPEQ %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - > - jnz L(matches_1) > - sub $CHAR_PER_VEC, %rdx > - jbe L(return_null) > - add $16, %rdi > - and $15, %ecx > - and $-16, %rdi > -#ifdef USE_AS_WMEMCHR > - shr $2, %ecx > -#endif > - add %rcx, %rdx > - sub $(CHAR_PER_VEC * 4), %rdx > - jbe L(exit_loop) > - jmp L(loop_prolog) > - > - .p2align 4 > -L(crosscache): > - and $15, %ecx > - and $-16, %rdi > - movdqa (%rdi), %xmm0 > - > - PCMPEQ %xmm1, %xmm0 > - /* Check if there is a match. */ > - pmovmskb %xmm0, %eax > - /* Remove the leading bytes. */ > - sar %cl, %eax > - test %eax, %eax > - je L(unaligned_no_match) > - /* Check which byte is a match. */ > - bsf %eax, %eax > -#ifdef USE_AS_WMEMCHR > - mov %eax, %esi > - shr $2, %esi > - sub %rsi, %rdx > -#else > - sub %rax, %rdx > -#endif > - jbe L(return_null) > - add %rdi, %rax > - add %rcx, %rax > - ret > - > - .p2align 4 > -L(unaligned_no_match): > - /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using > - "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void > - possible addition overflow. 
*/ > - neg %rcx > - add $16, %rcx > -#ifdef USE_AS_WMEMCHR > - shr $2, %ecx > -#endif > - sub %rcx, %rdx > - jbe L(return_null) > - add $16, %rdi > - sub $(CHAR_PER_VEC * 4), %rdx > - jbe L(exit_loop) > - > - .p2align 4 > -L(loop_prolog): > - movdqa (%rdi), %xmm0 > - PCMPEQ %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - movdqa 16(%rdi), %xmm2 > - PCMPEQ %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - PCMPEQ %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 48(%rdi), %xmm4 > - PCMPEQ %xmm1, %xmm4 > - add $64, %rdi > - pmovmskb %xmm4, %eax > - test %eax, %eax > - jnz L(matches0) > - > - test $0x3f, %rdi > - jz L(align64_loop) > - > - sub $(CHAR_PER_VEC * 4), %rdx > - jbe L(exit_loop) > - > - movdqa (%rdi), %xmm0 > - PCMPEQ %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - movdqa 16(%rdi), %xmm2 > - PCMPEQ %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - PCMPEQ %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 48(%rdi), %xmm3 > - PCMPEQ %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - > - add $64, %rdi > - test %eax, %eax > - jnz L(matches0) > - > - mov %rdi, %rcx > - and $-64, %rdi > - and $63, %ecx > -#ifdef USE_AS_WMEMCHR > - shr $2, %ecx > -#endif > - add %rcx, %rdx > - > - .p2align 4 > -L(align64_loop): > - sub $(CHAR_PER_VEC * 4), %rdx > - jbe L(exit_loop) > - movdqa (%rdi), %xmm0 > - movdqa 16(%rdi), %xmm2 > - movdqa 32(%rdi), %xmm3 > - movdqa 48(%rdi), %xmm4 > - > - PCMPEQ %xmm1, %xmm0 > - PCMPEQ %xmm1, %xmm2 > - PCMPEQ %xmm1, %xmm3 > - PCMPEQ %xmm1, %xmm4 > - > - pmaxub %xmm0, %xmm3 > - pmaxub %xmm2, %xmm4 > - pmaxub %xmm3, %xmm4 > - pmovmskb %xmm4, %eax > - > - add $64, %rdi > - > - test %eax, %eax > - jz L(align64_loop) > - > - sub $64, %rdi > - > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - PCMPEQ %xmm1, %xmm3 > - > - PCMPEQ 48(%rdi), %xmm1 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - pmovmskb %xmm1, %eax > - bsf %eax, %eax > - lea 48(%rdi, %rax), %rax > - ret > - > - .p2align 4 > -L(exit_loop): > - add $(CHAR_PER_VEC * 2), %edx > - jle L(exit_loop_32) > - > - movdqa (%rdi), %xmm0 > - PCMPEQ %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - movdqa 16(%rdi), %xmm2 > - PCMPEQ %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - PCMPEQ %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32_1) > - sub $CHAR_PER_VEC, %edx > - jle L(return_null) > - > - PCMPEQ 48(%rdi), %xmm1 > - pmovmskb %xmm1, %eax > - test %eax, %eax > - jnz L(matches48_1) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(exit_loop_32): > - add $(CHAR_PER_VEC * 2), %edx > - movdqa (%rdi), %xmm0 > - PCMPEQ %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches_1) > - sub $CHAR_PER_VEC, %edx > - jbe L(return_null) > - > - PCMPEQ 16(%rdi), %xmm1 > - pmovmskb %xmm1, %eax > - test %eax, %eax > - jnz L(matches16_1) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(matches0): > - bsf %eax, %eax > - lea -16(%rax, %rdi), %rax > - ret > - > - .p2align 4 > -L(matches): > - bsf %eax, %eax > - add %rdi, %rax > - ret > - > - .p2align 4 > -L(matches16): > - bsf 
%eax, %eax > - lea 16(%rax, %rdi), %rax > - ret > - > - .p2align 4 > -L(matches32): > - bsf %eax, %eax > - lea 32(%rax, %rdi), %rax > - ret > - > - .p2align 4 > -L(matches_1): > - bsf %eax, %eax > -#ifdef USE_AS_WMEMCHR > - mov %eax, %esi > - shr $2, %esi > - sub %rsi, %rdx > -#else > - sub %rax, %rdx > -#endif > - jbe L(return_null) > - add %rdi, %rax > - ret > - > - .p2align 4 > -L(matches16_1): > - bsf %eax, %eax > -#ifdef USE_AS_WMEMCHR > - mov %eax, %esi > - shr $2, %esi > - sub %rsi, %rdx > -#else > - sub %rax, %rdx > -#endif > - jbe L(return_null) > - lea 16(%rdi, %rax), %rax > - ret > - > - .p2align 4 > -L(matches32_1): > - bsf %eax, %eax > -#ifdef USE_AS_WMEMCHR > - mov %eax, %esi > - shr $2, %esi > - sub %rsi, %rdx > -#else > - sub %rax, %rdx > -#endif > - jbe L(return_null) > - lea 32(%rdi, %rax), %rax > - ret > - > - .p2align 4 > -L(matches48_1): > - bsf %eax, %eax > -#ifdef USE_AS_WMEMCHR > - mov %eax, %esi > - shr $2, %esi > - sub %rsi, %rdx > -#else > - sub %rax, %rdx > -#endif > - jbe L(return_null) > - lea 48(%rdi, %rax), %rax > - ret > - > - .p2align 4 > -L(return_null): > - xor %eax, %eax > - ret > -END(MEMCHR) > - > -#ifndef USE_AS_WMEMCHR > -strong_alias (memchr, __memchr) > +weak_alias (__memchr, memchr) > libc_hidden_builtin_def(memchr) > -#endif > diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h > index b8f7a12ea2..856c6261f8 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-evex.h > +++ b/sysdeps/x86_64/multiarch/ifunc-evex.h > @@ -19,24 +19,28 @@ > > #include <init-arch.h> > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden; > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > + > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > > static inline void * > IFUNC_SELECTOR (void) > { > - const struct cpu_features* cpu_features = __get_cpu_features (); > - > - if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) > - && CPU_FEATURE_USABLE_P (cpu_features, BMI2) > - && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) > + const struct cpu_features *cpu_features = __get_cpu_features (); > + > + /* NB: The X86_ISA_* feature check macros are evaluated at > + compile time. */ > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2) > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2) > + && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > + AVX_Fast_Unaligned_Load)) > { > - if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > - && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) > { > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > return OPTIMIZE (evex_rtm); > @@ -47,9 +51,12 @@ IFUNC_SELECTOR (void) > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > return OPTIMIZE (avx2_rtm); > > - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) > + if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > + Prefer_No_VZEROUPPER)) > return OPTIMIZE (avx2); > } > > + /* This is unreachable (compile time checked) if ISA level >= 3 > + so no need for a robust fallback here. 
*/ > return OPTIMIZE (sse2); > } > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 883362f63d..bf52cf96d0 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -25,7 +25,8 @@ > > /* Fill ARRAY of MAX elements with IFUNC implementations for function > NAME supported on target machine and return the number of valid > - entries. */ > + entries. Each set of implementations for a given function is sorted in > + descending order by ISA level. */ > > size_t > __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > @@ -53,24 +54,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/memchr.c. */ > IFUNC_IMPL (i, name, memchr, > - IFUNC_IMPL_ADD (array, i, memchr, > - CPU_FEATURE_USABLE (AVX2), > - __memchr_avx2) > - IFUNC_IMPL_ADD (array, i, memchr, > - (CPU_FEATURE_USABLE (AVX2) > - && CPU_FEATURE_USABLE (RTM)), > - __memchr_avx2_rtm) > - IFUNC_IMPL_ADD (array, i, memchr, > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __memchr_evex) > - IFUNC_IMPL_ADD (array, i, memchr, > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __memchr_evex_rtm) > - IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) > + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, > + CPU_FEATURE_USABLE (AVX2), > + __memchr_avx2) > + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (RTM)), > + __memchr_avx2_rtm) > + /* Can be lowered to V1 if a V2 implementation is added. */ > + X86_IFUNC_IMPL_ADD_V2 (array, i, memchr, > + 1, > + __memchr_sse2)) > > /* Support sysdeps/x86_64/multiarch/memcmp.c. */ > IFUNC_IMPL (i, name, memcmp, > @@ -288,24 +292,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ > IFUNC_IMPL (i, name, rawmemchr, > - IFUNC_IMPL_ADD (array, i, rawmemchr, > - CPU_FEATURE_USABLE (AVX2), > - __rawmemchr_avx2) > - IFUNC_IMPL_ADD (array, i, rawmemchr, > - (CPU_FEATURE_USABLE (AVX2) > - && CPU_FEATURE_USABLE (RTM)), > - __rawmemchr_avx2_rtm) > - IFUNC_IMPL_ADD (array, i, rawmemchr, > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __rawmemchr_evex) > - IFUNC_IMPL_ADD (array, i, rawmemchr, > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __rawmemchr_evex_rtm) > - IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) > + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, > + CPU_FEATURE_USABLE (AVX2), > + __rawmemchr_avx2) > + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (RTM)), > + __rawmemchr_avx2_rtm) > + /* Can be lowered to V1 if a V2 implementation is added. */ > + X86_IFUNC_IMPL_ADD_V2 (array, i, rawmemchr, > + 1, > + __rawmemchr_sse2)) > > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > IFUNC_IMPL (i, name, strlen, > @@ -748,24 +755,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/wmemchr.c. 
*/ > IFUNC_IMPL (i, name, wmemchr, > - IFUNC_IMPL_ADD (array, i, wmemchr, > - CPU_FEATURE_USABLE (AVX2), > - __wmemchr_avx2) > - IFUNC_IMPL_ADD (array, i, wmemchr, > - (CPU_FEATURE_USABLE (AVX2) > - && CPU_FEATURE_USABLE (RTM)), > - __wmemchr_avx2_rtm) > - IFUNC_IMPL_ADD (array, i, wmemchr, > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __wmemchr_evex) > - IFUNC_IMPL_ADD (array, i, wmemchr, > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __wmemchr_evex_rtm) > - IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) > + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, > + CPU_FEATURE_USABLE (AVX2), > + __wmemchr_avx2) > + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (RTM)), > + __wmemchr_avx2_rtm) > + /* Can be lowered to V1 if a V2 implementation is added. */ > + X86_IFUNC_IMPL_ADD_V2 (array, i, wmemchr, > + 1, > + __wmemchr_sse2)) > > /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ > IFUNC_IMPL (i, name, wmemcmp, > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S > index c5a256eb37..39be5f7083 100644 > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S > @@ -16,9 +16,10 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#if IS_IN (libc) > +#include <isa-level.h> > +#include <sysdep.h> > > -# include <sysdep.h> > +#if ISA_SHOULD_BUILD (3) > > # ifndef MEMCHR > # define MEMCHR __memchr_avx2 > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S > index 0fd11b7632..0dd4f1dcce 100644 > --- a/sysdeps/x86_64/multiarch/memchr-evex.S > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S > @@ -16,9 +16,10 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#if IS_IN (libc) > +#include <isa-level.h> > +#include <sysdep.h> > > -# include <sysdep.h> > +#if ISA_SHOULD_BUILD (4) > > # ifndef MEMCHR > # define MEMCHR __memchr_evex > diff --git a/sysdeps/x86_64/multiarch/memchr-sse2.S b/sysdeps/x86_64/multiarch/memchr-sse2.S > index 2c6fdd41d6..8c561cd687 100644 > --- a/sysdeps/x86_64/multiarch/memchr-sse2.S > +++ b/sysdeps/x86_64/multiarch/memchr-sse2.S > @@ -16,13 +16,360 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#if IS_IN (libc) > -# define memchr __memchr_sse2 > +#include <isa-level.h> > +#include <sysdep.h> > > -# undef strong_alias > -# define strong_alias(memchr, __memchr) > -# undef libc_hidden_builtin_def > -# define libc_hidden_builtin_def(memchr) > -#endif > +/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation > + so we need this to build for ISA V2 builds. */ > +#if ISA_SHOULD_BUILD (2) > + > +# ifndef MEMCHR > +# define MEMCHR __memchr_sse2 > +# endif > +# ifdef USE_AS_WMEMCHR > +# define PCMPEQ pcmpeqd > +# define CHAR_PER_VEC 4 > +# else > +# define PCMPEQ pcmpeqb > +# define CHAR_PER_VEC 16 > +# endif > + > +/* fast SSE2 version with using pmaxub and 64 byte loop */ > + > + .text > +ENTRY(MEMCHR) > + movd %esi, %xmm1 > + mov %edi, %ecx > + > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. 
*/ > + movl %edx, %edx > +# endif > +# ifdef USE_AS_WMEMCHR > + test %RDX_LP, %RDX_LP > + jz L(return_null) > +# else > + punpcklbw %xmm1, %xmm1 > + test %RDX_LP, %RDX_LP > + jz L(return_null) > + punpcklbw %xmm1, %xmm1 > +# endif > + > + and $63, %ecx > + pshufd $0, %xmm1, %xmm1 > + > + cmp $48, %ecx > + ja L(crosscache) > + > + movdqu (%rdi), %xmm0 > + PCMPEQ %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + > + jnz L(matches_1) > + sub $CHAR_PER_VEC, %rdx > + jbe L(return_null) > + add $16, %rdi > + and $15, %ecx > + and $-16, %rdi > +# ifdef USE_AS_WMEMCHR > + shr $2, %ecx > +# endif > + add %rcx, %rdx > + sub $(CHAR_PER_VEC * 4), %rdx > + jbe L(exit_loop) > + jmp L(loop_prolog) > + > + .p2align 4 > +L(crosscache): > + and $15, %ecx > + and $-16, %rdi > + movdqa (%rdi), %xmm0 > + > + PCMPEQ %xmm1, %xmm0 > + /* Check if there is a match. */ > + pmovmskb %xmm0, %eax > + /* Remove the leading bytes. */ > + sar %cl, %eax > + test %eax, %eax > + je L(unaligned_no_match) > + /* Check which byte is a match. */ > + bsf %eax, %eax > +# ifdef USE_AS_WMEMCHR > + mov %eax, %esi > + shr $2, %esi > + sub %rsi, %rdx > +# else > + sub %rax, %rdx > +# endif > + jbe L(return_null) > + add %rdi, %rax > + add %rcx, %rax > + ret > + > + .p2align 4 > +L(unaligned_no_match): > + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using > + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void > + possible addition overflow. */ > + neg %rcx > + add $16, %rcx > +# ifdef USE_AS_WMEMCHR > + shr $2, %ecx > +# endif > + sub %rcx, %rdx > + jbe L(return_null) > + add $16, %rdi > + sub $(CHAR_PER_VEC * 4), %rdx > + jbe L(exit_loop) > + > + .p2align 4 > +L(loop_prolog): > + movdqa (%rdi), %xmm0 > + PCMPEQ %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + movdqa 16(%rdi), %xmm2 > + PCMPEQ %xmm1, %xmm2 > + pmovmskb %xmm2, %eax > + test %eax, %eax > + jnz L(matches16) > + > + movdqa 32(%rdi), %xmm3 > + PCMPEQ %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32) > + > + movdqa 48(%rdi), %xmm4 > + PCMPEQ %xmm1, %xmm4 > + add $64, %rdi > + pmovmskb %xmm4, %eax > + test %eax, %eax > + jnz L(matches0) > + > + test $0x3f, %rdi > + jz L(align64_loop) > + > + sub $(CHAR_PER_VEC * 4), %rdx > + jbe L(exit_loop) > + > + movdqa (%rdi), %xmm0 > + PCMPEQ %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + movdqa 16(%rdi), %xmm2 > + PCMPEQ %xmm1, %xmm2 > + pmovmskb %xmm2, %eax > + test %eax, %eax > + jnz L(matches16) > + > + movdqa 32(%rdi), %xmm3 > + PCMPEQ %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32) > + > + movdqa 48(%rdi), %xmm3 > + PCMPEQ %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + > + add $64, %rdi > + test %eax, %eax > + jnz L(matches0) > + > + mov %rdi, %rcx > + and $-64, %rdi > + and $63, %ecx > +# ifdef USE_AS_WMEMCHR > + shr $2, %ecx > +# endif > + add %rcx, %rdx > + > + .p2align 4 > +L(align64_loop): > + sub $(CHAR_PER_VEC * 4), %rdx > + jbe L(exit_loop) > + movdqa (%rdi), %xmm0 > + movdqa 16(%rdi), %xmm2 > + movdqa 32(%rdi), %xmm3 > + movdqa 48(%rdi), %xmm4 > + > + PCMPEQ %xmm1, %xmm0 > + PCMPEQ %xmm1, %xmm2 > + PCMPEQ %xmm1, %xmm3 > + PCMPEQ %xmm1, %xmm4 > > -#include "../memchr.S" > + pmaxub %xmm0, %xmm3 > + pmaxub %xmm2, %xmm4 > + pmaxub %xmm3, %xmm4 > + pmovmskb %xmm4, %eax > + > + add $64, %rdi > + > + test %eax, %eax > + jz L(align64_loop) > + > + sub $64, %rdi > + > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + pmovmskb %xmm2, %eax > + test %eax, 
%eax > + jnz L(matches16) > + > + movdqa 32(%rdi), %xmm3 > + PCMPEQ %xmm1, %xmm3 > + > + PCMPEQ 48(%rdi), %xmm1 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32) > + > + pmovmskb %xmm1, %eax > + bsf %eax, %eax > + lea 48(%rdi, %rax), %rax > + ret > + > + .p2align 4 > +L(exit_loop): > + add $(CHAR_PER_VEC * 2), %edx > + jle L(exit_loop_32) > + > + movdqa (%rdi), %xmm0 > + PCMPEQ %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + movdqa 16(%rdi), %xmm2 > + PCMPEQ %xmm1, %xmm2 > + pmovmskb %xmm2, %eax > + test %eax, %eax > + jnz L(matches16) > + > + movdqa 32(%rdi), %xmm3 > + PCMPEQ %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32_1) > + sub $CHAR_PER_VEC, %edx > + jle L(return_null) > + > + PCMPEQ 48(%rdi), %xmm1 > + pmovmskb %xmm1, %eax > + test %eax, %eax > + jnz L(matches48_1) > + xor %eax, %eax > + ret > + > + .p2align 4 > +L(exit_loop_32): > + add $(CHAR_PER_VEC * 2), %edx > + movdqa (%rdi), %xmm0 > + PCMPEQ %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches_1) > + sub $CHAR_PER_VEC, %edx > + jbe L(return_null) > + > + PCMPEQ 16(%rdi), %xmm1 > + pmovmskb %xmm1, %eax > + test %eax, %eax > + jnz L(matches16_1) > + xor %eax, %eax > + ret > + > + .p2align 4 > +L(matches0): > + bsf %eax, %eax > + lea -16(%rax, %rdi), %rax > + ret > + > + .p2align 4 > +L(matches): > + bsf %eax, %eax > + add %rdi, %rax > + ret > + > + .p2align 4 > +L(matches16): > + bsf %eax, %eax > + lea 16(%rax, %rdi), %rax > + ret > + > + .p2align 4 > +L(matches32): > + bsf %eax, %eax > + lea 32(%rax, %rdi), %rax > + ret > + > + .p2align 4 > +L(matches_1): > + bsf %eax, %eax > +# ifdef USE_AS_WMEMCHR > + mov %eax, %esi > + shr $2, %esi > + sub %rsi, %rdx > +# else > + sub %rax, %rdx > +# endif > + jbe L(return_null) > + add %rdi, %rax > + ret > + > + .p2align 4 > +L(matches16_1): > + bsf %eax, %eax > +# ifdef USE_AS_WMEMCHR > + mov %eax, %esi > + shr $2, %esi > + sub %rsi, %rdx > +# else > + sub %rax, %rdx > +# endif > + jbe L(return_null) > + lea 16(%rdi, %rax), %rax > + ret > + > + .p2align 4 > +L(matches32_1): > + bsf %eax, %eax > +# ifdef USE_AS_WMEMCHR > + mov %eax, %esi > + shr $2, %esi > + sub %rsi, %rdx > +# else > + sub %rax, %rdx > +# endif > + jbe L(return_null) > + lea 32(%rdi, %rax), %rax > + ret > + > + .p2align 4 > +L(matches48_1): > + bsf %eax, %eax > +# ifdef USE_AS_WMEMCHR > + mov %eax, %esi > + shr $2, %esi > + sub %rsi, %rdx > +# else > + sub %rax, %rdx > +# endif > + jbe L(return_null) > + lea 48(%rdi, %rax), %rax > + ret > + > + .p2align 4 > +L(return_null): > + xor %eax, %eax > + ret > +END(MEMCHR) > +#endif > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > index acc5f6e2fb..5c1dcd3ca7 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > @@ -1,4 +1,7 @@ > -#define MEMCHR __rawmemchr_avx2_rtm > -#define USE_AS_RAWMEMCHR 1 > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_avx2_rtm > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > > #include "memchr-avx2-rtm.S" Will we ever use the RTM version as the default? 
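For readers following along: the `ISA_SHOULD_BUILD (2)` guard introduced above decides whether an object file gets a body at all for a given baseline. Below is a rough sketch of the intended semantics, assuming `MINIMUM_X86_ISA_LEVEL` is the configured baseline; this is an illustration only, not the actual macro from `<isa-level.h>`, whose definition is not part of this patch.

```c
/* Illustrative approximation only -- the real ISA_SHOULD_BUILD in
   sysdeps/x86/isa-level.h also has to cover the non-multiarch
   default-impl case.  An implementation whose highest required ISA
   level is IMPL_LEVEL is only worth building while the configured
   baseline does not already guarantee a better replacement.  */
#define ILLUSTRATIVE_ISA_SHOULD_BUILD(impl_level) \
  (MINIMUM_X86_ISA_LEVEL <= (impl_level))

/* For an x86-64-v3 build (MINIMUM_X86_ISA_LEVEL == 3):
     memchr-sse2.S guards with (2): 3 <= 2 is false -> skipped
     memchr-avx2.S guards with (3): 3 <= 3 is true  -> built
     memchr-evex.S guards with (4): 3 <= 4 is true  -> built  */
```

This matches the "guaranteed replacement" rule in the commit message: sse2 survives the V2 build only because no V2 implementation exists yet.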
> diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > index 128f9ea637..d6bff28757 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > @@ -1,4 +1,7 @@ > -#define MEMCHR __rawmemchr_avx2 > -#define USE_AS_RAWMEMCHR 1 > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_avx2 > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > > #include "memchr-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > index deda1ca395..8ff7f27c9c 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > @@ -1,3 +1,7 @@ > -#define MEMCHR __rawmemchr_evex_rtm > -#define USE_AS_RAWMEMCHR 1 > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex_rtm > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > + Will we ever use the RTM version as the default? > #include "memchr-evex-rtm.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > index ec942b77ba..dc1c450699 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > @@ -1,4 +1,7 @@ > -#define MEMCHR __rawmemchr_evex > -#define USE_AS_RAWMEMCHR 1 > +#ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex > +#endif > +#define USE_AS_RAWMEMCHR 1 > +#define MEMCHR RAWMEMCHR > > #include "memchr-evex.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > index 3841c14c34..e2c2e20d85 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > @@ -16,14 +16,192 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -/* Define multiple versions only for the definition in libc. */ > -#if IS_IN (libc) > -# define __rawmemchr __rawmemchr_sse2 > - > -# undef weak_alias > -# define weak_alias(__rawmemchr, rawmemchr) > -# undef libc_hidden_def > -# define libc_hidden_def(__rawmemchr) > -#endif > +#include <isa-level.h> > +#include <sysdep.h> > + > +/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation > + so we need this to build for ISA V2 builds. */ > +#if ISA_SHOULD_BUILD (2) > + > +# ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_sse2 > +# endif > + > + .text > +ENTRY (RAWMEMCHR) > + movd %rsi, %xmm1 > + mov %rdi, %rcx > + > + punpcklbw %xmm1, %xmm1 > + punpcklbw %xmm1, %xmm1 > + > + and $63, %rcx > + pshufd $0, %xmm1, %xmm1 > + > + cmp $48, %rcx > + ja L(crosscache) > + > + movdqu (%rdi), %xmm0 > + pcmpeqb %xmm1, %xmm0 > +/* Check if there is a match. */ > + pmovmskb %xmm0, %eax > + test %eax, %eax > + > + jnz L(matches) > + add $16, %rdi > + and $-16, %rdi > + jmp L(loop_prolog) > + > + .p2align 4 > +L(crosscache): > + and $15, %rcx > + and $-16, %rdi > + movdqa (%rdi), %xmm0 > + > + pcmpeqb %xmm1, %xmm0 > +/* Check if there is a match. */ > + pmovmskb %xmm0, %eax > +/* Remove the leading bytes. */ > + sar %cl, %eax > + test %eax, %eax > + je L(unaligned_no_match) > +/* Check which byte is a match. 
*/ > + bsf %eax, %eax > + > + add %rdi, %rax > + add %rcx, %rax > + ret > + > + .p2align 4 > +L(unaligned_no_match): > + add $16, %rdi > + > + .p2align 4 > +L(loop_prolog): > + movdqa (%rdi), %xmm0 > + pcmpeqb %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + movdqa 16(%rdi), %xmm2 > + pcmpeqb %xmm1, %xmm2 > + pmovmskb %xmm2, %eax > + test %eax, %eax > + jnz L(matches16) > + > + movdqa 32(%rdi), %xmm3 > + pcmpeqb %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32) > + > + movdqa 48(%rdi), %xmm4 > + pcmpeqb %xmm1, %xmm4 > + add $64, %rdi > + pmovmskb %xmm4, %eax > + test %eax, %eax > + jnz L(matches0) > + > + test $0x3f, %rdi > + jz L(align64_loop) > + > + movdqa (%rdi), %xmm0 > + pcmpeqb %xmm1, %xmm0 > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + movdqa 16(%rdi), %xmm2 > + pcmpeqb %xmm1, %xmm2 > + pmovmskb %xmm2, %eax > + test %eax, %eax > + jnz L(matches16) > > -#include "../rawmemchr.S" > + movdqa 32(%rdi), %xmm3 > + pcmpeqb %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32) > + > + movdqa 48(%rdi), %xmm3 > + pcmpeqb %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + > + add $64, %rdi > + test %eax, %eax > + jnz L(matches0) > + > + and $-64, %rdi > + > + .p2align 4 > +L(align64_loop): > + movdqa (%rdi), %xmm0 > + movdqa 16(%rdi), %xmm2 > + movdqa 32(%rdi), %xmm3 > + movdqa 48(%rdi), %xmm4 > + > + pcmpeqb %xmm1, %xmm0 > + pcmpeqb %xmm1, %xmm2 > + pcmpeqb %xmm1, %xmm3 > + pcmpeqb %xmm1, %xmm4 > + > + pmaxub %xmm0, %xmm3 > + pmaxub %xmm2, %xmm4 > + pmaxub %xmm3, %xmm4 > + pmovmskb %xmm4, %eax > + > + add $64, %rdi > + > + test %eax, %eax > + jz L(align64_loop) > + > + sub $64, %rdi > + > + pmovmskb %xmm0, %eax > + test %eax, %eax > + jnz L(matches) > + > + pmovmskb %xmm2, %eax > + test %eax, %eax > + jnz L(matches16) > + > + movdqa 32(%rdi), %xmm3 > + pcmpeqb %xmm1, %xmm3 > + > + pcmpeqb 48(%rdi), %xmm1 > + pmovmskb %xmm3, %eax > + test %eax, %eax > + jnz L(matches32) > + > + pmovmskb %xmm1, %eax > + bsf %eax, %eax > + lea 48(%rdi, %rax), %rax > + ret > + > + .p2align 4 > +L(matches0): > + bsf %eax, %eax > + lea -16(%rax, %rdi), %rax > + ret > + > + .p2align 4 > +L(matches): > + bsf %eax, %eax > + add %rdi, %rax > + ret > + > + .p2align 4 > +L(matches16): > + bsf %eax, %eax > + lea 16(%rax, %rdi), %rax > + ret > + > + .p2align 4 > +L(matches32): > + bsf %eax, %eax > + lea 32(%rax, %rdi), %rax > + ret > + > +END (RAWMEMCHR) > +#endif > diff --git a/sysdeps/x86_64/multiarch/rtld-memchr.S b/sysdeps/x86_64/multiarch/rtld-memchr.S > new file mode 100644 > index 0000000000..a14b192bed > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/rtld-memchr.S > @@ -0,0 +1,18 @@ > +/* Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#include "../memchr.S" > diff --git a/sysdeps/x86_64/multiarch/rtld-rawmemchr.S b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S > new file mode 100644 > index 0000000000..5d4110a052 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S > @@ -0,0 +1,18 @@ > +/* Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include "../rawmemchr.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > index 58ed21db01..2a1cff5b05 100644 > --- a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > @@ -1,4 +1,7 @@ > -#define MEMCHR __wmemchr_avx2_rtm > -#define USE_AS_WMEMCHR 1 > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_avx2_rtm > +#endif > +#define USE_AS_WMEMCHR 1 > +#define MEMCHR WMEMCHR > > #include "memchr-avx2-rtm.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S > index 282854f1a1..2bf93fd84b 100644 > --- a/sysdeps/x86_64/multiarch/wmemchr-avx2.S > +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S > @@ -1,4 +1,7 @@ > -#define MEMCHR __wmemchr_avx2 > -#define USE_AS_WMEMCHR 1 > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_avx2 > +#endif > +#define USE_AS_WMEMCHR 1 > +#define MEMCHR WMEMCHR > > #include "memchr-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > index a346cd35a1..c67309e8a1 100644 > --- a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > @@ -1,3 +1,7 @@ > -#define MEMCHR __wmemchr_evex_rtm > -#define USE_AS_WMEMCHR 1 > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_evex_rtm > +#endif > +#define USE_AS_WMEMCHR 1 > +#define MEMCHR WMEMCHR > + > #include "memchr-evex-rtm.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S > index 06cd0f9f5a..5512d5cdc3 100644 > --- a/sysdeps/x86_64/multiarch/wmemchr-evex.S > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S > @@ -1,4 +1,7 @@ > -#define MEMCHR __wmemchr_evex > -#define USE_AS_WMEMCHR 1 > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_evex > +#endif > +#define USE_AS_WMEMCHR 1 > +#define MEMCHR WMEMCHR > > #include "memchr-evex.S" > diff --git a/sysdeps/x86_64/multiarch/wmemchr-sse2.S b/sysdeps/x86_64/multiarch/wmemchr-sse2.S > index 70a965d552..b675a070d4 100644 > --- a/sysdeps/x86_64/multiarch/wmemchr-sse2.S > +++ b/sysdeps/x86_64/multiarch/wmemchr-sse2.S > @@ -1,4 +1,7 @@ > -#define USE_AS_WMEMCHR 1 > -#define wmemchr __wmemchr_sse2 > +#ifndef WMEMCHR > +# define WMEMCHR __wmemchr_sse2 > +#endif > +#define USE_AS_WMEMCHR 1 > +#define MEMCHR WMEMCHR > > -#include "../memchr.S" > +#include "memchr-sse2.S" > diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S 
> index 4c1a3383b9..ba7e5202e6 100644 > --- a/sysdeps/x86_64/rawmemchr.S > +++ b/sysdeps/x86_64/rawmemchr.S > @@ -17,185 +17,13 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#include <sysdep.h> > +#define RAWMEMCHR __rawmemchr > > - .text > -ENTRY (__rawmemchr) > - movd %rsi, %xmm1 > - mov %rdi, %rcx > +#define DEFAULT_IMPL_V1 "multiarch/rawmemchr-sse2.S" > +#define DEFAULT_IMPL_V3 "multiarch/rawmemchr-avx2.S" > +#define DEFAULT_IMPL_V4 "multiarch/rawmemchr-evex.S" > > - punpcklbw %xmm1, %xmm1 > - punpcklbw %xmm1, %xmm1 > - > - and $63, %rcx > - pshufd $0, %xmm1, %xmm1 > - > - cmp $48, %rcx > - ja L(crosscache) > - > - movdqu (%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > -/* Check if there is a match. */ > - pmovmskb %xmm0, %eax > - test %eax, %eax > - > - jnz L(matches) > - add $16, %rdi > - and $-16, %rdi > - jmp L(loop_prolog) > - > - .p2align 4 > -L(crosscache): > - and $15, %rcx > - and $-16, %rdi > - movdqa (%rdi), %xmm0 > - > - pcmpeqb %xmm1, %xmm0 > -/* Check if there is a match. */ > - pmovmskb %xmm0, %eax > -/* Remove the leading bytes. */ > - sar %cl, %eax > - test %eax, %eax > - je L(unaligned_no_match) > -/* Check which byte is a match. */ > - bsf %eax, %eax > - > - add %rdi, %rax > - add %rcx, %rax > - ret > - > - .p2align 4 > -L(unaligned_no_match): > - add $16, %rdi > - > - .p2align 4 > -L(loop_prolog): > - movdqa (%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - movdqa 16(%rdi), %xmm2 > - pcmpeqb %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 48(%rdi), %xmm4 > - pcmpeqb %xmm1, %xmm4 > - add $64, %rdi > - pmovmskb %xmm4, %eax > - test %eax, %eax > - jnz L(matches0) > - > - test $0x3f, %rdi > - jz L(align64_loop) > - > - movdqa (%rdi), %xmm0 > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - movdqa 16(%rdi), %xmm2 > - pcmpeqb %xmm1, %xmm2 > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - movdqa 48(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - pmovmskb %xmm3, %eax > - > - add $64, %rdi > - test %eax, %eax > - jnz L(matches0) > - > - and $-64, %rdi > - > - .p2align 4 > -L(align64_loop): > - movdqa (%rdi), %xmm0 > - movdqa 16(%rdi), %xmm2 > - movdqa 32(%rdi), %xmm3 > - movdqa 48(%rdi), %xmm4 > - > - pcmpeqb %xmm1, %xmm0 > - pcmpeqb %xmm1, %xmm2 > - pcmpeqb %xmm1, %xmm3 > - pcmpeqb %xmm1, %xmm4 > - > - pmaxub %xmm0, %xmm3 > - pmaxub %xmm2, %xmm4 > - pmaxub %xmm3, %xmm4 > - pmovmskb %xmm4, %eax > - > - add $64, %rdi > - > - test %eax, %eax > - jz L(align64_loop) > - > - sub $64, %rdi > - > - pmovmskb %xmm0, %eax > - test %eax, %eax > - jnz L(matches) > - > - pmovmskb %xmm2, %eax > - test %eax, %eax > - jnz L(matches16) > - > - movdqa 32(%rdi), %xmm3 > - pcmpeqb %xmm1, %xmm3 > - > - pcmpeqb 48(%rdi), %xmm1 > - pmovmskb %xmm3, %eax > - test %eax, %eax > - jnz L(matches32) > - > - pmovmskb %xmm1, %eax > - bsf %eax, %eax > - lea 48(%rdi, %rax), %rax > - ret > - > - .p2align 4 > -L(matches0): > - bsf %eax, %eax > - lea -16(%rax, %rdi), %rax > - ret > - > - .p2align 4 > -L(matches): > - bsf %eax, %eax > - add %rdi, %rax > - ret > - > - .p2align 4 > -L(matches16): > - bsf %eax, %eax > - lea 16(%rax, %rdi), %rax > - ret > - > - 
.p2align 4 > -L(matches32): > - bsf %eax, %eax > - lea 32(%rax, %rdi), %rax > - ret > - > -END (__rawmemchr) > +#include "isa-default-impl.h" > > weak_alias (__rawmemchr, rawmemchr) > -libc_hidden_builtin_def (__rawmemchr) > +libc_hidden_def (__rawmemchr) > diff --git a/sysdeps/x86_64/wmemchr.S b/sysdeps/x86_64/wmemchr.S > new file mode 100644 > index 0000000000..eef91e556b > --- /dev/null > +++ b/sysdeps/x86_64/wmemchr.S > @@ -0,0 +1,28 @@ > +/* Copyright (C) 2011-2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define WMEMCHR __wmemchr > + > +#define DEFAULT_IMPL_V1 "multiarch/wmemchr-sse2.S" > +#define DEFAULT_IMPL_V3 "multiarch/wmemchr-avx2.S" > +#define DEFAULT_IMPL_V4 "multiarch/wmemchr-evex.S" > + > +#include "isa-default-impl.h" > + > +libc_hidden_def (__wmemchr) > +weak_alias (__wmemchr, wmemchr) > +libc_hidden_weak (wmemchr) > -- > 2.34.1 >
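One sketch to close: the reworked selector in ifunc-evex.h relies on the X86_ISA_* wrappers folding to constants at compile time. The following is a deliberately simplified, hypothetical rendering (the real wrappers key each individual feature to the ISA level that implies it, rather than one blanket threshold); it only shows why the trailing `return OPTIMIZE (sse2)` becomes dead code at ISA level >= 3.

```c
/* Hypothetical simplification: at a baseline that already implies a
   feature (AVX2, BMI2, ... at x86-64-v3 and up), the runtime check
   folds to the constant 1.  The compiler then proves the outer `if'
   in IFUNC_SELECTOR is always taken and drops the SSE2 fallback.  */
#if MINIMUM_X86_ISA_LEVEL >= 3
# define X86_ISA_CPU_FEATURE_USABLE_P(cpu_features, name) 1
#else
# define X86_ISA_CPU_FEATURE_USABLE_P(cpu_features, name) \
   CPU_FEATURE_USABLE_P (cpu_features, name)
#endif
```

Under that fold the selector body reduces to the EVEX/AVX2 branches only, which is what lets the V3 and V4 builds drop __memchr_sse2 both from the ifunc list and from the build entirely.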
Calculate "rdx + rcx - 16" by using > > + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void > > + possible addition overflow. */ > > + neg %rcx > > + add $16, %rcx > > +# ifdef USE_AS_WMEMCHR > > + shr $2, %ecx > > +# endif > > + sub %rcx, %rdx > > + jbe L(return_null) > > + add $16, %rdi > > + sub $(CHAR_PER_VEC * 4), %rdx > > + jbe L(exit_loop) > > + > > + .p2align 4 > > +L(loop_prolog): > > + movdqa (%rdi), %xmm0 > > + PCMPEQ %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + PCMPEQ %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + movdqa 48(%rdi), %xmm4 > > + PCMPEQ %xmm1, %xmm4 > > + add $64, %rdi > > + pmovmskb %xmm4, %eax > > + test %eax, %eax > > + jnz L(matches0) > > + > > + test $0x3f, %rdi > > + jz L(align64_loop) > > + > > + sub $(CHAR_PER_VEC * 4), %rdx > > + jbe L(exit_loop) > > + > > + movdqa (%rdi), %xmm0 > > + PCMPEQ %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + PCMPEQ %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + movdqa 48(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + > > + add $64, %rdi > > + test %eax, %eax > > + jnz L(matches0) > > + > > + mov %rdi, %rcx > > + and $-64, %rdi > > + and $63, %ecx > > +# ifdef USE_AS_WMEMCHR > > + shr $2, %ecx > > +# endif > > + add %rcx, %rdx > > + > > + .p2align 4 > > +L(align64_loop): > > + sub $(CHAR_PER_VEC * 4), %rdx > > + jbe L(exit_loop) > > + movdqa (%rdi), %xmm0 > > + movdqa 16(%rdi), %xmm2 > > + movdqa 32(%rdi), %xmm3 > > + movdqa 48(%rdi), %xmm4 > > + > > + PCMPEQ %xmm1, %xmm0 > > + PCMPEQ %xmm1, %xmm2 > > + PCMPEQ %xmm1, %xmm3 > > + PCMPEQ %xmm1, %xmm4 > > > > -#include "../memchr.S" > > + pmaxub %xmm0, %xmm3 > > + pmaxub %xmm2, %xmm4 > > + pmaxub %xmm3, %xmm4 > > + pmovmskb %xmm4, %eax > > + > > + add $64, %rdi > > + > > + test %eax, %eax > > + jz L(align64_loop) > > + > > + sub $64, %rdi > > + > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + > > + PCMPEQ 48(%rdi), %xmm1 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + pmovmskb %xmm1, %eax > > + bsf %eax, %eax > > + lea 48(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(exit_loop): > > + add $(CHAR_PER_VEC * 2), %edx > > + jle L(exit_loop_32) > > + > > + movdqa (%rdi), %xmm0 > > + PCMPEQ %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + PCMPEQ %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32_1) > > + sub $CHAR_PER_VEC, %edx > > + jle L(return_null) > > + > > + PCMPEQ 48(%rdi), %xmm1 > > + pmovmskb %xmm1, %eax > > + test %eax, %eax > > + jnz L(matches48_1) > > + xor %eax, %eax > > + ret > > + > > + .p2align 4 > > +L(exit_loop_32): > > + add $(CHAR_PER_VEC * 2), %edx > > + movdqa (%rdi), %xmm0 > 
> + PCMPEQ 16(%rdi), %xmm1 > > + pmovmskb %xmm1, %eax > > + test %eax, %eax > > + jnz L(matches16_1) > > + xor %eax, %eax > > + ret > > + > > + .p2align 4 > > +L(matches0): > > + bsf %eax, %eax > > + lea -16(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches): > > + bsf %eax, %eax > > + add %rdi, %rax > > + ret > > + > > + .p2align 4 > > +L(matches16): > > + bsf %eax, %eax > > + lea 16(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches32): > > + bsf %eax, %eax > > + lea 32(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches_1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + add %rdi, %rax > > + ret > > + > > + .p2align 4 > > +L(matches16_1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + lea 16(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(matches32_1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + lea 32(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(matches48_1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + lea 48(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(return_null): > > + xor %eax, %eax > > + ret > > +END(MEMCHR) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > > index acc5f6e2fb..5c1dcd3ca7 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __rawmemchr_avx2_rtm > > -#define USE_AS_RAWMEMCHR 1 > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_avx2_rtm > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > > > #include "memchr-avx2-rtm.S" > > Will we ever use the RTM version as the default? We had talked about it and agreed not to. I think we can safely say we don't need it for the RTLD default because we know there are no transactions. As for the non-multiarch build selection, it's a bit more ambiguous.
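Either way the ifunc path stays safe: as the comment in the ifunc-evex.h hunk says, the X86_ISA_* checks are evaluated at compile time, so on an x86-64-v3 (or higher) build the OPTIMIZE (sse2) fallback is provably dead code. A minimal sketch of the assumed macro shape (the real definitions come from the 1/2 patch's isa-level.h; the FEATURE##_X86_ISA_LEVEL spelling is an assumption here):

/* Sketch, not the actual glibc header: if the compiled baseline
   already implies FEATURE, the check folds to the constant 1 and
   the compiler can drop the branch guarded by it.  */
#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, FEATURE)		\
  (((FEATURE ## _X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)	\
   || CPU_FEATURE_USABLE_P (ptr, FEATURE))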
> > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > > index 128f9ea637..d6bff28757 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __rawmemchr_avx2 > > -#define USE_AS_RAWMEMCHR 1 > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_avx2 > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > > > #include "memchr-avx2.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > > index deda1ca395..8ff7f27c9c 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > > @@ -1,3 +1,7 @@ > > -#define MEMCHR __rawmemchr_evex_rtm > > -#define USE_AS_RAWMEMCHR 1 > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_evex_rtm > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > + > > Will we ever use the RTM version as the default? > > > #include "memchr-evex-rtm.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > > index ec942b77ba..dc1c450699 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __rawmemchr_evex > > -#define USE_AS_RAWMEMCHR 1 > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_evex > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > > > #include "memchr-evex.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > > index 3841c14c34..e2c2e20d85 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > > @@ -16,14 +16,192 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -/* Define multiple versions only for the definition in libc. */ > > -#if IS_IN (libc) > > -# define __rawmemchr __rawmemchr_sse2 > > - > > -# undef weak_alias > > -# define weak_alias(__rawmemchr, rawmemchr) > > -# undef libc_hidden_def > > -# define libc_hidden_def(__rawmemchr) > > -#endif > > +#include <isa-level.h> > > +#include <sysdep.h> > > + > > +/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation > > + so we need this to build for ISA V2 builds. */ > > +#if ISA_SHOULD_BUILD (2) > > + > > +# ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_sse2 > > +# endif > > + > > + .text > > +ENTRY (RAWMEMCHR) > > + movd %rsi, %xmm1 > > + mov %rdi, %rcx > > + > > + punpcklbw %xmm1, %xmm1 > > + punpcklbw %xmm1, %xmm1 > > + > > + and $63, %rcx > > + pshufd $0, %xmm1, %xmm1 > > + > > + cmp $48, %rcx > > + ja L(crosscache) > > + > > + movdqu (%rdi), %xmm0 > > + pcmpeqb %xmm1, %xmm0 > > +/* Check if there is a match. */ > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + > > + jnz L(matches) > > + add $16, %rdi > > + and $-16, %rdi > > + jmp L(loop_prolog) > > + > > + .p2align 4 > > +L(crosscache): > > + and $15, %rcx > > + and $-16, %rdi > > + movdqa (%rdi), %xmm0 > > + > > + pcmpeqb %xmm1, %xmm0 > > +/* Check if there is a match. */ > > + pmovmskb %xmm0, %eax > > +/* Remove the leading bytes. */ > > + sar %cl, %eax > > + test %eax, %eax > > + je L(unaligned_no_match) > > +/* Check which byte is a match. 
*/ > > + bsf %eax, %eax > > + > > + add %rdi, %rax > > + add %rcx, %rax > > + ret > > + > > + .p2align 4 > > +L(unaligned_no_match): > > + add $16, %rdi > > + > > + .p2align 4 > > +L(loop_prolog): > > + movdqa (%rdi), %xmm0 > > + pcmpeqb %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + pcmpeqb %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + pcmpeqb %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + movdqa 48(%rdi), %xmm4 > > + pcmpeqb %xmm1, %xmm4 > > + add $64, %rdi > > + pmovmskb %xmm4, %eax > > + test %eax, %eax > > + jnz L(matches0) > > + > > + test $0x3f, %rdi > > + jz L(align64_loop) > > + > > + movdqa (%rdi), %xmm0 > > + pcmpeqb %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + pcmpeqb %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > > > -#include "../rawmemchr.S" > > + movdqa 32(%rdi), %xmm3 > > + pcmpeqb %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + movdqa 48(%rdi), %xmm3 > > + pcmpeqb %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + > > + add $64, %rdi > > + test %eax, %eax > > + jnz L(matches0) > > + > > + and $-64, %rdi > > + > > + .p2align 4 > > +L(align64_loop): > > + movdqa (%rdi), %xmm0 > > + movdqa 16(%rdi), %xmm2 > > + movdqa 32(%rdi), %xmm3 > > + movdqa 48(%rdi), %xmm4 > > + > > + pcmpeqb %xmm1, %xmm0 > > + pcmpeqb %xmm1, %xmm2 > > + pcmpeqb %xmm1, %xmm3 > > + pcmpeqb %xmm1, %xmm4 > > + > > + pmaxub %xmm0, %xmm3 > > + pmaxub %xmm2, %xmm4 > > + pmaxub %xmm3, %xmm4 > > + pmovmskb %xmm4, %eax > > + > > + add $64, %rdi > > + > > + test %eax, %eax > > + jz L(align64_loop) > > + > > + sub $64, %rdi > > + > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + pcmpeqb %xmm1, %xmm3 > > + > > + pcmpeqb 48(%rdi), %xmm1 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + pmovmskb %xmm1, %eax > > + bsf %eax, %eax > > + lea 48(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(matches0): > > + bsf %eax, %eax > > + lea -16(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches): > > + bsf %eax, %eax > > + add %rdi, %rax > > + ret > > + > > + .p2align 4 > > +L(matches16): > > + bsf %eax, %eax > > + lea 16(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches32): > > + bsf %eax, %eax > > + lea 32(%rax, %rdi), %rax > > + ret > > + > > +END (RAWMEMCHR) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/rtld-memchr.S b/sysdeps/x86_64/multiarch/rtld-memchr.S > > new file mode 100644 > > index 0000000000..a14b192bed > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/rtld-memchr.S > > @@ -0,0 +1,18 @@ > > +/* Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include "../memchr.S" > > diff --git a/sysdeps/x86_64/multiarch/rtld-rawmemchr.S b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S > > new file mode 100644 > > index 0000000000..5d4110a052 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S > > @@ -0,0 +1,18 @@ > > +/* Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include "../rawmemchr.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > > index 58ed21db01..2a1cff5b05 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __wmemchr_avx2_rtm > > -#define USE_AS_WMEMCHR 1 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_avx2_rtm > > +#endif > > +#define USE_AS_WMEMCHR 1 > > +#define MEMCHR WMEMCHR > > > > #include "memchr-avx2-rtm.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S > > index 282854f1a1..2bf93fd84b 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-avx2.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __wmemchr_avx2 > > -#define USE_AS_WMEMCHR 1 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_avx2 > > +#endif > > +#define USE_AS_WMEMCHR 1 > > +#define MEMCHR WMEMCHR > > > > #include "memchr-avx2.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > > index a346cd35a1..c67309e8a1 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > > @@ -1,3 +1,7 @@ > > -#define MEMCHR __wmemchr_evex_rtm > > -#define USE_AS_WMEMCHR 1 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_evex_rtm > > +#endif > > +#define USE_AS_WMEMCHR 1 > > +#define MEMCHR WMEMCHR > > + > > #include "memchr-evex-rtm.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S > > index 06cd0f9f5a..5512d5cdc3 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __wmemchr_evex > > -#define USE_AS_WMEMCHR 1 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_evex > > +#endif > > +#define USE_AS_WMEMCHR 1 > > 
+#define MEMCHR WMEMCHR > > > > #include "memchr-evex.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-sse2.S b/sysdeps/x86_64/multiarch/wmemchr-sse2.S > > index 70a965d552..b675a070d4 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-sse2.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-sse2.S > > @@ -1,4 +1,7 @@ > > -#define USE_AS_WMEMCHR 1 > > -#define wmemchr __wmemchr_sse2 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_sse2 > > +#endif > > +#define USE_AS_WMEMCHR 1 > > +#define MEMCHR WMEMCHR > > > > -#include "../memchr.S" > > +#include "memchr-sse2.S" > > diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S > > index 4c1a3383b9..ba7e5202e6 100644 > > --- a/sysdeps/x86_64/rawmemchr.S > > +++ b/sysdeps/x86_64/rawmemchr.S > > @@ -17,185 +17,13 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#include <sysdep.h> > > +#define RAWMEMCHR __rawmemchr > > > > - .text > > -ENTRY (__rawmemchr) > > - movd %rsi, %xmm1 > > - mov %rdi, %rcx > > +#define DEFAULT_IMPL_V1 "multiarch/rawmemchr-sse2.S" > > +#define DEFAULT_IMPL_V3 "multiarch/rawmemchr-avx2.S" > > +#define DEFAULT_IMPL_V4 "multiarch/rawmemchr-evex.S" > > > > - punpcklbw %xmm1, %xmm1 > > - punpcklbw %xmm1, %xmm1 > > - > > - and $63, %rcx > > - pshufd $0, %xmm1, %xmm1 > > - > > - cmp $48, %rcx > > - ja L(crosscache) > > - > > - movdqu (%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > -/* Check if there is a match. */ > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - > > - jnz L(matches) > > - add $16, %rdi > > - and $-16, %rdi > > - jmp L(loop_prolog) > > - > > - .p2align 4 > > -L(crosscache): > > - and $15, %rcx > > - and $-16, %rdi > > - movdqa (%rdi), %xmm0 > > - > > - pcmpeqb %xmm1, %xmm0 > > -/* Check if there is a match. */ > > - pmovmskb %xmm0, %eax > > -/* Remove the leading bytes. */ > > - sar %cl, %eax > > - test %eax, %eax > > - je L(unaligned_no_match) > > -/* Check which byte is a match. 
*/ > > - bsf %eax, %eax > > - > > - add %rdi, %rax > > - add %rcx, %rax > > - ret > > - > > - .p2align 4 > > -L(unaligned_no_match): > > - add $16, %rdi > > - > > - .p2align 4 > > -L(loop_prolog): > > - movdqa (%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - movdqa 16(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 48(%rdi), %xmm4 > > - pcmpeqb %xmm1, %xmm4 > > - add $64, %rdi > > - pmovmskb %xmm4, %eax > > - test %eax, %eax > > - jnz L(matches0) > > - > > - test $0x3f, %rdi > > - jz L(align64_loop) > > - > > - movdqa (%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - movdqa 16(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 48(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - > > - add $64, %rdi > > - test %eax, %eax > > - jnz L(matches0) > > - > > - and $-64, %rdi > > - > > - .p2align 4 > > -L(align64_loop): > > - movdqa (%rdi), %xmm0 > > - movdqa 16(%rdi), %xmm2 > > - movdqa 32(%rdi), %xmm3 > > - movdqa 48(%rdi), %xmm4 > > - > > - pcmpeqb %xmm1, %xmm0 > > - pcmpeqb %xmm1, %xmm2 > > - pcmpeqb %xmm1, %xmm3 > > - pcmpeqb %xmm1, %xmm4 > > - > > - pmaxub %xmm0, %xmm3 > > - pmaxub %xmm2, %xmm4 > > - pmaxub %xmm3, %xmm4 > > - pmovmskb %xmm4, %eax > > - > > - add $64, %rdi > > - > > - test %eax, %eax > > - jz L(align64_loop) > > - > > - sub $64, %rdi > > - > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - > > - pcmpeqb 48(%rdi), %xmm1 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - pmovmskb %xmm1, %eax > > - bsf %eax, %eax > > - lea 48(%rdi, %rax), %rax > > - ret > > - > > - .p2align 4 > > -L(matches0): > > - bsf %eax, %eax > > - lea -16(%rax, %rdi), %rax > > - ret > > - > > - .p2align 4 > > -L(matches): > > - bsf %eax, %eax > > - add %rdi, %rax > > - ret > > - > > - .p2align 4 > > -L(matches16): > > - bsf %eax, %eax > > - lea 16(%rax, %rdi), %rax > > - ret > > - > > - .p2align 4 > > -L(matches32): > > - bsf %eax, %eax > > - lea 32(%rax, %rdi), %rax > > - ret > > - > > -END (__rawmemchr) > > +#include "isa-default-impl.h" > > > > weak_alias (__rawmemchr, rawmemchr) > > -libc_hidden_builtin_def (__rawmemchr) > > +libc_hidden_def (__rawmemchr) > > diff --git a/sysdeps/x86_64/wmemchr.S b/sysdeps/x86_64/wmemchr.S > > new file mode 100644 > > index 0000000000..eef91e556b > > --- /dev/null > > +++ b/sysdeps/x86_64/wmemchr.S > > @@ -0,0 +1,28 @@ > > +/* Copyright (C) 2011-2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#define WMEMCHR __wmemchr > > + > > +#define DEFAULT_IMPL_V1 "multiarch/wmemchr-sse2.S" > > +#define DEFAULT_IMPL_V3 "multiarch/wmemchr-avx2.S" > > +#define DEFAULT_IMPL_V4 "multiarch/wmemchr-evex.S" > > + > > +#include "isa-default-impl.h" > > + > > +libc_hidden_def (__wmemchr) > > +weak_alias (__wmemchr, wmemchr) > > +libc_hidden_weak (wmemchr) > > -- > > 2.34.1 > > > > > -- > H.J.
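For reference on how the non-multiarch files above fit together: wmemchr.S (and likewise memchr.S / rawmemchr.S) only defines the public name plus the DEFAULT_IMPL_V* paths, and isa-default-impl.h picks one of them at preprocessing time. A sketch, assuming the header from the 1/2 patch keeps roughly this shape (the real one also covers DEFAULT_IMPL_V2 and the rtld/multiarch checks discussed above):

/* Sketch of the ISA_DEFAULT_IMPL selection; details differ in the
   real isa-default-impl.h.  */
#if MINIMUM_X86_ISA_LEVEL >= 4 && defined DEFAULT_IMPL_V4
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V4
#elif MINIMUM_X86_ISA_LEVEL >= 3 && defined DEFAULT_IMPL_V3
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V3
#elif defined DEFAULT_IMPL_V1
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V1
#else
# error "Unsupported ISA Level!"
#endif

#include ISA_DEFAULT_IMPL

So an x86-64-v4 build assembles multiarch/wmemchr-evex.S directly under the __wmemchr symbol and never goes through the ifunc selector.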
On Wed, Jun 22, 2022 at 3:04 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Wed, Jun 22, 2022 at 2:52 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Wed, Jun 22, 2022 at 1:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > 1. Refactor files so that all implementations for in the multiarch > > > directory. > > > - Essentially moved sse2 {raw|w}memchr.S implementation to > > > multiarch/{raw|w}memchr-sse2.S > > > > > > - The non-multiarch {raw|w}memchr.S file now only includes one of > > > the implementations in the multiarch directory based on the > > > compiled ISA level (only used for non-multiarch builds. > > > Otherwise we go through the ifunc selector). > > > > > > 2. Add ISA level build guards to different implementations. > > > - I.e memchr-avx2.S which is ISA level 3 will only build if > > > compiled ISA level <= 3. Otherwise there is no reason to include > > > it as we will always use one of the ISA level 4 > > > implementations (memchr-evex{-rtm}.S). > > > > > > 3. Add new multiarch/rtld-{raw}memchr.S that just include the > > > non-multiarch {raw}memchr.S which will in turn select the best > > > implementation based on the compiled ISA level. > > > > > > 4. Refactor the ifunc selector and ifunc implementation list to use > > > the ISA level aware wrapper macros that allow functions below the > > > compiled ISA level (with a guranteed replacement) to be skipped. > > > - Guranteed replacement essentially means that for any ISA level > > > build there must be a function that the baseline of the ISA > > > supports. So for {raw|w}memchr.S since there is not ISA level 2 > > > function, the ISA level 2 build still includes the ISA level > > > 1 (sse2) function. Once we reach the ISA level 3 build, however, > > > {raw|w}memchr-avx2{-rtm}.S will always be sufficient so the ISA > > > level 1 implementation ({raw|w}memchr-sse2.S) will not be built. > > > > > > Tested with and without multiarch on x86_64 for ISA levels: > > > {generic, x86-64-v2, x86-64-v3, x86-64-v4} > > > > > > And m32 with and without multiarch. 
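The "guaranteed replacement" rule in point 4 is what the X86_IFUNC_IMPL_ADD_V* wrappers in the list below encode: an entry is emitted into the ifunc-impl-list only if a build at the current baseline could still legitimately select it. A sketch of the assumed shape (the real macros come from the 1/2 patch):

/* Sketch: keep the entry only while the compiled baseline is at or
   below this wrapper's level; above that, a higher-ISA entry is
   guaranteed to replace it, so this one compiles away.  */
#if MINIMUM_X86_ISA_LEVEL <= 2
# define X86_IFUNC_IMPL_ADD_V2(array, i, name, mask, impl)	\
  IFUNC_IMPL_ADD (array, i, name, mask, impl)
#else
# define X86_IFUNC_IMPL_ADD_V2(array, i, name, mask, impl)
#endif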
> > > --- > > > sysdeps/x86_64/isa-default-impl.h | 8 + > > > sysdeps/x86_64/memchr.S | 357 +---------------- > > > sysdeps/x86_64/multiarch/ifunc-evex.h | 29 +- > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 72 ++-- > > > sysdeps/x86_64/multiarch/memchr-avx2.S | 5 +- > > > sysdeps/x86_64/multiarch/memchr-evex.S | 5 +- > > > sysdeps/x86_64/multiarch/memchr-sse2.S | 363 +++++++++++++++++- > > > sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S | 7 +- > > > sysdeps/x86_64/multiarch/rawmemchr-avx2.S | 7 +- > > > sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 8 +- > > > sysdeps/x86_64/multiarch/rawmemchr-evex.S | 7 +- > > > sysdeps/x86_64/multiarch/rawmemchr-sse2.S | 198 +++++++++- > > > sysdeps/x86_64/multiarch/rtld-memchr.S | 18 + > > > sysdeps/x86_64/multiarch/rtld-rawmemchr.S | 18 + > > > sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S | 7 +- > > > sysdeps/x86_64/multiarch/wmemchr-avx2.S | 7 +- > > > sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S | 8 +- > > > sysdeps/x86_64/multiarch/wmemchr-evex.S | 7 +- > > > sysdeps/x86_64/multiarch/wmemchr-sse2.S | 9 +- > > > sysdeps/x86_64/rawmemchr.S | 184 +-------- > > > sysdeps/x86_64/wmemchr.S | 28 ++ > > > 21 files changed, 740 insertions(+), 612 deletions(-) > > > create mode 100644 sysdeps/x86_64/multiarch/rtld-memchr.S > > > create mode 100644 sysdeps/x86_64/multiarch/rtld-rawmemchr.S > > > create mode 100644 sysdeps/x86_64/wmemchr.S > > > > > > diff --git a/sysdeps/x86_64/isa-default-impl.h b/sysdeps/x86_64/isa-default-impl.h > > > index 34634668e5..b374a38b8b 100644 > > > --- a/sysdeps/x86_64/isa-default-impl.h > > > +++ b/sysdeps/x86_64/isa-default-impl.h > > > @@ -46,4 +46,12 @@ > > > # error "Unsupported ISA Level!" > > > #endif > > > > > > +#if IS_IN(rtld) && !defined USE_MULTIARCH > > > +# error "RTLD version should only exist in multiarch build" > > > +#endif > > > + > > > +#if defined USE_MULTIARCH && !IS_IN(rtld) > > > +# error "Multiarch build should not use ISA_DEFAULT_IMPL without RTLD" > > > +#endif > > > > Please do > > > > #if IS_IN (rtld) > > #else > > #endif > > > > > #include ISA_DEFAULT_IMPL > > > diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S > > > index a160fd9b00..20b43508c4 100644 > > > --- a/sysdeps/x86_64/memchr.S > > > +++ b/sysdeps/x86_64/memchr.S > > > @@ -15,358 +15,13 @@ > > > License along with the GNU C Library; if not, see > > > <https://www.gnu.org/licenses/>. */ > > > > > > -#include <sysdep.h> > > > +#define MEMCHR __memchr > > > > > > -#ifdef USE_AS_WMEMCHR > > > -# define MEMCHR wmemchr > > > -# define PCMPEQ pcmpeqd > > > -# define CHAR_PER_VEC 4 > > > -#else > > > -# define MEMCHR memchr > > > -# define PCMPEQ pcmpeqb > > > -# define CHAR_PER_VEC 16 > > > -#endif > > > +#define DEFAULT_IMPL_V1 "multiarch/memchr-sse2.S" > > > +#define DEFAULT_IMPL_V3 "multiarch/memchr-avx2.S" > > > +#define DEFAULT_IMPL_V4 "multiarch/memchr-evex.S" > > > > > > -/* fast SSE2 version with using pmaxub and 64 byte loop */ > > > +#include "isa-default-impl.h" > > > > > > - .text > > > -ENTRY(MEMCHR) > > > - movd %esi, %xmm1 > > > - mov %edi, %ecx > > > - > > > -#ifdef __ILP32__ > > > - /* Clear the upper 32 bits. 
*/ > > > - movl %edx, %edx > > > -#endif > > > -#ifdef USE_AS_WMEMCHR > > > - test %RDX_LP, %RDX_LP > > > - jz L(return_null) > > > -#else > > > - punpcklbw %xmm1, %xmm1 > > > - test %RDX_LP, %RDX_LP > > > - jz L(return_null) > > > - punpcklbw %xmm1, %xmm1 > > > -#endif > > > - > > > - and $63, %ecx > > > - pshufd $0, %xmm1, %xmm1 > > > - > > > - cmp $48, %ecx > > > - ja L(crosscache) > > > - > > > - movdqu (%rdi), %xmm0 > > > - PCMPEQ %xmm1, %xmm0 > > > - pmovmskb %xmm0, %eax > > > - test %eax, %eax > > > - > > > - jnz L(matches_1) > > > - sub $CHAR_PER_VEC, %rdx > > > - jbe L(return_null) > > > - add $16, %rdi > > > - and $15, %ecx > > > - and $-16, %rdi > > > -#ifdef USE_AS_WMEMCHR > > > - shr $2, %ecx > > > -#endif > > > - add %rcx, %rdx > > > - sub $(CHAR_PER_VEC * 4), %rdx > > > - jbe L(exit_loop) > > > - jmp L(loop_prolog) > > > - > > > - .p2align 4 > > > -L(crosscache): > > > - and $15, %ecx > > > - and $-16, %rdi > > > - movdqa (%rdi), %xmm0 > > > - > > > - PCMPEQ %xmm1, %xmm0 > > > - /* Check if there is a match. */ > > > - pmovmskb %xmm0, %eax > > > - /* Remove the leading bytes. */ > > > - sar %cl, %eax > > > - test %eax, %eax > > > - je L(unaligned_no_match) > > > - /* Check which byte is a match. */ > > > - bsf %eax, %eax > > > -#ifdef USE_AS_WMEMCHR > > > - mov %eax, %esi > > > - shr $2, %esi > > > - sub %rsi, %rdx > > > -#else > > > - sub %rax, %rdx > > > -#endif > > > - jbe L(return_null) > > > - add %rdi, %rax > > > - add %rcx, %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(unaligned_no_match): > > > - /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using > > > - "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void > > > - possible addition overflow. */ > > > - neg %rcx > > > - add $16, %rcx > > > -#ifdef USE_AS_WMEMCHR > > > - shr $2, %ecx > > > -#endif > > > - sub %rcx, %rdx > > > - jbe L(return_null) > > > - add $16, %rdi > > > - sub $(CHAR_PER_VEC * 4), %rdx > > > - jbe L(exit_loop) > > > - > > > - .p2align 4 > > > -L(loop_prolog): > > > - movdqa (%rdi), %xmm0 > > > - PCMPEQ %xmm1, %xmm0 > > > - pmovmskb %xmm0, %eax > > > - test %eax, %eax > > > - jnz L(matches) > > > - > > > - movdqa 16(%rdi), %xmm2 > > > - PCMPEQ %xmm1, %xmm2 > > > - pmovmskb %xmm2, %eax > > > - test %eax, %eax > > > - jnz L(matches16) > > > - > > > - movdqa 32(%rdi), %xmm3 > > > - PCMPEQ %xmm1, %xmm3 > > > - pmovmskb %xmm3, %eax > > > - test %eax, %eax > > > - jnz L(matches32) > > > - > > > - movdqa 48(%rdi), %xmm4 > > > - PCMPEQ %xmm1, %xmm4 > > > - add $64, %rdi > > > - pmovmskb %xmm4, %eax > > > - test %eax, %eax > > > - jnz L(matches0) > > > - > > > - test $0x3f, %rdi > > > - jz L(align64_loop) > > > - > > > - sub $(CHAR_PER_VEC * 4), %rdx > > > - jbe L(exit_loop) > > > - > > > - movdqa (%rdi), %xmm0 > > > - PCMPEQ %xmm1, %xmm0 > > > - pmovmskb %xmm0, %eax > > > - test %eax, %eax > > > - jnz L(matches) > > > - > > > - movdqa 16(%rdi), %xmm2 > > > - PCMPEQ %xmm1, %xmm2 > > > - pmovmskb %xmm2, %eax > > > - test %eax, %eax > > > - jnz L(matches16) > > > - > > > - movdqa 32(%rdi), %xmm3 > > > - PCMPEQ %xmm1, %xmm3 > > > - pmovmskb %xmm3, %eax > > > - test %eax, %eax > > > - jnz L(matches32) > > > - > > > - movdqa 48(%rdi), %xmm3 > > > - PCMPEQ %xmm1, %xmm3 > > > - pmovmskb %xmm3, %eax > > > - > > > - add $64, %rdi > > > - test %eax, %eax > > > - jnz L(matches0) > > > - > > > - mov %rdi, %rcx > > > - and $-64, %rdi > > > - and $63, %ecx > > > -#ifdef USE_AS_WMEMCHR > > > - shr $2, %ecx > > > -#endif > > > - add %rcx, %rdx > > > - > > > - .p2align 4 > > > -L(align64_loop): > > > - 
sub $(CHAR_PER_VEC * 4), %rdx > > > - jbe L(exit_loop) > > > - movdqa (%rdi), %xmm0 > > > - movdqa 16(%rdi), %xmm2 > > > - movdqa 32(%rdi), %xmm3 > > > - movdqa 48(%rdi), %xmm4 > > > - > > > - PCMPEQ %xmm1, %xmm0 > > > - PCMPEQ %xmm1, %xmm2 > > > - PCMPEQ %xmm1, %xmm3 > > > - PCMPEQ %xmm1, %xmm4 > > > - > > > - pmaxub %xmm0, %xmm3 > > > - pmaxub %xmm2, %xmm4 > > > - pmaxub %xmm3, %xmm4 > > > - pmovmskb %xmm4, %eax > > > - > > > - add $64, %rdi > > > - > > > - test %eax, %eax > > > - jz L(align64_loop) > > > - > > > - sub $64, %rdi > > > - > > > - pmovmskb %xmm0, %eax > > > - test %eax, %eax > > > - jnz L(matches) > > > - > > > - pmovmskb %xmm2, %eax > > > - test %eax, %eax > > > - jnz L(matches16) > > > - > > > - movdqa 32(%rdi), %xmm3 > > > - PCMPEQ %xmm1, %xmm3 > > > - > > > - PCMPEQ 48(%rdi), %xmm1 > > > - pmovmskb %xmm3, %eax > > > - test %eax, %eax > > > - jnz L(matches32) > > > - > > > - pmovmskb %xmm1, %eax > > > - bsf %eax, %eax > > > - lea 48(%rdi, %rax), %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(exit_loop): > > > - add $(CHAR_PER_VEC * 2), %edx > > > - jle L(exit_loop_32) > > > - > > > - movdqa (%rdi), %xmm0 > > > - PCMPEQ %xmm1, %xmm0 > > > - pmovmskb %xmm0, %eax > > > - test %eax, %eax > > > - jnz L(matches) > > > - > > > - movdqa 16(%rdi), %xmm2 > > > - PCMPEQ %xmm1, %xmm2 > > > - pmovmskb %xmm2, %eax > > > - test %eax, %eax > > > - jnz L(matches16) > > > - > > > - movdqa 32(%rdi), %xmm3 > > > - PCMPEQ %xmm1, %xmm3 > > > - pmovmskb %xmm3, %eax > > > - test %eax, %eax > > > - jnz L(matches32_1) > > > - sub $CHAR_PER_VEC, %edx > > > - jle L(return_null) > > > - > > > - PCMPEQ 48(%rdi), %xmm1 > > > - pmovmskb %xmm1, %eax > > > - test %eax, %eax > > > - jnz L(matches48_1) > > > - xor %eax, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(exit_loop_32): > > > - add $(CHAR_PER_VEC * 2), %edx > > > - movdqa (%rdi), %xmm0 > > > - PCMPEQ %xmm1, %xmm0 > > > - pmovmskb %xmm0, %eax > > > - test %eax, %eax > > > - jnz L(matches_1) > > > - sub $CHAR_PER_VEC, %edx > > > - jbe L(return_null) > > > - > > > - PCMPEQ 16(%rdi), %xmm1 > > > - pmovmskb %xmm1, %eax > > > - test %eax, %eax > > > - jnz L(matches16_1) > > > - xor %eax, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(matches0): > > > - bsf %eax, %eax > > > - lea -16(%rax, %rdi), %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(matches): > > > - bsf %eax, %eax > > > - add %rdi, %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(matches16): > > > - bsf %eax, %eax > > > - lea 16(%rax, %rdi), %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(matches32): > > > - bsf %eax, %eax > > > - lea 32(%rax, %rdi), %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(matches_1): > > > - bsf %eax, %eax > > > -#ifdef USE_AS_WMEMCHR > > > - mov %eax, %esi > > > - shr $2, %esi > > > - sub %rsi, %rdx > > > -#else > > > - sub %rax, %rdx > > > -#endif > > > - jbe L(return_null) > > > - add %rdi, %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(matches16_1): > > > - bsf %eax, %eax > > > -#ifdef USE_AS_WMEMCHR > > > - mov %eax, %esi > > > - shr $2, %esi > > > - sub %rsi, %rdx > > > -#else > > > - sub %rax, %rdx > > > -#endif > > > - jbe L(return_null) > > > - lea 16(%rdi, %rax), %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(matches32_1): > > > - bsf %eax, %eax > > > -#ifdef USE_AS_WMEMCHR > > > - mov %eax, %esi > > > - shr $2, %esi > > > - sub %rsi, %rdx > > > -#else > > > - sub %rax, %rdx > > > -#endif > > > - jbe L(return_null) > > > - lea 32(%rdi, %rax), %rax > > > - ret > > > - > > > - .p2align 4 > > > 
-L(matches48_1): > > > - bsf %eax, %eax > > > -#ifdef USE_AS_WMEMCHR > > > - mov %eax, %esi > > > - shr $2, %esi > > > - sub %rsi, %rdx > > > -#else > > > - sub %rax, %rdx > > > -#endif > > > - jbe L(return_null) > > > - lea 48(%rdi, %rax), %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(return_null): > > > - xor %eax, %eax > > > - ret > > > -END(MEMCHR) > > > - > > > -#ifndef USE_AS_WMEMCHR > > > -strong_alias (memchr, __memchr) > > > +weak_alias (__memchr, memchr) > > > libc_hidden_builtin_def(memchr) > > > -#endif > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h > > > index b8f7a12ea2..856c6261f8 100644 > > > --- a/sysdeps/x86_64/multiarch/ifunc-evex.h > > > +++ b/sysdeps/x86_64/multiarch/ifunc-evex.h > > > @@ -19,24 +19,28 @@ > > > > > > #include <init-arch.h> > > > > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > > > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; > > > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden; > > > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > > > + > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > > > > > > static inline void * > > > IFUNC_SELECTOR (void) > > > { > > > - const struct cpu_features* cpu_features = __get_cpu_features (); > > > - > > > - if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) > > > - && CPU_FEATURE_USABLE_P (cpu_features, BMI2) > > > - && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) > > > + const struct cpu_features *cpu_features = __get_cpu_features (); > > > + > > > + /* NB: The X86_ISA_* feature check macros are evaluated at > > > + compile time. */ > > > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2) > > > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2) > > > + && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > > + AVX_Fast_Unaligned_Load)) > > > { > > > - if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > > > - && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) > > > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > > > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) > > > { > > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > > return OPTIMIZE (evex_rtm); > > > @@ -47,9 +51,12 @@ IFUNC_SELECTOR (void) > > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > > return OPTIMIZE (avx2_rtm); > > > > > > - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) > > > + if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > > + Prefer_No_VZEROUPPER)) > > > return OPTIMIZE (avx2); > > > } > > > > > > + /* This is unreachable (compile time checked) if ISA level >= 3 > > > + so no need for a robust fallback here. */ > > > return OPTIMIZE (sse2); > > > } > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > index 883362f63d..bf52cf96d0 100644 > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > @@ -25,7 +25,8 @@ > > > > > > /* Fill ARRAY of MAX elements with IFUNC implementations for function > > > NAME supported on target machine and return the number of valid > > > - entries. */ > > > + entries. 
Each set of implementations for a given function is sorted in > > > + descending order by ISA level. */ > > > > > > size_t > > > __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > @@ -53,24 +54,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > > /* Support sysdeps/x86_64/multiarch/memchr.c. */ > > > IFUNC_IMPL (i, name, memchr, > > > - IFUNC_IMPL_ADD (array, i, memchr, > > > - CPU_FEATURE_USABLE (AVX2), > > > - __memchr_avx2) > > > - IFUNC_IMPL_ADD (array, i, memchr, > > > - (CPU_FEATURE_USABLE (AVX2) > > > - && CPU_FEATURE_USABLE (RTM)), > > > - __memchr_avx2_rtm) > > > - IFUNC_IMPL_ADD (array, i, memchr, > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __memchr_evex) > > > - IFUNC_IMPL_ADD (array, i, memchr, > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __memchr_evex_rtm) > > > - IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) > > > + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, > > > + CPU_FEATURE_USABLE (AVX2), > > > + __memchr_avx2) > > > + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, > > > + (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (RTM)), > > > + __memchr_avx2_rtm) > > > + /* Can be lowered to V1 if a V2 implementation is added. */ > > > + X86_IFUNC_IMPL_ADD_V2 (array, i, memchr, > > > + 1, > > > + __memchr_sse2)) > > > > > > /* Support sysdeps/x86_64/multiarch/memcmp.c. */ > > > IFUNC_IMPL (i, name, memcmp, > > > @@ -288,24 +292,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > > /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ > > > IFUNC_IMPL (i, name, rawmemchr, > > > - IFUNC_IMPL_ADD (array, i, rawmemchr, > > > - CPU_FEATURE_USABLE (AVX2), > > > - __rawmemchr_avx2) > > > - IFUNC_IMPL_ADD (array, i, rawmemchr, > > > - (CPU_FEATURE_USABLE (AVX2) > > > - && CPU_FEATURE_USABLE (RTM)), > > > - __rawmemchr_avx2_rtm) > > > - IFUNC_IMPL_ADD (array, i, rawmemchr, > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __rawmemchr_evex) > > > - IFUNC_IMPL_ADD (array, i, rawmemchr, > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __rawmemchr_evex_rtm) > > > - IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) > > > + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, > > > + CPU_FEATURE_USABLE (AVX2), > > > + __rawmemchr_avx2) > > > + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, > > > + (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (RTM)), > > > + __rawmemchr_avx2_rtm) > > > + /* Can be lowered to V1 if a V2 implementation is added. */ > > > + X86_IFUNC_IMPL_ADD_V2 (array, i, rawmemchr, > > > + 1, > > > + __rawmemchr_sse2)) > > > > > > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > > > IFUNC_IMPL (i, name, strlen, > > > @@ -748,24 +755,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > > /* Support sysdeps/x86_64/multiarch/wmemchr.c. 
*/ > > > IFUNC_IMPL (i, name, wmemchr, > > > - IFUNC_IMPL_ADD (array, i, wmemchr, > > > - CPU_FEATURE_USABLE (AVX2), > > > - __wmemchr_avx2) > > > - IFUNC_IMPL_ADD (array, i, wmemchr, > > > - (CPU_FEATURE_USABLE (AVX2) > > > - && CPU_FEATURE_USABLE (RTM)), > > > - __wmemchr_avx2_rtm) > > > - IFUNC_IMPL_ADD (array, i, wmemchr, > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __wmemchr_evex) > > > - IFUNC_IMPL_ADD (array, i, wmemchr, > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __wmemchr_evex_rtm) > > > - IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) > > > + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, > > > + CPU_FEATURE_USABLE (AVX2), > > > + __wmemchr_avx2) > > > + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, > > > + (CPU_FEATURE_USABLE (AVX2) > > > + && CPU_FEATURE_USABLE (RTM)), > > > + __wmemchr_avx2_rtm) > > > + /* Can be lowered to V1 if a V2 implementation is added. */ > > > + X86_IFUNC_IMPL_ADD_V2 (array, i, wmemchr, > > > + 1, > > > + __wmemchr_sse2)) > > > > > > /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ > > > IFUNC_IMPL (i, name, wmemcmp, > > > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S > > > index c5a256eb37..39be5f7083 100644 > > > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S > > > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S > > > @@ -16,9 +16,10 @@ > > > License along with the GNU C Library; if not, see > > > <https://www.gnu.org/licenses/>. */ > > > > > > -#if IS_IN (libc) > > > +#include <isa-level.h> > > > +#include <sysdep.h> > > > > > > -# include <sysdep.h> > > > +#if ISA_SHOULD_BUILD (3) > > > > > > # ifndef MEMCHR > > > # define MEMCHR __memchr_avx2 > > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S > > > index 0fd11b7632..0dd4f1dcce 100644 > > > --- a/sysdeps/x86_64/multiarch/memchr-evex.S > > > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S > > > @@ -16,9 +16,10 @@ > > > License along with the GNU C Library; if not, see > > > <https://www.gnu.org/licenses/>. */ > > > > > > -#if IS_IN (libc) > > > +#include <isa-level.h> > > > +#include <sysdep.h> > > > > > > -# include <sysdep.h> > > > +#if ISA_SHOULD_BUILD (4) > > > > > > # ifndef MEMCHR > > > # define MEMCHR __memchr_evex > > > diff --git a/sysdeps/x86_64/multiarch/memchr-sse2.S b/sysdeps/x86_64/multiarch/memchr-sse2.S > > > index 2c6fdd41d6..8c561cd687 100644 > > > --- a/sysdeps/x86_64/multiarch/memchr-sse2.S > > > +++ b/sysdeps/x86_64/multiarch/memchr-sse2.S > > > @@ -16,13 +16,360 @@ > > > License along with the GNU C Library; if not, see > > > <https://www.gnu.org/licenses/>. */ > > > > > > -#if IS_IN (libc) > > > -# define memchr __memchr_sse2 > > > +#include <isa-level.h> > > > +#include <sysdep.h> > > > > > > -# undef strong_alias > > > -# define strong_alias(memchr, __memchr) > > > -# undef libc_hidden_builtin_def > > > -# define libc_hidden_builtin_def(memchr) > > > -#endif > > > +/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation > > > + so we need this to build for ISA V2 builds. 
*/ > > > +#if ISA_SHOULD_BUILD (2) > > > + > > > +# ifndef MEMCHR > > > +# define MEMCHR __memchr_sse2 > > > +# endif > > > +# ifdef USE_AS_WMEMCHR > > > +# define PCMPEQ pcmpeqd > > > +# define CHAR_PER_VEC 4 > > > +# else > > > +# define PCMPEQ pcmpeqb > > > +# define CHAR_PER_VEC 16 > > > +# endif > > > + > > > +/* fast SSE2 version with using pmaxub and 64 byte loop */ > > > + > > > + .text > > > +ENTRY(MEMCHR) > > > + movd %esi, %xmm1 > > > + mov %edi, %ecx > > > + > > > +# ifdef __ILP32__ > > > + /* Clear the upper 32 bits. */ > > > + movl %edx, %edx > > > +# endif > > > +# ifdef USE_AS_WMEMCHR > > > + test %RDX_LP, %RDX_LP > > > + jz L(return_null) > > > +# else > > > + punpcklbw %xmm1, %xmm1 > > > + test %RDX_LP, %RDX_LP > > > + jz L(return_null) > > > + punpcklbw %xmm1, %xmm1 > > > +# endif > > > + > > > + and $63, %ecx > > > + pshufd $0, %xmm1, %xmm1 > > > + > > > + cmp $48, %ecx > > > + ja L(crosscache) > > > + > > > + movdqu (%rdi), %xmm0 > > > + PCMPEQ %xmm1, %xmm0 > > > + pmovmskb %xmm0, %eax > > > + test %eax, %eax > > > + > > > + jnz L(matches_1) > > > + sub $CHAR_PER_VEC, %rdx > > > + jbe L(return_null) > > > + add $16, %rdi > > > + and $15, %ecx > > > + and $-16, %rdi > > > +# ifdef USE_AS_WMEMCHR > > > + shr $2, %ecx > > > +# endif > > > + add %rcx, %rdx > > > + sub $(CHAR_PER_VEC * 4), %rdx > > > + jbe L(exit_loop) > > > + jmp L(loop_prolog) > > > + > > > + .p2align 4 > > > +L(crosscache): > > > + and $15, %ecx > > > + and $-16, %rdi > > > + movdqa (%rdi), %xmm0 > > > + > > > + PCMPEQ %xmm1, %xmm0 > > > + /* Check if there is a match. */ > > > + pmovmskb %xmm0, %eax > > > + /* Remove the leading bytes. */ > > > + sar %cl, %eax > > > + test %eax, %eax > > > + je L(unaligned_no_match) > > > + /* Check which byte is a match. */ > > > + bsf %eax, %eax > > > +# ifdef USE_AS_WMEMCHR > > > + mov %eax, %esi > > > + shr $2, %esi > > > + sub %rsi, %rdx > > > +# else > > > + sub %rax, %rdx > > > +# endif > > > + jbe L(return_null) > > > + add %rdi, %rax > > > + add %rcx, %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(unaligned_no_match): > > > + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using > > > + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void > > > + possible addition overflow. 
*/ > > > + neg %rcx > > > + add $16, %rcx > > > +# ifdef USE_AS_WMEMCHR > > > + shr $2, %ecx > > > +# endif > > > + sub %rcx, %rdx > > > + jbe L(return_null) > > > + add $16, %rdi > > > + sub $(CHAR_PER_VEC * 4), %rdx > > > + jbe L(exit_loop) > > > + > > > + .p2align 4 > > > +L(loop_prolog): > > > + movdqa (%rdi), %xmm0 > > > + PCMPEQ %xmm1, %xmm0 > > > + pmovmskb %xmm0, %eax > > > + test %eax, %eax > > > + jnz L(matches) > > > + > > > + movdqa 16(%rdi), %xmm2 > > > + PCMPEQ %xmm1, %xmm2 > > > + pmovmskb %xmm2, %eax > > > + test %eax, %eax > > > + jnz L(matches16) > > > + > > > + movdqa 32(%rdi), %xmm3 > > > + PCMPEQ %xmm1, %xmm3 > > > + pmovmskb %xmm3, %eax > > > + test %eax, %eax > > > + jnz L(matches32) > > > + > > > + movdqa 48(%rdi), %xmm4 > > > + PCMPEQ %xmm1, %xmm4 > > > + add $64, %rdi > > > + pmovmskb %xmm4, %eax > > > + test %eax, %eax > > > + jnz L(matches0) > > > + > > > + test $0x3f, %rdi > > > + jz L(align64_loop) > > > + > > > + sub $(CHAR_PER_VEC * 4), %rdx > > > + jbe L(exit_loop) > > > + > > > + movdqa (%rdi), %xmm0 > > > + PCMPEQ %xmm1, %xmm0 > > > + pmovmskb %xmm0, %eax > > > + test %eax, %eax > > > + jnz L(matches) > > > + > > > + movdqa 16(%rdi), %xmm2 > > > + PCMPEQ %xmm1, %xmm2 > > > + pmovmskb %xmm2, %eax > > > + test %eax, %eax > > > + jnz L(matches16) > > > + > > > + movdqa 32(%rdi), %xmm3 > > > + PCMPEQ %xmm1, %xmm3 > > > + pmovmskb %xmm3, %eax > > > + test %eax, %eax > > > + jnz L(matches32) > > > + > > > + movdqa 48(%rdi), %xmm3 > > > + PCMPEQ %xmm1, %xmm3 > > > + pmovmskb %xmm3, %eax > > > + > > > + add $64, %rdi > > > + test %eax, %eax > > > + jnz L(matches0) > > > + > > > + mov %rdi, %rcx > > > + and $-64, %rdi > > > + and $63, %ecx > > > +# ifdef USE_AS_WMEMCHR > > > + shr $2, %ecx > > > +# endif > > > + add %rcx, %rdx > > > + > > > + .p2align 4 > > > +L(align64_loop): > > > + sub $(CHAR_PER_VEC * 4), %rdx > > > + jbe L(exit_loop) > > > + movdqa (%rdi), %xmm0 > > > + movdqa 16(%rdi), %xmm2 > > > + movdqa 32(%rdi), %xmm3 > > > + movdqa 48(%rdi), %xmm4 > > > + > > > + PCMPEQ %xmm1, %xmm0 > > > + PCMPEQ %xmm1, %xmm2 > > > + PCMPEQ %xmm1, %xmm3 > > > + PCMPEQ %xmm1, %xmm4 > > > > > > -#include "../memchr.S" > > > + pmaxub %xmm0, %xmm3 > > > + pmaxub %xmm2, %xmm4 > > > + pmaxub %xmm3, %xmm4 > > > + pmovmskb %xmm4, %eax > > > + > > > + add $64, %rdi > > > + > > > + test %eax, %eax > > > + jz L(align64_loop) > > > + > > > + sub $64, %rdi > > > + > > > + pmovmskb %xmm0, %eax > > > + test %eax, %eax > > > + jnz L(matches) > > > + > > > + pmovmskb %xmm2, %eax > > > + test %eax, %eax > > > + jnz L(matches16) > > > + > > > + movdqa 32(%rdi), %xmm3 > > > + PCMPEQ %xmm1, %xmm3 > > > + > > > + PCMPEQ 48(%rdi), %xmm1 > > > + pmovmskb %xmm3, %eax > > > + test %eax, %eax > > > + jnz L(matches32) > > > + > > > + pmovmskb %xmm1, %eax > > > + bsf %eax, %eax > > > + lea 48(%rdi, %rax), %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(exit_loop): > > > + add $(CHAR_PER_VEC * 2), %edx > > > + jle L(exit_loop_32) > > > + > > > + movdqa (%rdi), %xmm0 > > > + PCMPEQ %xmm1, %xmm0 > > > + pmovmskb %xmm0, %eax > > > + test %eax, %eax > > > + jnz L(matches) > > > + > > > + movdqa 16(%rdi), %xmm2 > > > + PCMPEQ %xmm1, %xmm2 > > > + pmovmskb %xmm2, %eax > > > + test %eax, %eax > > > + jnz L(matches16) > > > + > > > + movdqa 32(%rdi), %xmm3 > > > + PCMPEQ %xmm1, %xmm3 > > > + pmovmskb %xmm3, %eax > > > + test %eax, %eax > > > + jnz L(matches32_1) > > > + sub $CHAR_PER_VEC, %edx > > > + jle L(return_null) > > > + > > > + PCMPEQ 48(%rdi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + test 
%eax, %eax > > > + jnz L(matches48_1) > > > + xor %eax, %eax > > > + ret > > > + > > > + .p2align 4 > > > +L(exit_loop_32): > > > + add $(CHAR_PER_VEC * 2), %edx > > > + movdqa (%rdi), %xmm0 > > > + PCMPEQ %xmm1, %xmm0 > > > + pmovmskb %xmm0, %eax > > > + test %eax, %eax > > > + jnz L(matches_1) > > > + sub $CHAR_PER_VEC, %edx > > > + jbe L(return_null) > > > + > > > + PCMPEQ 16(%rdi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + test %eax, %eax > > > + jnz L(matches16_1) > > > + xor %eax, %eax > > > + ret > > > + > > > + .p2align 4 > > > +L(matches0): > > > + bsf %eax, %eax > > > + lea -16(%rax, %rdi), %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(matches): > > > + bsf %eax, %eax > > > + add %rdi, %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(matches16): > > > + bsf %eax, %eax > > > + lea 16(%rax, %rdi), %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(matches32): > > > + bsf %eax, %eax > > > + lea 32(%rax, %rdi), %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(matches_1): > > > + bsf %eax, %eax > > > +# ifdef USE_AS_WMEMCHR > > > + mov %eax, %esi > > > + shr $2, %esi > > > + sub %rsi, %rdx > > > +# else > > > + sub %rax, %rdx > > > +# endif > > > + jbe L(return_null) > > > + add %rdi, %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(matches16_1): > > > + bsf %eax, %eax > > > +# ifdef USE_AS_WMEMCHR > > > + mov %eax, %esi > > > + shr $2, %esi > > > + sub %rsi, %rdx > > > +# else > > > + sub %rax, %rdx > > > +# endif > > > + jbe L(return_null) > > > + lea 16(%rdi, %rax), %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(matches32_1): > > > + bsf %eax, %eax > > > +# ifdef USE_AS_WMEMCHR > > > + mov %eax, %esi > > > + shr $2, %esi > > > + sub %rsi, %rdx > > > +# else > > > + sub %rax, %rdx > > > +# endif > > > + jbe L(return_null) > > > + lea 32(%rdi, %rax), %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(matches48_1): > > > + bsf %eax, %eax > > > +# ifdef USE_AS_WMEMCHR > > > + mov %eax, %esi > > > + shr $2, %esi > > > + sub %rsi, %rdx > > > +# else > > > + sub %rax, %rdx > > > +# endif > > > + jbe L(return_null) > > > + lea 48(%rdi, %rax), %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(return_null): > > > + xor %eax, %eax > > > + ret > > > +END(MEMCHR) > > > +#endif > > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > > > index acc5f6e2fb..5c1dcd3ca7 100644 > > > --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > > > @@ -1,4 +1,7 @@ > > > -#define MEMCHR __rawmemchr_avx2_rtm > > > -#define USE_AS_RAWMEMCHR 1 > > > +#ifndef RAWMEMCHR > > > +# define RAWMEMCHR __rawmemchr_avx2_rtm > > > +#endif > > > +#define USE_AS_RAWMEMCHR 1 > > > +#define MEMCHR RAWMEMCHR > > > > > > #include "memchr-avx2-rtm.S" > > > > Will we ever use the RTM version as the default? > > We had talked about it and agreed not to. I think we can > safely say we don't need it for the RTLD default because > we know there are no transactions in rtld. > > As for the non-multiarch build selection it's a bit more ambiguous. Since the RTM version isn't used as the default, we should leave the RTM .S files alone.
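To make that decision concrete, here is a minimal sketch of why no RTM file can ever become the default. It only paraphrases the selection in isa-default-impl.h (the DEFAULT_IMPL_V* names are from the patch; treat the exact cascade as an approximation, not the verbatim header). Since the including files only ever name the plain sse2/avx2/evex sources, the compile-time selection cannot reach an RTM variant:

/* Sketch, not verbatim isa-default-impl.h.  The including file
   (e.g. rawmemchr.S) defines DEFAULT_IMPL_V1/V3/V4 to the plain
   sse2/avx2/evex sources; no *-rtm.S file is ever named here.  */
#if MINIMUM_X86_ISA_LEVEL == 1
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V1
#elif MINIMUM_X86_ISA_LEVEL == 2
# ifdef DEFAULT_IMPL_V2
#  define ISA_DEFAULT_IMPL DEFAULT_IMPL_V2
# else
#  define ISA_DEFAULT_IMPL DEFAULT_IMPL_V1	/* guaranteed replacement */
# endif
#elif MINIMUM_X86_ISA_LEVEL == 3
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V3
#elif MINIMUM_X86_ISA_LEVEL == 4
# define ISA_DEFAULT_IMPL DEFAULT_IMPL_V4
#else
# error "Unsupported ISA Level!"
#endif
#include ISA_DEFAULT_IMPL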
> > > > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > > > index 128f9ea637..d6bff28757 100644 > > > --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > > > @@ -1,4 +1,7 @@ > > > -#define MEMCHR __rawmemchr_avx2 > > > -#define USE_AS_RAWMEMCHR 1 > > > +#ifndef RAWMEMCHR > > > +# define RAWMEMCHR __rawmemchr_avx2 > > > +#endif > > > +#define USE_AS_RAWMEMCHR 1 > > > +#define MEMCHR RAWMEMCHR > > > > > > #include "memchr-avx2.S" > > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > > > index deda1ca395..8ff7f27c9c 100644 > > > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > > > @@ -1,3 +1,7 @@ > > > -#define MEMCHR __rawmemchr_evex_rtm > > > -#define USE_AS_RAWMEMCHR 1 > > > +#ifndef RAWMEMCHR > > > +# define RAWMEMCHR __rawmemchr_evex_rtm > > > +#endif > > > +#define USE_AS_RAWMEMCHR 1 > > > +#define MEMCHR RAWMEMCHR > > > + > > > > Will we ever use the RTM version as the default? > > > > > #include "memchr-evex-rtm.S" > > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > > > index ec942b77ba..dc1c450699 100644 > > > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S > > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > > > @@ -1,4 +1,7 @@ > > > -#define MEMCHR __rawmemchr_evex > > > -#define USE_AS_RAWMEMCHR 1 > > > +#ifndef RAWMEMCHR > > > +# define RAWMEMCHR __rawmemchr_evex > > > +#endif > > > +#define USE_AS_RAWMEMCHR 1 > > > +#define MEMCHR RAWMEMCHR > > > > > > #include "memchr-evex.S" > > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > > > index 3841c14c34..e2c2e20d85 100644 > > > --- a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > > > @@ -16,14 +16,192 @@ > > > License along with the GNU C Library; if not, see > > > <https://www.gnu.org/licenses/>. */ > > > > > > -/* Define multiple versions only for the definition in libc. */ > > > -#if IS_IN (libc) > > > -# define __rawmemchr __rawmemchr_sse2 > > > - > > > -# undef weak_alias > > > -# define weak_alias(__rawmemchr, rawmemchr) > > > -# undef libc_hidden_def > > > -# define libc_hidden_def(__rawmemchr) > > > -#endif > > > +#include <isa-level.h> > > > +#include <sysdep.h> > > > + > > > +/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation > > > + so we need this to build for ISA V2 builds. */ > > > +#if ISA_SHOULD_BUILD (2) > > > + > > > +# ifndef RAWMEMCHR > > > +# define RAWMEMCHR __rawmemchr_sse2 > > > +# endif > > > + > > > + .text > > > +ENTRY (RAWMEMCHR) > > > + movd %rsi, %xmm1 > > > + mov %rdi, %rcx > > > + > > > + punpcklbw %xmm1, %xmm1 > > > + punpcklbw %xmm1, %xmm1 > > > + > > > + and $63, %rcx > > > + pshufd $0, %xmm1, %xmm1 > > > + > > > + cmp $48, %rcx > > > + ja L(crosscache) > > > + > > > + movdqu (%rdi), %xmm0 > > > + pcmpeqb %xmm1, %xmm0 > > > +/* Check if there is a match. */ > > > + pmovmskb %xmm0, %eax > > > + test %eax, %eax > > > + > > > + jnz L(matches) > > > + add $16, %rdi > > > + and $-16, %rdi > > > + jmp L(loop_prolog) > > > + > > > + .p2align 4 > > > +L(crosscache): > > > + and $15, %rcx > > > + and $-16, %rdi > > > + movdqa (%rdi), %xmm0 > > > + > > > + pcmpeqb %xmm1, %xmm0 > > > +/* Check if there is a match. */ > > > + pmovmskb %xmm0, %eax > > > +/* Remove the leading bytes. 
*/ > > > + sar %cl, %eax > > > + test %eax, %eax > > > + je L(unaligned_no_match) > > > +/* Check which byte is a match. */ > > > + bsf %eax, %eax > > > + > > > + add %rdi, %rax > > > + add %rcx, %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(unaligned_no_match): > > > + add $16, %rdi > > > + > > > + .p2align 4 > > > +L(loop_prolog): > > > + movdqa (%rdi), %xmm0 > > > + pcmpeqb %xmm1, %xmm0 > > > + pmovmskb %xmm0, %eax > > > + test %eax, %eax > > > + jnz L(matches) > > > + > > > + movdqa 16(%rdi), %xmm2 > > > + pcmpeqb %xmm1, %xmm2 > > > + pmovmskb %xmm2, %eax > > > + test %eax, %eax > > > + jnz L(matches16) > > > + > > > + movdqa 32(%rdi), %xmm3 > > > + pcmpeqb %xmm1, %xmm3 > > > + pmovmskb %xmm3, %eax > > > + test %eax, %eax > > > + jnz L(matches32) > > > + > > > + movdqa 48(%rdi), %xmm4 > > > + pcmpeqb %xmm1, %xmm4 > > > + add $64, %rdi > > > + pmovmskb %xmm4, %eax > > > + test %eax, %eax > > > + jnz L(matches0) > > > + > > > + test $0x3f, %rdi > > > + jz L(align64_loop) > > > + > > > + movdqa (%rdi), %xmm0 > > > + pcmpeqb %xmm1, %xmm0 > > > + pmovmskb %xmm0, %eax > > > + test %eax, %eax > > > + jnz L(matches) > > > + > > > + movdqa 16(%rdi), %xmm2 > > > + pcmpeqb %xmm1, %xmm2 > > > + pmovmskb %xmm2, %eax > > > + test %eax, %eax > > > + jnz L(matches16) > > > > > > -#include "../rawmemchr.S" > > > + movdqa 32(%rdi), %xmm3 > > > + pcmpeqb %xmm1, %xmm3 > > > + pmovmskb %xmm3, %eax > > > + test %eax, %eax > > > + jnz L(matches32) > > > + > > > + movdqa 48(%rdi), %xmm3 > > > + pcmpeqb %xmm1, %xmm3 > > > + pmovmskb %xmm3, %eax > > > + > > > + add $64, %rdi > > > + test %eax, %eax > > > + jnz L(matches0) > > > + > > > + and $-64, %rdi > > > + > > > + .p2align 4 > > > +L(align64_loop): > > > + movdqa (%rdi), %xmm0 > > > + movdqa 16(%rdi), %xmm2 > > > + movdqa 32(%rdi), %xmm3 > > > + movdqa 48(%rdi), %xmm4 > > > + > > > + pcmpeqb %xmm1, %xmm0 > > > + pcmpeqb %xmm1, %xmm2 > > > + pcmpeqb %xmm1, %xmm3 > > > + pcmpeqb %xmm1, %xmm4 > > > + > > > + pmaxub %xmm0, %xmm3 > > > + pmaxub %xmm2, %xmm4 > > > + pmaxub %xmm3, %xmm4 > > > + pmovmskb %xmm4, %eax > > > + > > > + add $64, %rdi > > > + > > > + test %eax, %eax > > > + jz L(align64_loop) > > > + > > > + sub $64, %rdi > > > + > > > + pmovmskb %xmm0, %eax > > > + test %eax, %eax > > > + jnz L(matches) > > > + > > > + pmovmskb %xmm2, %eax > > > + test %eax, %eax > > > + jnz L(matches16) > > > + > > > + movdqa 32(%rdi), %xmm3 > > > + pcmpeqb %xmm1, %xmm3 > > > + > > > + pcmpeqb 48(%rdi), %xmm1 > > > + pmovmskb %xmm3, %eax > > > + test %eax, %eax > > > + jnz L(matches32) > > > + > > > + pmovmskb %xmm1, %eax > > > + bsf %eax, %eax > > > + lea 48(%rdi, %rax), %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(matches0): > > > + bsf %eax, %eax > > > + lea -16(%rax, %rdi), %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(matches): > > > + bsf %eax, %eax > > > + add %rdi, %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(matches16): > > > + bsf %eax, %eax > > > + lea 16(%rax, %rdi), %rax > > > + ret > > > + > > > + .p2align 4 > > > +L(matches32): > > > + bsf %eax, %eax > > > + lea 32(%rax, %rdi), %rax > > > + ret > > > + > > > +END (RAWMEMCHR) > > > +#endif > > > diff --git a/sysdeps/x86_64/multiarch/rtld-memchr.S b/sysdeps/x86_64/multiarch/rtld-memchr.S > > > new file mode 100644 > > > index 0000000000..a14b192bed > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/rtld-memchr.S > > > @@ -0,0 +1,18 @@ > > > +/* Copyright (C) 2022 Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. 
> > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. */ > > > + > > > +#include "../memchr.S" > > > diff --git a/sysdeps/x86_64/multiarch/rtld-rawmemchr.S b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S > > > new file mode 100644 > > > index 0000000000..5d4110a052 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S > > > @@ -0,0 +1,18 @@ > > > +/* Copyright (C) 2022 Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. > > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. 
*/ > > > + > > > +#include "../rawmemchr.S" > > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > > > index 58ed21db01..2a1cff5b05 100644 > > > --- a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > > > +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > > > @@ -1,4 +1,7 @@ > > > -#define MEMCHR __wmemchr_avx2_rtm > > > -#define USE_AS_WMEMCHR 1 > > > +#ifndef WMEMCHR > > > +# define WMEMCHR __wmemchr_avx2_rtm > > > +#endif > > > +#define USE_AS_WMEMCHR 1 > > > +#define MEMCHR WMEMCHR > > > > > > #include "memchr-avx2-rtm.S" > > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S > > > index 282854f1a1..2bf93fd84b 100644 > > > --- a/sysdeps/x86_64/multiarch/wmemchr-avx2.S > > > +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S > > > @@ -1,4 +1,7 @@ > > > -#define MEMCHR __wmemchr_avx2 > > > -#define USE_AS_WMEMCHR 1 > > > +#ifndef WMEMCHR > > > +# define WMEMCHR __wmemchr_avx2 > > > +#endif > > > +#define USE_AS_WMEMCHR 1 > > > +#define MEMCHR WMEMCHR > > > > > > #include "memchr-avx2.S" > > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > > > index a346cd35a1..c67309e8a1 100644 > > > --- a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > > > @@ -1,3 +1,7 @@ > > > -#define MEMCHR __wmemchr_evex_rtm > > > -#define USE_AS_WMEMCHR 1 > > > +#ifndef WMEMCHR > > > +# define WMEMCHR __wmemchr_evex_rtm > > > +#endif > > > +#define USE_AS_WMEMCHR 1 > > > +#define MEMCHR WMEMCHR > > > + > > > #include "memchr-evex-rtm.S" > > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S > > > index 06cd0f9f5a..5512d5cdc3 100644 > > > --- a/sysdeps/x86_64/multiarch/wmemchr-evex.S > > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S > > > @@ -1,4 +1,7 @@ > > > -#define MEMCHR __wmemchr_evex > > > -#define USE_AS_WMEMCHR 1 > > > +#ifndef WMEMCHR > > > +# define WMEMCHR __wmemchr_evex > > > +#endif > > > +#define USE_AS_WMEMCHR 1 > > > +#define MEMCHR WMEMCHR > > > > > > #include "memchr-evex.S" > > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-sse2.S b/sysdeps/x86_64/multiarch/wmemchr-sse2.S > > > index 70a965d552..b675a070d4 100644 > > > --- a/sysdeps/x86_64/multiarch/wmemchr-sse2.S > > > +++ b/sysdeps/x86_64/multiarch/wmemchr-sse2.S > > > @@ -1,4 +1,7 @@ > > > -#define USE_AS_WMEMCHR 1 > > > -#define wmemchr __wmemchr_sse2 > > > +#ifndef WMEMCHR > > > +# define WMEMCHR __wmemchr_sse2 > > > +#endif > > > +#define USE_AS_WMEMCHR 1 > > > +#define MEMCHR WMEMCHR > > > > > > -#include "../memchr.S" > > > +#include "memchr-sse2.S" > > > diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S > > > index 4c1a3383b9..ba7e5202e6 100644 > > > --- a/sysdeps/x86_64/rawmemchr.S > > > +++ b/sysdeps/x86_64/rawmemchr.S > > > @@ -17,185 +17,13 @@ > > > License along with the GNU C Library; if not, see > > > <https://www.gnu.org/licenses/>. 
*/ > > > > > > -#include <sysdep.h> > > > +#define RAWMEMCHR __rawmemchr > > > > > > - .text > > > -ENTRY (__rawmemchr) > > > - movd %rsi, %xmm1 > > > - mov %rdi, %rcx > > > +#define DEFAULT_IMPL_V1 "multiarch/rawmemchr-sse2.S" > > > +#define DEFAULT_IMPL_V3 "multiarch/rawmemchr-avx2.S" > > > +#define DEFAULT_IMPL_V4 "multiarch/rawmemchr-evex.S" > > > > > > - punpcklbw %xmm1, %xmm1 > > > - punpcklbw %xmm1, %xmm1 > > > - > > > - and $63, %rcx > > > - pshufd $0, %xmm1, %xmm1 > > > - > > > - cmp $48, %rcx > > > - ja L(crosscache) > > > - > > > - movdqu (%rdi), %xmm0 > > > - pcmpeqb %xmm1, %xmm0 > > > -/* Check if there is a match. */ > > > - pmovmskb %xmm0, %eax > > > - test %eax, %eax > > > - > > > - jnz L(matches) > > > - add $16, %rdi > > > - and $-16, %rdi > > > - jmp L(loop_prolog) > > > - > > > - .p2align 4 > > > -L(crosscache): > > > - and $15, %rcx > > > - and $-16, %rdi > > > - movdqa (%rdi), %xmm0 > > > - > > > - pcmpeqb %xmm1, %xmm0 > > > -/* Check if there is a match. */ > > > - pmovmskb %xmm0, %eax > > > -/* Remove the leading bytes. */ > > > - sar %cl, %eax > > > - test %eax, %eax > > > - je L(unaligned_no_match) > > > -/* Check which byte is a match. */ > > > - bsf %eax, %eax > > > - > > > - add %rdi, %rax > > > - add %rcx, %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(unaligned_no_match): > > > - add $16, %rdi > > > - > > > - .p2align 4 > > > -L(loop_prolog): > > > - movdqa (%rdi), %xmm0 > > > - pcmpeqb %xmm1, %xmm0 > > > - pmovmskb %xmm0, %eax > > > - test %eax, %eax > > > - jnz L(matches) > > > - > > > - movdqa 16(%rdi), %xmm2 > > > - pcmpeqb %xmm1, %xmm2 > > > - pmovmskb %xmm2, %eax > > > - test %eax, %eax > > > - jnz L(matches16) > > > - > > > - movdqa 32(%rdi), %xmm3 > > > - pcmpeqb %xmm1, %xmm3 > > > - pmovmskb %xmm3, %eax > > > - test %eax, %eax > > > - jnz L(matches32) > > > - > > > - movdqa 48(%rdi), %xmm4 > > > - pcmpeqb %xmm1, %xmm4 > > > - add $64, %rdi > > > - pmovmskb %xmm4, %eax > > > - test %eax, %eax > > > - jnz L(matches0) > > > - > > > - test $0x3f, %rdi > > > - jz L(align64_loop) > > > - > > > - movdqa (%rdi), %xmm0 > > > - pcmpeqb %xmm1, %xmm0 > > > - pmovmskb %xmm0, %eax > > > - test %eax, %eax > > > - jnz L(matches) > > > - > > > - movdqa 16(%rdi), %xmm2 > > > - pcmpeqb %xmm1, %xmm2 > > > - pmovmskb %xmm2, %eax > > > - test %eax, %eax > > > - jnz L(matches16) > > > - > > > - movdqa 32(%rdi), %xmm3 > > > - pcmpeqb %xmm1, %xmm3 > > > - pmovmskb %xmm3, %eax > > > - test %eax, %eax > > > - jnz L(matches32) > > > - > > > - movdqa 48(%rdi), %xmm3 > > > - pcmpeqb %xmm1, %xmm3 > > > - pmovmskb %xmm3, %eax > > > - > > > - add $64, %rdi > > > - test %eax, %eax > > > - jnz L(matches0) > > > - > > > - and $-64, %rdi > > > - > > > - .p2align 4 > > > -L(align64_loop): > > > - movdqa (%rdi), %xmm0 > > > - movdqa 16(%rdi), %xmm2 > > > - movdqa 32(%rdi), %xmm3 > > > - movdqa 48(%rdi), %xmm4 > > > - > > > - pcmpeqb %xmm1, %xmm0 > > > - pcmpeqb %xmm1, %xmm2 > > > - pcmpeqb %xmm1, %xmm3 > > > - pcmpeqb %xmm1, %xmm4 > > > - > > > - pmaxub %xmm0, %xmm3 > > > - pmaxub %xmm2, %xmm4 > > > - pmaxub %xmm3, %xmm4 > > > - pmovmskb %xmm4, %eax > > > - > > > - add $64, %rdi > > > - > > > - test %eax, %eax > > > - jz L(align64_loop) > > > - > > > - sub $64, %rdi > > > - > > > - pmovmskb %xmm0, %eax > > > - test %eax, %eax > > > - jnz L(matches) > > > - > > > - pmovmskb %xmm2, %eax > > > - test %eax, %eax > > > - jnz L(matches16) > > > - > > > - movdqa 32(%rdi), %xmm3 > > > - pcmpeqb %xmm1, %xmm3 > > > - > > > - pcmpeqb 48(%rdi), %xmm1 > > > - pmovmskb %xmm3, %eax > > > - test 
%eax, %eax > > > - jnz L(matches32) > > > - > > > - pmovmskb %xmm1, %eax > > > - bsf %eax, %eax > > > - lea 48(%rdi, %rax), %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(matches0): > > > - bsf %eax, %eax > > > - lea -16(%rax, %rdi), %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(matches): > > > - bsf %eax, %eax > > > - add %rdi, %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(matches16): > > > - bsf %eax, %eax > > > - lea 16(%rax, %rdi), %rax > > > - ret > > > - > > > - .p2align 4 > > > -L(matches32): > > > - bsf %eax, %eax > > > - lea 32(%rax, %rdi), %rax > > > - ret > > > - > > > -END (__rawmemchr) > > > +#include "isa-default-impl.h" > > > > > > weak_alias (__rawmemchr, rawmemchr) > > > -libc_hidden_builtin_def (__rawmemchr) > > > +libc_hidden_def (__rawmemchr) > > > diff --git a/sysdeps/x86_64/wmemchr.S b/sysdeps/x86_64/wmemchr.S > > > new file mode 100644 > > > index 0000000000..eef91e556b > > > --- /dev/null > > > +++ b/sysdeps/x86_64/wmemchr.S > > > @@ -0,0 +1,28 @@ > > > +/* Copyright (C) 2011-2022 Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. > > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. */ > > > + > > > +#define WMEMCHR __wmemchr > > > + > > > +#define DEFAULT_IMPL_V1 "multiarch/wmemchr-sse2.S" > > > +#define DEFAULT_IMPL_V3 "multiarch/wmemchr-avx2.S" > > > +#define DEFAULT_IMPL_V4 "multiarch/wmemchr-evex.S" > > > + > > > +#include "isa-default-impl.h" > > > + > > > +libc_hidden_def (__wmemchr) > > > +weak_alias (__wmemchr, wmemchr) > > > +libc_hidden_weak (wmemchr) > > > -- > > > 2.34.1 > > > > > > > > > -- > > H.J.
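A note on the ISA_SHOULD_BUILD guards used throughout the patch: the "guaranteed replacement" rule reduces to a short compile-time test. The following is a sketch of that logic under the names used in the patch, not a verbatim copy of <isa-level.h>:

/* Sketch: build an implementation guarded at level N when the
   configured baseline is at or below N, or when the file is being
   included directly as the non-multiarch default.  rawmemchr has no
   V2 implementation, so its sse2 (V1) file is guarded with
   ISA_SHOULD_BUILD (2): an x86-64-v2 build still needs it, while
   from v3 upward the avx2/evex versions are always sufficient and
   the sse2 code drops out of the build entirely.  */
#define ISA_SHOULD_BUILD(isa_build_level)                        \
  (MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc))   \
      || defined ISA_DEFAULT_IMPL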
On Wed, Jun 22, 2022 at 2:52 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Wed, Jun 22, 2022 at 1:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > 1. Refactor files so that all implementations for in the multiarch > > directory. > > - Essentially moved sse2 {raw|w}memchr.S implementation to > > multiarch/{raw|w}memchr-sse2.S > > > > - The non-multiarch {raw|w}memchr.S file now only includes one of > > the implementations in the multiarch directory based on the > > compiled ISA level (only used for non-multiarch builds. > > Otherwise we go through the ifunc selector). > > > > 2. Add ISA level build guards to different implementations. > > - I.e memchr-avx2.S which is ISA level 3 will only build if > > compiled ISA level <= 3. Otherwise there is no reason to include > > it as we will always use one of the ISA level 4 > > implementations (memchr-evex{-rtm}.S). > > > > 3. Add new multiarch/rtld-{raw}memchr.S that just include the > > non-multiarch {raw}memchr.S which will in turn select the best > > implementation based on the compiled ISA level. > > > > 4. Refactor the ifunc selector and ifunc implementation list to use > > the ISA level aware wrapper macros that allow functions below the > > compiled ISA level (with a guranteed replacement) to be skipped. > > - Guranteed replacement essentially means that for any ISA level > > build there must be a function that the baseline of the ISA > > supports. So for {raw|w}memchr.S since there is not ISA level 2 > > function, the ISA level 2 build still includes the ISA level > > 1 (sse2) function. Once we reach the ISA level 3 build, however, > > {raw|w}memchr-avx2{-rtm}.S will always be sufficient so the ISA > > level 1 implementation ({raw|w}memchr-sse2.S) will not be built. > > > > Tested with and without multiarch on x86_64 for ISA levels: > > {generic, x86-64-v2, x86-64-v3, x86-64-v4} > > > > And m32 with and without multiarch. > > --- > > sysdeps/x86_64/isa-default-impl.h | 8 + > > sysdeps/x86_64/memchr.S | 357 +---------------- > > sysdeps/x86_64/multiarch/ifunc-evex.h | 29 +- > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 72 ++-- > > sysdeps/x86_64/multiarch/memchr-avx2.S | 5 +- > > sysdeps/x86_64/multiarch/memchr-evex.S | 5 +- > > sysdeps/x86_64/multiarch/memchr-sse2.S | 363 +++++++++++++++++- > > sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S | 7 +- > > sysdeps/x86_64/multiarch/rawmemchr-avx2.S | 7 +- > > sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 8 +- > > sysdeps/x86_64/multiarch/rawmemchr-evex.S | 7 +- > > sysdeps/x86_64/multiarch/rawmemchr-sse2.S | 198 +++++++++- > > sysdeps/x86_64/multiarch/rtld-memchr.S | 18 + > > sysdeps/x86_64/multiarch/rtld-rawmemchr.S | 18 + > > sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S | 7 +- > > sysdeps/x86_64/multiarch/wmemchr-avx2.S | 7 +- > > sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S | 8 +- > > sysdeps/x86_64/multiarch/wmemchr-evex.S | 7 +- > > sysdeps/x86_64/multiarch/wmemchr-sse2.S | 9 +- > > sysdeps/x86_64/rawmemchr.S | 184 +-------- > > sysdeps/x86_64/wmemchr.S | 28 ++ > > 21 files changed, 740 insertions(+), 612 deletions(-) > > create mode 100644 sysdeps/x86_64/multiarch/rtld-memchr.S > > create mode 100644 sysdeps/x86_64/multiarch/rtld-rawmemchr.S > > create mode 100644 sysdeps/x86_64/wmemchr.S > > > > diff --git a/sysdeps/x86_64/isa-default-impl.h b/sysdeps/x86_64/isa-default-impl.h > > index 34634668e5..b374a38b8b 100644 > > --- a/sysdeps/x86_64/isa-default-impl.h > > +++ b/sysdeps/x86_64/isa-default-impl.h > > @@ -46,4 +46,12 @@ > > # error "Unsupported ISA Level!" 
> > #endif > > > > +#if IS_IN(rtld) && !defined USE_MULTIARCH > > +# error "RTLD version should only exist in multiarch build" > > +#endif > > + > > +#if defined USE_MULTIARCH && !IS_IN(rtld) > > +# error "Multiarch build should not use ISA_DEFAULT_IMPL without RTLD" > > +#endif > > Please do > > #if IS_IN (rtld) Fixed in v12. > #else > #endif > > > #include ISA_DEFAULT_IMPL > > diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S > > index a160fd9b00..20b43508c4 100644 > > --- a/sysdeps/x86_64/memchr.S > > +++ b/sysdeps/x86_64/memchr.S > > @@ -15,358 +15,13 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#include <sysdep.h> > > +#define MEMCHR __memchr > > > > -#ifdef USE_AS_WMEMCHR > > -# define MEMCHR wmemchr > > -# define PCMPEQ pcmpeqd > > -# define CHAR_PER_VEC 4 > > -#else > > -# define MEMCHR memchr > > -# define PCMPEQ pcmpeqb > > -# define CHAR_PER_VEC 16 > > -#endif > > +#define DEFAULT_IMPL_V1 "multiarch/memchr-sse2.S" > > +#define DEFAULT_IMPL_V3 "multiarch/memchr-avx2.S" > > +#define DEFAULT_IMPL_V4 "multiarch/memchr-evex.S" > > > > -/* fast SSE2 version with using pmaxub and 64 byte loop */ > > +#include "isa-default-impl.h" > > > > - .text > > -ENTRY(MEMCHR) > > - movd %esi, %xmm1 > > - mov %edi, %ecx > > - > > -#ifdef __ILP32__ > > - /* Clear the upper 32 bits. */ > > - movl %edx, %edx > > -#endif > > -#ifdef USE_AS_WMEMCHR > > - test %RDX_LP, %RDX_LP > > - jz L(return_null) > > -#else > > - punpcklbw %xmm1, %xmm1 > > - test %RDX_LP, %RDX_LP > > - jz L(return_null) > > - punpcklbw %xmm1, %xmm1 > > -#endif > > - > > - and $63, %ecx > > - pshufd $0, %xmm1, %xmm1 > > - > > - cmp $48, %ecx > > - ja L(crosscache) > > - > > - movdqu (%rdi), %xmm0 > > - PCMPEQ %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - > > - jnz L(matches_1) > > - sub $CHAR_PER_VEC, %rdx > > - jbe L(return_null) > > - add $16, %rdi > > - and $15, %ecx > > - and $-16, %rdi > > -#ifdef USE_AS_WMEMCHR > > - shr $2, %ecx > > -#endif > > - add %rcx, %rdx > > - sub $(CHAR_PER_VEC * 4), %rdx > > - jbe L(exit_loop) > > - jmp L(loop_prolog) > > - > > - .p2align 4 > > -L(crosscache): > > - and $15, %ecx > > - and $-16, %rdi > > - movdqa (%rdi), %xmm0 > > - > > - PCMPEQ %xmm1, %xmm0 > > - /* Check if there is a match. */ > > - pmovmskb %xmm0, %eax > > - /* Remove the leading bytes. */ > > - sar %cl, %eax > > - test %eax, %eax > > - je L(unaligned_no_match) > > - /* Check which byte is a match. */ > > - bsf %eax, %eax > > -#ifdef USE_AS_WMEMCHR > > - mov %eax, %esi > > - shr $2, %esi > > - sub %rsi, %rdx > > -#else > > - sub %rax, %rdx > > -#endif > > - jbe L(return_null) > > - add %rdi, %rax > > - add %rcx, %rax > > - ret > > - > > - .p2align 4 > > -L(unaligned_no_match): > > - /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using > > - "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void > > - possible addition overflow. 
*/ > > - neg %rcx > > - add $16, %rcx > > -#ifdef USE_AS_WMEMCHR > > - shr $2, %ecx > > -#endif > > - sub %rcx, %rdx > > - jbe L(return_null) > > - add $16, %rdi > > - sub $(CHAR_PER_VEC * 4), %rdx > > - jbe L(exit_loop) > > - > > - .p2align 4 > > -L(loop_prolog): > > - movdqa (%rdi), %xmm0 > > - PCMPEQ %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - movdqa 16(%rdi), %xmm2 > > - PCMPEQ %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - PCMPEQ %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 48(%rdi), %xmm4 > > - PCMPEQ %xmm1, %xmm4 > > - add $64, %rdi > > - pmovmskb %xmm4, %eax > > - test %eax, %eax > > - jnz L(matches0) > > - > > - test $0x3f, %rdi > > - jz L(align64_loop) > > - > > - sub $(CHAR_PER_VEC * 4), %rdx > > - jbe L(exit_loop) > > - > > - movdqa (%rdi), %xmm0 > > - PCMPEQ %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - movdqa 16(%rdi), %xmm2 > > - PCMPEQ %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - PCMPEQ %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 48(%rdi), %xmm3 > > - PCMPEQ %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - > > - add $64, %rdi > > - test %eax, %eax > > - jnz L(matches0) > > - > > - mov %rdi, %rcx > > - and $-64, %rdi > > - and $63, %ecx > > -#ifdef USE_AS_WMEMCHR > > - shr $2, %ecx > > -#endif > > - add %rcx, %rdx > > - > > - .p2align 4 > > -L(align64_loop): > > - sub $(CHAR_PER_VEC * 4), %rdx > > - jbe L(exit_loop) > > - movdqa (%rdi), %xmm0 > > - movdqa 16(%rdi), %xmm2 > > - movdqa 32(%rdi), %xmm3 > > - movdqa 48(%rdi), %xmm4 > > - > > - PCMPEQ %xmm1, %xmm0 > > - PCMPEQ %xmm1, %xmm2 > > - PCMPEQ %xmm1, %xmm3 > > - PCMPEQ %xmm1, %xmm4 > > - > > - pmaxub %xmm0, %xmm3 > > - pmaxub %xmm2, %xmm4 > > - pmaxub %xmm3, %xmm4 > > - pmovmskb %xmm4, %eax > > - > > - add $64, %rdi > > - > > - test %eax, %eax > > - jz L(align64_loop) > > - > > - sub $64, %rdi > > - > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - PCMPEQ %xmm1, %xmm3 > > - > > - PCMPEQ 48(%rdi), %xmm1 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - pmovmskb %xmm1, %eax > > - bsf %eax, %eax > > - lea 48(%rdi, %rax), %rax > > - ret > > - > > - .p2align 4 > > -L(exit_loop): > > - add $(CHAR_PER_VEC * 2), %edx > > - jle L(exit_loop_32) > > - > > - movdqa (%rdi), %xmm0 > > - PCMPEQ %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - movdqa 16(%rdi), %xmm2 > > - PCMPEQ %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - PCMPEQ %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32_1) > > - sub $CHAR_PER_VEC, %edx > > - jle L(return_null) > > - > > - PCMPEQ 48(%rdi), %xmm1 > > - pmovmskb %xmm1, %eax > > - test %eax, %eax > > - jnz L(matches48_1) > > - xor %eax, %eax > > - ret > > - > > - .p2align 4 > > -L(exit_loop_32): > > - add $(CHAR_PER_VEC * 2), %edx > > - movdqa (%rdi), %xmm0 > > - PCMPEQ %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches_1) > > - sub $CHAR_PER_VEC, %edx > > - jbe L(return_null) > > - > > - 
PCMPEQ 16(%rdi), %xmm1 > > - pmovmskb %xmm1, %eax > > - test %eax, %eax > > - jnz L(matches16_1) > > - xor %eax, %eax > > - ret > > - > > - .p2align 4 > > -L(matches0): > > - bsf %eax, %eax > > - lea -16(%rax, %rdi), %rax > > - ret > > - > > - .p2align 4 > > -L(matches): > > - bsf %eax, %eax > > - add %rdi, %rax > > - ret > > - > > - .p2align 4 > > -L(matches16): > > - bsf %eax, %eax > > - lea 16(%rax, %rdi), %rax > > - ret > > - > > - .p2align 4 > > -L(matches32): > > - bsf %eax, %eax > > - lea 32(%rax, %rdi), %rax > > - ret > > - > > - .p2align 4 > > -L(matches_1): > > - bsf %eax, %eax > > -#ifdef USE_AS_WMEMCHR > > - mov %eax, %esi > > - shr $2, %esi > > - sub %rsi, %rdx > > -#else > > - sub %rax, %rdx > > -#endif > > - jbe L(return_null) > > - add %rdi, %rax > > - ret > > - > > - .p2align 4 > > -L(matches16_1): > > - bsf %eax, %eax > > -#ifdef USE_AS_WMEMCHR > > - mov %eax, %esi > > - shr $2, %esi > > - sub %rsi, %rdx > > -#else > > - sub %rax, %rdx > > -#endif > > - jbe L(return_null) > > - lea 16(%rdi, %rax), %rax > > - ret > > - > > - .p2align 4 > > -L(matches32_1): > > - bsf %eax, %eax > > -#ifdef USE_AS_WMEMCHR > > - mov %eax, %esi > > - shr $2, %esi > > - sub %rsi, %rdx > > -#else > > - sub %rax, %rdx > > -#endif > > - jbe L(return_null) > > - lea 32(%rdi, %rax), %rax > > - ret > > - > > - .p2align 4 > > -L(matches48_1): > > - bsf %eax, %eax > > -#ifdef USE_AS_WMEMCHR > > - mov %eax, %esi > > - shr $2, %esi > > - sub %rsi, %rdx > > -#else > > - sub %rax, %rdx > > -#endif > > - jbe L(return_null) > > - lea 48(%rdi, %rax), %rax > > - ret > > - > > - .p2align 4 > > -L(return_null): > > - xor %eax, %eax > > - ret > > -END(MEMCHR) > > - > > -#ifndef USE_AS_WMEMCHR > > -strong_alias (memchr, __memchr) > > +weak_alias (__memchr, memchr) > > libc_hidden_builtin_def(memchr) > > -#endif > > diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h > > index b8f7a12ea2..856c6261f8 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-evex.h > > +++ b/sysdeps/x86_64/multiarch/ifunc-evex.h > > @@ -19,24 +19,28 @@ > > > > #include <init-arch.h> > > > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden; > > > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > > + > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > > > > static inline void * > > IFUNC_SELECTOR (void) > > { > > - const struct cpu_features* cpu_features = __get_cpu_features (); > > - > > - if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) > > - && CPU_FEATURE_USABLE_P (cpu_features, BMI2) > > - && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) > > + const struct cpu_features *cpu_features = __get_cpu_features (); > > + > > + /* NB: The X86_ISA_* feature check macros are evaluated at > > + compile time. 
*/ > > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2) > > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2) > > + && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > + AVX_Fast_Unaligned_Load)) > > { > > - if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > > - && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) > > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) > > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) > > { > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > return OPTIMIZE (evex_rtm); > > @@ -47,9 +51,12 @@ IFUNC_SELECTOR (void) > > if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) > > return OPTIMIZE (avx2_rtm); > > > > - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) > > + if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, > > + Prefer_No_VZEROUPPER)) > > return OPTIMIZE (avx2); > > } > > > > + /* This is unreachable (compile time checked) if ISA level >= 3 > > + so no need for a robust fallback here. */ > > return OPTIMIZE (sse2); > > } > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index 883362f63d..bf52cf96d0 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -25,7 +25,8 @@ > > > > /* Fill ARRAY of MAX elements with IFUNC implementations for function > > NAME supported on target machine and return the number of valid > > - entries. */ > > + entries. Each set of implementations for a given function is sorted in > > + descending order by ISA level. */ > > > > size_t > > __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > @@ -53,24 +54,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > /* Support sysdeps/x86_64/multiarch/memchr.c. */ > > IFUNC_IMPL (i, name, memchr, > > - IFUNC_IMPL_ADD (array, i, memchr, > > - CPU_FEATURE_USABLE (AVX2), > > - __memchr_avx2) > > - IFUNC_IMPL_ADD (array, i, memchr, > > - (CPU_FEATURE_USABLE (AVX2) > > - && CPU_FEATURE_USABLE (RTM)), > > - __memchr_avx2_rtm) > > - IFUNC_IMPL_ADD (array, i, memchr, > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __memchr_evex) > > - IFUNC_IMPL_ADD (array, i, memchr, > > + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __memchr_evex_rtm) > > - IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) > > + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, > > + CPU_FEATURE_USABLE (AVX2), > > + __memchr_avx2) > > + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (RTM)), > > + __memchr_avx2_rtm) > > + /* Can be lowered to V1 if a V2 implementation is added. */ > > + X86_IFUNC_IMPL_ADD_V2 (array, i, memchr, > > + 1, > > + __memchr_sse2)) > > > > /* Support sysdeps/x86_64/multiarch/memcmp.c. */ > > IFUNC_IMPL (i, name, memcmp, > > @@ -288,24 +292,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > /* Support sysdeps/x86_64/multiarch/rawmemchr.c. 
*/ > > IFUNC_IMPL (i, name, rawmemchr, > > - IFUNC_IMPL_ADD (array, i, rawmemchr, > > - CPU_FEATURE_USABLE (AVX2), > > - __rawmemchr_avx2) > > - IFUNC_IMPL_ADD (array, i, rawmemchr, > > - (CPU_FEATURE_USABLE (AVX2) > > - && CPU_FEATURE_USABLE (RTM)), > > - __rawmemchr_avx2_rtm) > > - IFUNC_IMPL_ADD (array, i, rawmemchr, > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __rawmemchr_evex) > > - IFUNC_IMPL_ADD (array, i, rawmemchr, > > + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __rawmemchr_evex_rtm) > > - IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) > > + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, > > + CPU_FEATURE_USABLE (AVX2), > > + __rawmemchr_avx2) > > + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (RTM)), > > + __rawmemchr_avx2_rtm) > > + /* Can be lowered to V1 if a V2 implementation is added. */ > > + X86_IFUNC_IMPL_ADD_V2 (array, i, rawmemchr, > > + 1, > > + __rawmemchr_sse2)) > > > > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > > IFUNC_IMPL (i, name, strlen, > > @@ -748,24 +755,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > /* Support sysdeps/x86_64/multiarch/wmemchr.c. */ > > IFUNC_IMPL (i, name, wmemchr, > > - IFUNC_IMPL_ADD (array, i, wmemchr, > > - CPU_FEATURE_USABLE (AVX2), > > - __wmemchr_avx2) > > - IFUNC_IMPL_ADD (array, i, wmemchr, > > - (CPU_FEATURE_USABLE (AVX2) > > - && CPU_FEATURE_USABLE (RTM)), > > - __wmemchr_avx2_rtm) > > - IFUNC_IMPL_ADD (array, i, wmemchr, > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __wmemchr_evex) > > - IFUNC_IMPL_ADD (array, i, wmemchr, > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __wmemchr_evex_rtm) > > - IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) > > + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, > > + CPU_FEATURE_USABLE (AVX2), > > + __wmemchr_avx2) > > + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (RTM)), > > + __wmemchr_avx2_rtm) > > + /* Can be lowered to V1 if a V2 implementation is added. */ > > + X86_IFUNC_IMPL_ADD_V2 (array, i, wmemchr, > > + 1, > > + __wmemchr_sse2)) > > > > /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ > > IFUNC_IMPL (i, name, wmemcmp, > > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S > > index c5a256eb37..39be5f7083 100644 > > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S > > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S > > @@ -16,9 +16,10 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. 
*/ > > > > -#if IS_IN (libc) > > +#include <isa-level.h> > > +#include <sysdep.h> > > > > -# include <sysdep.h> > > +#if ISA_SHOULD_BUILD (3) > > > > # ifndef MEMCHR > > # define MEMCHR __memchr_avx2 > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S > > index 0fd11b7632..0dd4f1dcce 100644 > > --- a/sysdeps/x86_64/multiarch/memchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S > > @@ -16,9 +16,10 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#if IS_IN (libc) > > +#include <isa-level.h> > > +#include <sysdep.h> > > > > -# include <sysdep.h> > > +#if ISA_SHOULD_BUILD (4) > > > > # ifndef MEMCHR > > # define MEMCHR __memchr_evex > > diff --git a/sysdeps/x86_64/multiarch/memchr-sse2.S b/sysdeps/x86_64/multiarch/memchr-sse2.S > > index 2c6fdd41d6..8c561cd687 100644 > > --- a/sysdeps/x86_64/multiarch/memchr-sse2.S > > +++ b/sysdeps/x86_64/multiarch/memchr-sse2.S > > @@ -16,13 +16,360 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#if IS_IN (libc) > > -# define memchr __memchr_sse2 > > +#include <isa-level.h> > > +#include <sysdep.h> > > > > -# undef strong_alias > > -# define strong_alias(memchr, __memchr) > > -# undef libc_hidden_builtin_def > > -# define libc_hidden_builtin_def(memchr) > > -#endif > > +/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation > > + so we need this to build for ISA V2 builds. */ > > +#if ISA_SHOULD_BUILD (2) > > + > > +# ifndef MEMCHR > > +# define MEMCHR __memchr_sse2 > > +# endif > > +# ifdef USE_AS_WMEMCHR > > +# define PCMPEQ pcmpeqd > > +# define CHAR_PER_VEC 4 > > +# else > > +# define PCMPEQ pcmpeqb > > +# define CHAR_PER_VEC 16 > > +# endif > > + > > +/* fast SSE2 version with using pmaxub and 64 byte loop */ > > + > > + .text > > +ENTRY(MEMCHR) > > + movd %esi, %xmm1 > > + mov %edi, %ecx > > + > > +# ifdef __ILP32__ > > + /* Clear the upper 32 bits. */ > > + movl %edx, %edx > > +# endif > > +# ifdef USE_AS_WMEMCHR > > + test %RDX_LP, %RDX_LP > > + jz L(return_null) > > +# else > > + punpcklbw %xmm1, %xmm1 > > + test %RDX_LP, %RDX_LP > > + jz L(return_null) > > + punpcklbw %xmm1, %xmm1 > > +# endif > > + > > + and $63, %ecx > > + pshufd $0, %xmm1, %xmm1 > > + > > + cmp $48, %ecx > > + ja L(crosscache) > > + > > + movdqu (%rdi), %xmm0 > > + PCMPEQ %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + > > + jnz L(matches_1) > > + sub $CHAR_PER_VEC, %rdx > > + jbe L(return_null) > > + add $16, %rdi > > + and $15, %ecx > > + and $-16, %rdi > > +# ifdef USE_AS_WMEMCHR > > + shr $2, %ecx > > +# endif > > + add %rcx, %rdx > > + sub $(CHAR_PER_VEC * 4), %rdx > > + jbe L(exit_loop) > > + jmp L(loop_prolog) > > + > > + .p2align 4 > > +L(crosscache): > > + and $15, %ecx > > + and $-16, %rdi > > + movdqa (%rdi), %xmm0 > > + > > + PCMPEQ %xmm1, %xmm0 > > + /* Check if there is a match. */ > > + pmovmskb %xmm0, %eax > > + /* Remove the leading bytes. */ > > + sar %cl, %eax > > + test %eax, %eax > > + je L(unaligned_no_match) > > + /* Check which byte is a match. */ > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + add %rdi, %rax > > + add %rcx, %rax > > + ret > > + > > + .p2align 4 > > +L(unaligned_no_match): > > + /* "rcx" is less than 16. 
Calculate "rdx + rcx - 16" by using > > + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void > > + possible addition overflow. */ > > + neg %rcx > > + add $16, %rcx > > +# ifdef USE_AS_WMEMCHR > > + shr $2, %ecx > > +# endif > > + sub %rcx, %rdx > > + jbe L(return_null) > > + add $16, %rdi > > + sub $(CHAR_PER_VEC * 4), %rdx > > + jbe L(exit_loop) > > + > > + .p2align 4 > > +L(loop_prolog): > > + movdqa (%rdi), %xmm0 > > + PCMPEQ %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + PCMPEQ %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + movdqa 48(%rdi), %xmm4 > > + PCMPEQ %xmm1, %xmm4 > > + add $64, %rdi > > + pmovmskb %xmm4, %eax > > + test %eax, %eax > > + jnz L(matches0) > > + > > + test $0x3f, %rdi > > + jz L(align64_loop) > > + > > + sub $(CHAR_PER_VEC * 4), %rdx > > + jbe L(exit_loop) > > + > > + movdqa (%rdi), %xmm0 > > + PCMPEQ %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + PCMPEQ %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + movdqa 48(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + > > + add $64, %rdi > > + test %eax, %eax > > + jnz L(matches0) > > + > > + mov %rdi, %rcx > > + and $-64, %rdi > > + and $63, %ecx > > +# ifdef USE_AS_WMEMCHR > > + shr $2, %ecx > > +# endif > > + add %rcx, %rdx > > + > > + .p2align 4 > > +L(align64_loop): > > + sub $(CHAR_PER_VEC * 4), %rdx > > + jbe L(exit_loop) > > + movdqa (%rdi), %xmm0 > > + movdqa 16(%rdi), %xmm2 > > + movdqa 32(%rdi), %xmm3 > > + movdqa 48(%rdi), %xmm4 > > + > > + PCMPEQ %xmm1, %xmm0 > > + PCMPEQ %xmm1, %xmm2 > > + PCMPEQ %xmm1, %xmm3 > > + PCMPEQ %xmm1, %xmm4 > > > > -#include "../memchr.S" > > + pmaxub %xmm0, %xmm3 > > + pmaxub %xmm2, %xmm4 > > + pmaxub %xmm3, %xmm4 > > + pmovmskb %xmm4, %eax > > + > > + add $64, %rdi > > + > > + test %eax, %eax > > + jz L(align64_loop) > > + > > + sub $64, %rdi > > + > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + > > + PCMPEQ 48(%rdi), %xmm1 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + pmovmskb %xmm1, %eax > > + bsf %eax, %eax > > + lea 48(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(exit_loop): > > + add $(CHAR_PER_VEC * 2), %edx > > + jle L(exit_loop_32) > > + > > + movdqa (%rdi), %xmm0 > > + PCMPEQ %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + PCMPEQ %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + PCMPEQ %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32_1) > > + sub $CHAR_PER_VEC, %edx > > + jle L(return_null) > > + > > + PCMPEQ 48(%rdi), %xmm1 > > + pmovmskb %xmm1, %eax > > + test %eax, %eax > > + jnz L(matches48_1) > > + xor %eax, %eax > > + ret > > + > > + .p2align 4 > > +L(exit_loop_32): > > + add $(CHAR_PER_VEC * 2), %edx > > + movdqa (%rdi), %xmm0 > 
> + PCMPEQ %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches_1) > > + sub $CHAR_PER_VEC, %edx > > + jbe L(return_null) > > + > > + PCMPEQ 16(%rdi), %xmm1 > > + pmovmskb %xmm1, %eax > > + test %eax, %eax > > + jnz L(matches16_1) > > + xor %eax, %eax > > + ret > > + > > + .p2align 4 > > +L(matches0): > > + bsf %eax, %eax > > + lea -16(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches): > > + bsf %eax, %eax > > + add %rdi, %rax > > + ret > > + > > + .p2align 4 > > +L(matches16): > > + bsf %eax, %eax > > + lea 16(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches32): > > + bsf %eax, %eax > > + lea 32(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches_1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + add %rdi, %rax > > + ret > > + > > + .p2align 4 > > +L(matches16_1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + lea 16(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(matches32_1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + lea 32(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(matches48_1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WMEMCHR > > + mov %eax, %esi > > + shr $2, %esi > > + sub %rsi, %rdx > > +# else > > + sub %rax, %rdx > > +# endif > > + jbe L(return_null) > > + lea 48(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(return_null): > > + xor %eax, %eax > > + ret > > +END(MEMCHR) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > > index acc5f6e2fb..5c1dcd3ca7 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __rawmemchr_avx2_rtm > > -#define USE_AS_RAWMEMCHR 1 > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_avx2_rtm > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > > > #include "memchr-avx2-rtm.S" > > Will we ever use the RTM version as the default? > > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > > index 128f9ea637..d6bff28757 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __rawmemchr_avx2 > > -#define USE_AS_RAWMEMCHR 1 > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_avx2 > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > > > #include "memchr-avx2.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > > index deda1ca395..8ff7f27c9c 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > > @@ -1,3 +1,7 @@ > > -#define MEMCHR __rawmemchr_evex_rtm > > -#define USE_AS_RAWMEMCHR 1 > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_evex_rtm > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > + > > Will we ever use the RTM version as the default? 
> > > #include "memchr-evex-rtm.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > > index ec942b77ba..dc1c450699 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __rawmemchr_evex > > -#define USE_AS_RAWMEMCHR 1 > > +#ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_evex > > +#endif > > +#define USE_AS_RAWMEMCHR 1 > > +#define MEMCHR RAWMEMCHR > > > > #include "memchr-evex.S" > > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > > index 3841c14c34..e2c2e20d85 100644 > > --- a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > > +++ b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S > > @@ -16,14 +16,192 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -/* Define multiple versions only for the definition in libc. */ > > -#if IS_IN (libc) > > -# define __rawmemchr __rawmemchr_sse2 > > - > > -# undef weak_alias > > -# define weak_alias(__rawmemchr, rawmemchr) > > -# undef libc_hidden_def > > -# define libc_hidden_def(__rawmemchr) > > -#endif > > +#include <isa-level.h> > > +#include <sysdep.h> > > + > > +/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation > > + so we need this to build for ISA V2 builds. */ > > +#if ISA_SHOULD_BUILD (2) > > + > > +# ifndef RAWMEMCHR > > +# define RAWMEMCHR __rawmemchr_sse2 > > +# endif > > + > > + .text > > +ENTRY (RAWMEMCHR) > > + movd %rsi, %xmm1 > > + mov %rdi, %rcx > > + > > + punpcklbw %xmm1, %xmm1 > > + punpcklbw %xmm1, %xmm1 > > + > > + and $63, %rcx > > + pshufd $0, %xmm1, %xmm1 > > + > > + cmp $48, %rcx > > + ja L(crosscache) > > + > > + movdqu (%rdi), %xmm0 > > + pcmpeqb %xmm1, %xmm0 > > +/* Check if there is a match. */ > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + > > + jnz L(matches) > > + add $16, %rdi > > + and $-16, %rdi > > + jmp L(loop_prolog) > > + > > + .p2align 4 > > +L(crosscache): > > + and $15, %rcx > > + and $-16, %rdi > > + movdqa (%rdi), %xmm0 > > + > > + pcmpeqb %xmm1, %xmm0 > > +/* Check if there is a match. */ > > + pmovmskb %xmm0, %eax > > +/* Remove the leading bytes. */ > > + sar %cl, %eax > > + test %eax, %eax > > + je L(unaligned_no_match) > > +/* Check which byte is a match. 
*/ > > + bsf %eax, %eax > > + > > + add %rdi, %rax > > + add %rcx, %rax > > + ret > > + > > + .p2align 4 > > +L(unaligned_no_match): > > + add $16, %rdi > > + > > + .p2align 4 > > +L(loop_prolog): > > + movdqa (%rdi), %xmm0 > > + pcmpeqb %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + pcmpeqb %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + pcmpeqb %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + movdqa 48(%rdi), %xmm4 > > + pcmpeqb %xmm1, %xmm4 > > + add $64, %rdi > > + pmovmskb %xmm4, %eax > > + test %eax, %eax > > + jnz L(matches0) > > + > > + test $0x3f, %rdi > > + jz L(align64_loop) > > + > > + movdqa (%rdi), %xmm0 > > + pcmpeqb %xmm1, %xmm0 > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + movdqa 16(%rdi), %xmm2 > > + pcmpeqb %xmm1, %xmm2 > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > > > -#include "../rawmemchr.S" > > + movdqa 32(%rdi), %xmm3 > > + pcmpeqb %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + movdqa 48(%rdi), %xmm3 > > + pcmpeqb %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > > + > > + add $64, %rdi > > + test %eax, %eax > > + jnz L(matches0) > > + > > + and $-64, %rdi > > + > > + .p2align 4 > > +L(align64_loop): > > + movdqa (%rdi), %xmm0 > > + movdqa 16(%rdi), %xmm2 > > + movdqa 32(%rdi), %xmm3 > > + movdqa 48(%rdi), %xmm4 > > + > > + pcmpeqb %xmm1, %xmm0 > > + pcmpeqb %xmm1, %xmm2 > > + pcmpeqb %xmm1, %xmm3 > > + pcmpeqb %xmm1, %xmm4 > > + > > + pmaxub %xmm0, %xmm3 > > + pmaxub %xmm2, %xmm4 > > + pmaxub %xmm3, %xmm4 > > + pmovmskb %xmm4, %eax > > + > > + add $64, %rdi > > + > > + test %eax, %eax > > + jz L(align64_loop) > > + > > + sub $64, %rdi > > + > > + pmovmskb %xmm0, %eax > > + test %eax, %eax > > + jnz L(matches) > > + > > + pmovmskb %xmm2, %eax > > + test %eax, %eax > > + jnz L(matches16) > > + > > + movdqa 32(%rdi), %xmm3 > > + pcmpeqb %xmm1, %xmm3 > > + > > + pcmpeqb 48(%rdi), %xmm1 > > + pmovmskb %xmm3, %eax > > + test %eax, %eax > > + jnz L(matches32) > > + > > + pmovmskb %xmm1, %eax > > + bsf %eax, %eax > > + lea 48(%rdi, %rax), %rax > > + ret > > + > > + .p2align 4 > > +L(matches0): > > + bsf %eax, %eax > > + lea -16(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches): > > + bsf %eax, %eax > > + add %rdi, %rax > > + ret > > + > > + .p2align 4 > > +L(matches16): > > + bsf %eax, %eax > > + lea 16(%rax, %rdi), %rax > > + ret > > + > > + .p2align 4 > > +L(matches32): > > + bsf %eax, %eax > > + lea 32(%rax, %rdi), %rax > > + ret > > + > > +END (RAWMEMCHR) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/rtld-memchr.S b/sysdeps/x86_64/multiarch/rtld-memchr.S > > new file mode 100644 > > index 0000000000..a14b192bed > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/rtld-memchr.S > > @@ -0,0 +1,18 @@ > > +/* Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include "../memchr.S" > > diff --git a/sysdeps/x86_64/multiarch/rtld-rawmemchr.S b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S > > new file mode 100644 > > index 0000000000..5d4110a052 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S > > @@ -0,0 +1,18 @@ > > +/* Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include "../rawmemchr.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > > index 58ed21db01..2a1cff5b05 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __wmemchr_avx2_rtm > > -#define USE_AS_WMEMCHR 1 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_avx2_rtm > > +#endif > > +#define USE_AS_WMEMCHR 1 > > +#define MEMCHR WMEMCHR > > > > #include "memchr-avx2-rtm.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S > > index 282854f1a1..2bf93fd84b 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-avx2.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __wmemchr_avx2 > > -#define USE_AS_WMEMCHR 1 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_avx2 > > +#endif > > +#define USE_AS_WMEMCHR 1 > > +#define MEMCHR WMEMCHR > > > > #include "memchr-avx2.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > > index a346cd35a1..c67309e8a1 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S > > @@ -1,3 +1,7 @@ > > -#define MEMCHR __wmemchr_evex_rtm > > -#define USE_AS_WMEMCHR 1 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_evex_rtm > > +#endif > > +#define USE_AS_WMEMCHR 1 > > +#define MEMCHR WMEMCHR > > + > > #include "memchr-evex-rtm.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S > > index 06cd0f9f5a..5512d5cdc3 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S > > @@ -1,4 +1,7 @@ > > -#define MEMCHR __wmemchr_evex > > -#define USE_AS_WMEMCHR 1 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_evex > > +#endif > > +#define USE_AS_WMEMCHR 1 > > 
+#define MEMCHR WMEMCHR > > > > #include "memchr-evex.S" > > diff --git a/sysdeps/x86_64/multiarch/wmemchr-sse2.S b/sysdeps/x86_64/multiarch/wmemchr-sse2.S > > index 70a965d552..b675a070d4 100644 > > --- a/sysdeps/x86_64/multiarch/wmemchr-sse2.S > > +++ b/sysdeps/x86_64/multiarch/wmemchr-sse2.S > > @@ -1,4 +1,7 @@ > > -#define USE_AS_WMEMCHR 1 > > -#define wmemchr __wmemchr_sse2 > > +#ifndef WMEMCHR > > +# define WMEMCHR __wmemchr_sse2 > > +#endif > > +#define USE_AS_WMEMCHR 1 > > +#define MEMCHR WMEMCHR > > > > -#include "../memchr.S" > > +#include "memchr-sse2.S" > > diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S > > index 4c1a3383b9..ba7e5202e6 100644 > > --- a/sysdeps/x86_64/rawmemchr.S > > +++ b/sysdeps/x86_64/rawmemchr.S > > @@ -17,185 +17,13 @@ > > License along with the GNU C Library; if not, see > > <https://www.gnu.org/licenses/>. */ > > > > -#include <sysdep.h> > > +#define RAWMEMCHR __rawmemchr > > > > - .text > > -ENTRY (__rawmemchr) > > - movd %rsi, %xmm1 > > - mov %rdi, %rcx > > +#define DEFAULT_IMPL_V1 "multiarch/rawmemchr-sse2.S" > > +#define DEFAULT_IMPL_V3 "multiarch/rawmemchr-avx2.S" > > +#define DEFAULT_IMPL_V4 "multiarch/rawmemchr-evex.S" > > > > - punpcklbw %xmm1, %xmm1 > > - punpcklbw %xmm1, %xmm1 > > - > > - and $63, %rcx > > - pshufd $0, %xmm1, %xmm1 > > - > > - cmp $48, %rcx > > - ja L(crosscache) > > - > > - movdqu (%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > -/* Check if there is a match. */ > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - > > - jnz L(matches) > > - add $16, %rdi > > - and $-16, %rdi > > - jmp L(loop_prolog) > > - > > - .p2align 4 > > -L(crosscache): > > - and $15, %rcx > > - and $-16, %rdi > > - movdqa (%rdi), %xmm0 > > - > > - pcmpeqb %xmm1, %xmm0 > > -/* Check if there is a match. */ > > - pmovmskb %xmm0, %eax > > -/* Remove the leading bytes. */ > > - sar %cl, %eax > > - test %eax, %eax > > - je L(unaligned_no_match) > > -/* Check which byte is a match. 
*/ > > - bsf %eax, %eax > > - > > - add %rdi, %rax > > - add %rcx, %rax > > - ret > > - > > - .p2align 4 > > -L(unaligned_no_match): > > - add $16, %rdi > > - > > - .p2align 4 > > -L(loop_prolog): > > - movdqa (%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - movdqa 16(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 48(%rdi), %xmm4 > > - pcmpeqb %xmm1, %xmm4 > > - add $64, %rdi > > - pmovmskb %xmm4, %eax > > - test %eax, %eax > > - jnz L(matches0) > > - > > - test $0x3f, %rdi > > - jz L(align64_loop) > > - > > - movdqa (%rdi), %xmm0 > > - pcmpeqb %xmm1, %xmm0 > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - movdqa 16(%rdi), %xmm2 > > - pcmpeqb %xmm1, %xmm2 > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - movdqa 48(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - pmovmskb %xmm3, %eax > > - > > - add $64, %rdi > > - test %eax, %eax > > - jnz L(matches0) > > - > > - and $-64, %rdi > > - > > - .p2align 4 > > -L(align64_loop): > > - movdqa (%rdi), %xmm0 > > - movdqa 16(%rdi), %xmm2 > > - movdqa 32(%rdi), %xmm3 > > - movdqa 48(%rdi), %xmm4 > > - > > - pcmpeqb %xmm1, %xmm0 > > - pcmpeqb %xmm1, %xmm2 > > - pcmpeqb %xmm1, %xmm3 > > - pcmpeqb %xmm1, %xmm4 > > - > > - pmaxub %xmm0, %xmm3 > > - pmaxub %xmm2, %xmm4 > > - pmaxub %xmm3, %xmm4 > > - pmovmskb %xmm4, %eax > > - > > - add $64, %rdi > > - > > - test %eax, %eax > > - jz L(align64_loop) > > - > > - sub $64, %rdi > > - > > - pmovmskb %xmm0, %eax > > - test %eax, %eax > > - jnz L(matches) > > - > > - pmovmskb %xmm2, %eax > > - test %eax, %eax > > - jnz L(matches16) > > - > > - movdqa 32(%rdi), %xmm3 > > - pcmpeqb %xmm1, %xmm3 > > - > > - pcmpeqb 48(%rdi), %xmm1 > > - pmovmskb %xmm3, %eax > > - test %eax, %eax > > - jnz L(matches32) > > - > > - pmovmskb %xmm1, %eax > > - bsf %eax, %eax > > - lea 48(%rdi, %rax), %rax > > - ret > > - > > - .p2align 4 > > -L(matches0): > > - bsf %eax, %eax > > - lea -16(%rax, %rdi), %rax > > - ret > > - > > - .p2align 4 > > -L(matches): > > - bsf %eax, %eax > > - add %rdi, %rax > > - ret > > - > > - .p2align 4 > > -L(matches16): > > - bsf %eax, %eax > > - lea 16(%rax, %rdi), %rax > > - ret > > - > > - .p2align 4 > > -L(matches32): > > - bsf %eax, %eax > > - lea 32(%rax, %rdi), %rax > > - ret > > - > > -END (__rawmemchr) > > +#include "isa-default-impl.h" > > > > weak_alias (__rawmemchr, rawmemchr) > > -libc_hidden_builtin_def (__rawmemchr) > > +libc_hidden_def (__rawmemchr) > > diff --git a/sysdeps/x86_64/wmemchr.S b/sysdeps/x86_64/wmemchr.S > > new file mode 100644 > > index 0000000000..eef91e556b > > --- /dev/null > > +++ b/sysdeps/x86_64/wmemchr.S > > @@ -0,0 +1,28 @@ > > +/* Copyright (C) 2011-2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#define WMEMCHR __wmemchr
> > +
> > +#define DEFAULT_IMPL_V1 "multiarch/wmemchr-sse2.S"
> > +#define DEFAULT_IMPL_V3 "multiarch/wmemchr-avx2.S"
> > +#define DEFAULT_IMPL_V4 "multiarch/wmemchr-evex.S"
> > +
> > +#include "isa-default-impl.h"
> > +
> > +libc_hidden_def (__wmemchr)
> > +weak_alias (__wmemchr, wmemchr)
> > +libc_hidden_weak (wmemchr)
> > --
> > 2.34.1
> >
>
>
> --
> H.J.
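A note on the dispatch before the full diff: the non-multiarch entry points (memchr.S, rawmemchr.S, wmemchr.S) now reduce to the DEFAULT_IMPL_V* defines plus one include. Below is a minimal sketch of the selection that isa-default-impl.h performs, assuming the configured ISA level is visible as MINIMUM_X86_ISA_LEVEL (as in glibc's <isa-level.h>); the real header also carries a DEFAULT_IMPL_V2 slot, which none of these three functions populate, and falls back one level at a time:

    /* Sketch: pick the best implementation that the build's ISA
       baseline guarantees is usable, falling back to lower levels.  */
    #if MINIMUM_X86_ISA_LEVEL >= 4 && defined DEFAULT_IMPL_V4
    # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V4
    #elif MINIMUM_X86_ISA_LEVEL >= 3 && defined DEFAULT_IMPL_V3
    # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V3
    #elif defined DEFAULT_IMPL_V1
    # define ISA_DEFAULT_IMPL DEFAULT_IMPL_V1
    #else
    # error "Unsupported ISA Level!"
    #endif

    /* The chosen file is assembled in place of the generic one.  */
    #include ISA_DEFAULT_IMPL

Under this scheme a glibc built for x86-64-v3 assembles memchr.S directly as multiarch/memchr-avx2.S, while a baseline (v1 or v2) build gets multiarch/memchr-sse2.S.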
diff --git a/sysdeps/x86_64/isa-default-impl.h b/sysdeps/x86_64/isa-default-impl.h index 34634668e5..b374a38b8b 100644 --- a/sysdeps/x86_64/isa-default-impl.h +++ b/sysdeps/x86_64/isa-default-impl.h @@ -46,4 +46,12 @@ # error "Unsupported ISA Level!" #endif +#if IS_IN(rtld) && !defined USE_MULTIARCH +# error "RTLD version should only exist in multiarch build" +#endif + +#if defined USE_MULTIARCH && !IS_IN(rtld) +# error "Multiarch build should not use ISA_DEFAULT_IMPL without RTLD" +#endif + #include ISA_DEFAULT_IMPL diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index a160fd9b00..20b43508c4 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -15,358 +15,13 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include <sysdep.h> +#define MEMCHR __memchr -#ifdef USE_AS_WMEMCHR -# define MEMCHR wmemchr -# define PCMPEQ pcmpeqd -# define CHAR_PER_VEC 4 -#else -# define MEMCHR memchr -# define PCMPEQ pcmpeqb -# define CHAR_PER_VEC 16 -#endif +#define DEFAULT_IMPL_V1 "multiarch/memchr-sse2.S" +#define DEFAULT_IMPL_V3 "multiarch/memchr-avx2.S" +#define DEFAULT_IMPL_V4 "multiarch/memchr-evex.S" -/* fast SSE2 version with using pmaxub and 64 byte loop */ +#include "isa-default-impl.h" - .text -ENTRY(MEMCHR) - movd %esi, %xmm1 - mov %edi, %ecx - -#ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx -#endif -#ifdef USE_AS_WMEMCHR - test %RDX_LP, %RDX_LP - jz L(return_null) -#else - punpcklbw %xmm1, %xmm1 - test %RDX_LP, %RDX_LP - jz L(return_null) - punpcklbw %xmm1, %xmm1 -#endif - - and $63, %ecx - pshufd $0, %xmm1, %xmm1 - - cmp $48, %ecx - ja L(crosscache) - - movdqu (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - - jnz L(matches_1) - sub $CHAR_PER_VEC, %rdx - jbe L(return_null) - add $16, %rdi - and $15, %ecx - and $-16, %rdi -#ifdef USE_AS_WMEMCHR - shr $2, %ecx -#endif - add %rcx, %rdx - sub $(CHAR_PER_VEC * 4), %rdx - jbe L(exit_loop) - jmp L(loop_prolog) - - .p2align 4 -L(crosscache): - and $15, %ecx - and $-16, %rdi - movdqa (%rdi), %xmm0 - - PCMPEQ %xmm1, %xmm0 - /* Check if there is a match. */ - pmovmskb %xmm0, %eax - /* Remove the leading bytes. */ - sar %cl, %eax - test %eax, %eax - je L(unaligned_no_match) - /* Check which byte is a match. */ - bsf %eax, %eax -#ifdef USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - add %rdi, %rax - add %rcx, %rax - ret - - .p2align 4 -L(unaligned_no_match): - /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using - "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void - possible addition overflow. 
*/ - neg %rcx - add $16, %rcx -#ifdef USE_AS_WMEMCHR - shr $2, %ecx -#endif - sub %rcx, %rdx - jbe L(return_null) - add $16, %rdi - sub $(CHAR_PER_VEC * 4), %rdx - jbe L(exit_loop) - - .p2align 4 -L(loop_prolog): - movdqa (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - PCMPEQ %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm4 - PCMPEQ %xmm1, %xmm4 - add $64, %rdi - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - test $0x3f, %rdi - jz L(align64_loop) - - sub $(CHAR_PER_VEC * 4), %rdx - jbe L(exit_loop) - - movdqa (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - PCMPEQ %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - pmovmskb %xmm3, %eax - - add $64, %rdi - test %eax, %eax - jnz L(matches0) - - mov %rdi, %rcx - and $-64, %rdi - and $63, %ecx -#ifdef USE_AS_WMEMCHR - shr $2, %ecx -#endif - add %rcx, %rdx - - .p2align 4 -L(align64_loop): - sub $(CHAR_PER_VEC * 4), %rdx - jbe L(exit_loop) - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - PCMPEQ %xmm1, %xmm0 - PCMPEQ %xmm1, %xmm2 - PCMPEQ %xmm1, %xmm3 - PCMPEQ %xmm1, %xmm4 - - pmaxub %xmm0, %xmm3 - pmaxub %xmm2, %xmm4 - pmaxub %xmm3, %xmm4 - pmovmskb %xmm4, %eax - - add $64, %rdi - - test %eax, %eax - jz L(align64_loop) - - sub $64, %rdi - - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - - PCMPEQ 48(%rdi), %xmm1 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - pmovmskb %xmm1, %eax - bsf %eax, %eax - lea 48(%rdi, %rax), %rax - ret - - .p2align 4 -L(exit_loop): - add $(CHAR_PER_VEC * 2), %edx - jle L(exit_loop_32) - - movdqa (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - PCMPEQ %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - PCMPEQ %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32_1) - sub $CHAR_PER_VEC, %edx - jle L(return_null) - - PCMPEQ 48(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches48_1) - xor %eax, %eax - ret - - .p2align 4 -L(exit_loop_32): - add $(CHAR_PER_VEC * 2), %edx - movdqa (%rdi), %xmm0 - PCMPEQ %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches_1) - sub $CHAR_PER_VEC, %edx - jbe L(return_null) - - PCMPEQ 16(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches16_1) - xor %eax, %eax - ret - - .p2align 4 -L(matches0): - bsf %eax, %eax - lea -16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches): - bsf %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsf %eax, %eax - lea 16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches32): - bsf %eax, %eax - lea 32(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches_1): - bsf %eax, %eax -#ifdef USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - add %rdi, %rax - ret - - .p2align 4 -L(matches16_1): - bsf %eax, %eax -#ifdef 
USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - lea 16(%rdi, %rax), %rax - ret - - .p2align 4 -L(matches32_1): - bsf %eax, %eax -#ifdef USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - lea 32(%rdi, %rax), %rax - ret - - .p2align 4 -L(matches48_1): - bsf %eax, %eax -#ifdef USE_AS_WMEMCHR - mov %eax, %esi - shr $2, %esi - sub %rsi, %rdx -#else - sub %rax, %rdx -#endif - jbe L(return_null) - lea 48(%rdi, %rax), %rax - ret - - .p2align 4 -L(return_null): - xor %eax, %eax - ret -END(MEMCHR) - -#ifndef USE_AS_WMEMCHR -strong_alias (memchr, __memchr) +weak_alias (__memchr, memchr) libc_hidden_builtin_def(memchr) -#endif diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h index b8f7a12ea2..856c6261f8 100644 --- a/sysdeps/x86_64/multiarch/ifunc-evex.h +++ b/sysdeps/x86_64/multiarch/ifunc-evex.h @@ -19,24 +19,28 @@ #include <init-arch.h> -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { - const struct cpu_features* cpu_features = __get_cpu_features (); - - if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) - && CPU_FEATURE_USABLE_P (cpu_features, BMI2) - && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + const struct cpu_features *cpu_features = __get_cpu_features (); + + /* NB: The X86_ISA_* feature check macros are evaluated at + compile time. */ + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, + AVX_Fast_Unaligned_Load)) { - if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) - && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) { if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) return OPTIMIZE (evex_rtm); @@ -47,9 +51,12 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) return OPTIMIZE (avx2_rtm); - if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, + Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } + /* This is unreachable (compile time checked) if ISA level >= 3 + so no need for a robust fallback here. */ return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 883362f63d..bf52cf96d0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -25,7 +25,8 @@ /* Fill ARRAY of MAX elements with IFUNC implementations for function NAME supported on target machine and return the number of valid - entries. */ + entries. Each set of implementations for a given function is sorted in + descending order by ISA level. 
*/ size_t __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, @@ -53,24 +54,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/memchr.c. */ IFUNC_IMPL (i, name, memchr, - IFUNC_IMPL_ADD (array, i, memchr, - CPU_FEATURE_USABLE (AVX2), - __memchr_avx2) - IFUNC_IMPL_ADD (array, i, memchr, - (CPU_FEATURE_USABLE (AVX2) - && CPU_FEATURE_USABLE (RTM)), - __memchr_avx2_rtm) - IFUNC_IMPL_ADD (array, i, memchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __memchr_evex) - IFUNC_IMPL_ADD (array, i, memchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __memchr_evex_rtm) - IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, + CPU_FEATURE_USABLE (AVX2), + __memchr_avx2) + X86_IFUNC_IMPL_ADD_V3 (array, i, memchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memchr_avx2_rtm) + /* Can be lowered to V1 if a V2 implementation is added. */ + X86_IFUNC_IMPL_ADD_V2 (array, i, memchr, + 1, + __memchr_sse2)) /* Support sysdeps/x86_64/multiarch/memcmp.c. */ IFUNC_IMPL (i, name, memcmp, @@ -288,24 +292,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ IFUNC_IMPL (i, name, rawmemchr, - IFUNC_IMPL_ADD (array, i, rawmemchr, - CPU_FEATURE_USABLE (AVX2), - __rawmemchr_avx2) - IFUNC_IMPL_ADD (array, i, rawmemchr, - (CPU_FEATURE_USABLE (AVX2) - && CPU_FEATURE_USABLE (RTM)), - __rawmemchr_avx2_rtm) - IFUNC_IMPL_ADD (array, i, rawmemchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __rawmemchr_evex) - IFUNC_IMPL_ADD (array, i, rawmemchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __rawmemchr_evex_rtm) - IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, + CPU_FEATURE_USABLE (AVX2), + __rawmemchr_avx2) + X86_IFUNC_IMPL_ADD_V3 (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __rawmemchr_avx2_rtm) + /* Can be lowered to V1 if a V2 implementation is added. */ + X86_IFUNC_IMPL_ADD_V2 (array, i, rawmemchr, + 1, + __rawmemchr_sse2)) /* Support sysdeps/x86_64/multiarch/strlen.c. */ IFUNC_IMPL (i, name, strlen, @@ -748,24 +755,27 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wmemchr.c. 
*/ IFUNC_IMPL (i, name, wmemchr, - IFUNC_IMPL_ADD (array, i, wmemchr, - CPU_FEATURE_USABLE (AVX2), - __wmemchr_avx2) - IFUNC_IMPL_ADD (array, i, wmemchr, - (CPU_FEATURE_USABLE (AVX2) - && CPU_FEATURE_USABLE (RTM)), - __wmemchr_avx2_rtm) - IFUNC_IMPL_ADD (array, i, wmemchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wmemchr_evex) - IFUNC_IMPL_ADD (array, i, wmemchr, + X86_IFUNC_IMPL_ADD_V4 (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wmemchr_evex_rtm) - IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, + CPU_FEATURE_USABLE (AVX2), + __wmemchr_avx2) + X86_IFUNC_IMPL_ADD_V3 (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wmemchr_avx2_rtm) + /* Can be lowered to V1 if a V2 implementation is added. */ + X86_IFUNC_IMPL_ADD_V2 (array, i, wmemchr, + 1, + __wmemchr_sse2)) /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ IFUNC_IMPL (i, name, wmemcmp, diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index c5a256eb37..39be5f7083 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -16,9 +16,10 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) +#include <isa-level.h> +#include <sysdep.h> -# include <sysdep.h> +#if ISA_SHOULD_BUILD (3) # ifndef MEMCHR # define MEMCHR __memchr_avx2 diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S index 0fd11b7632..0dd4f1dcce 100644 --- a/sysdeps/x86_64/multiarch/memchr-evex.S +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -16,9 +16,10 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) +#include <isa-level.h> +#include <sysdep.h> -# include <sysdep.h> +#if ISA_SHOULD_BUILD (4) # ifndef MEMCHR # define MEMCHR __memchr_evex diff --git a/sysdeps/x86_64/multiarch/memchr-sse2.S b/sysdeps/x86_64/multiarch/memchr-sse2.S index 2c6fdd41d6..8c561cd687 100644 --- a/sysdeps/x86_64/multiarch/memchr-sse2.S +++ b/sysdeps/x86_64/multiarch/memchr-sse2.S @@ -16,13 +16,360 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) -# define memchr __memchr_sse2 +#include <isa-level.h> +#include <sysdep.h> -# undef strong_alias -# define strong_alias(memchr, __memchr) -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(memchr) -#endif +/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation + so we need this to build for ISA V2 builds. */ +#if ISA_SHOULD_BUILD (2) + +# ifndef MEMCHR +# define MEMCHR __memchr_sse2 +# endif +# ifdef USE_AS_WMEMCHR +# define PCMPEQ pcmpeqd +# define CHAR_PER_VEC 4 +# else +# define PCMPEQ pcmpeqb +# define CHAR_PER_VEC 16 +# endif + +/* fast SSE2 version with using pmaxub and 64 byte loop */ + + .text +ENTRY(MEMCHR) + movd %esi, %xmm1 + mov %edi, %ecx + +# ifdef __ILP32__ + /* Clear the upper 32 bits. 
*/ + movl %edx, %edx +# endif +# ifdef USE_AS_WMEMCHR + test %RDX_LP, %RDX_LP + jz L(return_null) +# else + punpcklbw %xmm1, %xmm1 + test %RDX_LP, %RDX_LP + jz L(return_null) + punpcklbw %xmm1, %xmm1 +# endif + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %ecx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches_1) + sub $CHAR_PER_VEC, %rdx + jbe L(return_null) + add $16, %rdi + and $15, %ecx + and $-16, %rdi +# ifdef USE_AS_WMEMCHR + shr $2, %ecx +# endif + add %rcx, %rdx + sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %ecx + and $-16, %rdi + movdqa (%rdi), %xmm0 + + PCMPEQ %xmm1, %xmm0 + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) + /* Check which byte is a match. */ + bsf %eax, %eax +# ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void + possible addition overflow. */ + neg %rcx + add $16, %rcx +# ifdef USE_AS_WMEMCHR + shr $2, %ecx +# endif + sub %rcx, %rdx + jbe L(return_null) + add $16, %rdi + sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + PCMPEQ %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + mov %rdi, %rcx + and $-64, %rdi + and $63, %ecx +# ifdef USE_AS_WMEMCHR + shr $2, %ecx +# endif + add %rcx, %rdx + + .p2align 4 +L(align64_loop): + sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + PCMPEQ %xmm1, %xmm0 + PCMPEQ %xmm1, %xmm2 + PCMPEQ %xmm1, %xmm3 + PCMPEQ %xmm1, %xmm4 -#include "../memchr.S" + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + + PCMPEQ 48(%rdi), %xmm1 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(exit_loop): + add $(CHAR_PER_VEC * 2), %edx + jle L(exit_loop_32) + + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, 
%xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + PCMPEQ %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) + sub $CHAR_PER_VEC, %edx + jle L(return_null) + + PCMPEQ 48(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches48_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + add $(CHAR_PER_VEC * 2), %edx + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) + sub $CHAR_PER_VEC, %edx + jbe L(return_null) + + PCMPEQ 16(%rdi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches16_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches_1): + bsf %eax, %eax +# ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + add %rdi, %rax + ret + + .p2align 4 +L(matches16_1): + bsf %eax, %eax +# ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + lea 16(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches32_1): + bsf %eax, %eax +# ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + lea 32(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches48_1): + bsf %eax, %eax +# ifdef USE_AS_WMEMCHR + mov %eax, %esi + shr $2, %esi + sub %rsi, %rdx +# else + sub %rax, %rdx +# endif + jbe L(return_null) + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret +END(MEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S index acc5f6e2fb..5c1dcd3ca7 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S @@ -1,4 +1,7 @@ -#define MEMCHR __rawmemchr_avx2_rtm -#define USE_AS_RAWMEMCHR 1 +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_avx2_rtm +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR #include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S index 128f9ea637..d6bff28757 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S @@ -1,4 +1,7 @@ -#define MEMCHR __rawmemchr_avx2 -#define USE_AS_RAWMEMCHR 1 +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_avx2 +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR #include "memchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S index deda1ca395..8ff7f27c9c 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S @@ -1,3 +1,7 @@ -#define MEMCHR __rawmemchr_evex_rtm -#define USE_AS_RAWMEMCHR 1 +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex_rtm +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR + #include "memchr-evex-rtm.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S index ec942b77ba..dc1c450699 100644 --- 
a/sysdeps/x86_64/multiarch/rawmemchr-evex.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S @@ -1,4 +1,7 @@ -#define MEMCHR __rawmemchr_evex -#define USE_AS_RAWMEMCHR 1 +#ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex +#endif +#define USE_AS_RAWMEMCHR 1 +#define MEMCHR RAWMEMCHR #include "memchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S index 3841c14c34..e2c2e20d85 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S @@ -16,14 +16,192 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -/* Define multiple versions only for the definition in libc. */ -#if IS_IN (libc) -# define __rawmemchr __rawmemchr_sse2 - -# undef weak_alias -# define weak_alias(__rawmemchr, rawmemchr) -# undef libc_hidden_def -# define libc_hidden_def(__rawmemchr) -#endif +#include <isa-level.h> +#include <sysdep.h> + +/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation + so we need this to build for ISA V2 builds. */ +#if ISA_SHOULD_BUILD (2) + +# ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_sse2 +# endif + + .text +ENTRY (RAWMEMCHR) + movd %rsi, %xmm1 + mov %rdi, %rcx + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + and $63, %rcx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %rcx + ja L(crosscache) + + movdqu (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + test %eax, %eax + + jnz L(matches) + add $16, %rdi + and $-16, %rdi + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %rcx + and $-16, %rdi + movdqa (%rdi), %xmm0 + + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. 
*/ + bsf %eax, %eax + + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + add $16, %rdi + + .p2align 4 +L(loop_prolog): + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm4 + pcmpeqb %xmm1, %xmm4 + add $64, %rdi + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + test $0x3f, %rdi + jz L(align64_loop) + + movdqa (%rdi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%rdi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) -#include "../rawmemchr.S" + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 48(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + + add $64, %rdi + test %eax, %eax + jnz L(matches0) + + and $-64, %rdi + + .p2align 4 +L(align64_loop): + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + movdqa 48(%rdi), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + + add $64, %rdi + + test %eax, %eax + jz L(align64_loop) + + sub $64, %rdi + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%rdi), %xmm3 + pcmpeqb %xmm1, %xmm3 + + pcmpeqb 48(%rdi), %xmm1 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + lea 48(%rdi, %rax), %rax + ret + + .p2align 4 +L(matches0): + bsf %eax, %eax + lea -16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches): + bsf %eax, %eax + add %rdi, %rax + ret + + .p2align 4 +L(matches16): + bsf %eax, %eax + lea 16(%rax, %rdi), %rax + ret + + .p2align 4 +L(matches32): + bsf %eax, %eax + lea 32(%rax, %rdi), %rax + ret + +END (RAWMEMCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/rtld-memchr.S b/sysdeps/x86_64/multiarch/rtld-memchr.S new file mode 100644 index 0000000000..a14b192bed --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-memchr.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "../memchr.S" diff --git a/sysdeps/x86_64/multiarch/rtld-rawmemchr.S b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S new file mode 100644 index 0000000000..5d4110a052 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-rawmemchr.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "../rawmemchr.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S index 58ed21db01..2a1cff5b05 100644 --- a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S @@ -1,4 +1,7 @@ -#define MEMCHR __wmemchr_avx2_rtm -#define USE_AS_WMEMCHR 1 +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_avx2_rtm +#endif +#define USE_AS_WMEMCHR 1 +#define MEMCHR WMEMCHR #include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S index 282854f1a1..2bf93fd84b 100644 --- a/sysdeps/x86_64/multiarch/wmemchr-avx2.S +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S @@ -1,4 +1,7 @@ -#define MEMCHR __wmemchr_avx2 -#define USE_AS_WMEMCHR 1 +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_avx2 +#endif +#define USE_AS_WMEMCHR 1 +#define MEMCHR WMEMCHR #include "memchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S index a346cd35a1..c67309e8a1 100644 --- a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S +++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S @@ -1,3 +1,7 @@ -#define MEMCHR __wmemchr_evex_rtm -#define USE_AS_WMEMCHR 1 +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_evex_rtm +#endif +#define USE_AS_WMEMCHR 1 +#define MEMCHR WMEMCHR + #include "memchr-evex-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S index 06cd0f9f5a..5512d5cdc3 100644 --- a/sysdeps/x86_64/multiarch/wmemchr-evex.S +++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S @@ -1,4 +1,7 @@ -#define MEMCHR __wmemchr_evex -#define USE_AS_WMEMCHR 1 +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_evex +#endif +#define USE_AS_WMEMCHR 1 +#define MEMCHR WMEMCHR #include "memchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-sse2.S b/sysdeps/x86_64/multiarch/wmemchr-sse2.S index 70a965d552..b675a070d4 100644 --- a/sysdeps/x86_64/multiarch/wmemchr-sse2.S +++ b/sysdeps/x86_64/multiarch/wmemchr-sse2.S @@ -1,4 +1,7 @@ -#define USE_AS_WMEMCHR 1 -#define wmemchr __wmemchr_sse2 +#ifndef WMEMCHR +# define WMEMCHR __wmemchr_sse2 +#endif +#define USE_AS_WMEMCHR 1 +#define MEMCHR WMEMCHR -#include "../memchr.S" +#include "memchr-sse2.S" diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S index 4c1a3383b9..ba7e5202e6 100644 --- a/sysdeps/x86_64/rawmemchr.S +++ b/sysdeps/x86_64/rawmemchr.S @@ -17,185 +17,13 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#include <sysdep.h> +#define RAWMEMCHR __rawmemchr - .text -ENTRY (__rawmemchr) - movd %rsi, %xmm1 - mov %rdi, %rcx +#define DEFAULT_IMPL_V1 "multiarch/rawmemchr-sse2.S" +#define DEFAULT_IMPL_V3 "multiarch/rawmemchr-avx2.S" +#define DEFAULT_IMPL_V4 "multiarch/rawmemchr-evex.S" - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - and $63, %rcx - pshufd $0, %xmm1, %xmm1 - - cmp $48, %rcx - ja L(crosscache) - - movdqu (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 -/* Check if there is a match. */ - pmovmskb %xmm0, %eax - test %eax, %eax - - jnz L(matches) - add $16, %rdi - and $-16, %rdi - jmp L(loop_prolog) - - .p2align 4 -L(crosscache): - and $15, %rcx - and $-16, %rdi - movdqa (%rdi), %xmm0 - - pcmpeqb %xmm1, %xmm0 -/* Check if there is a match. */ - pmovmskb %xmm0, %eax -/* Remove the leading bytes. */ - sar %cl, %eax - test %eax, %eax - je L(unaligned_no_match) -/* Check which byte is a match. */ - bsf %eax, %eax - - add %rdi, %rax - add %rcx, %rax - ret - - .p2align 4 -L(unaligned_no_match): - add $16, %rdi - - .p2align 4 -L(loop_prolog): - movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 - add $64, %rdi - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - test $0x3f, %rdi - jz L(align64_loop) - - movdqa (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 48(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - - add $64, %rdi - test %eax, %eax - jnz L(matches0) - - and $-64, %rdi - - .p2align 4 -L(align64_loop): - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm0, %xmm3 - pmaxub %xmm2, %xmm4 - pmaxub %xmm3, %xmm4 - pmovmskb %xmm4, %eax - - add $64, %rdi - - test %eax, %eax - jz L(align64_loop) - - sub $64, %rdi - - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - - pcmpeqb 48(%rdi), %xmm1 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - pmovmskb %xmm1, %eax - bsf %eax, %eax - lea 48(%rdi, %rax), %rax - ret - - .p2align 4 -L(matches0): - bsf %eax, %eax - lea -16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches): - bsf %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsf %eax, %eax - lea 16(%rax, %rdi), %rax - ret - - .p2align 4 -L(matches32): - bsf %eax, %eax - lea 32(%rax, %rdi), %rax - ret - -END (__rawmemchr) +#include "isa-default-impl.h" weak_alias (__rawmemchr, rawmemchr) -libc_hidden_builtin_def (__rawmemchr) +libc_hidden_def (__rawmemchr) diff --git a/sysdeps/x86_64/wmemchr.S b/sysdeps/x86_64/wmemchr.S new file mode 100644 index 0000000000..eef91e556b --- /dev/null +++ b/sysdeps/x86_64/wmemchr.S @@ -0,0 +1,28 @@ +/* Copyright (C) 2011-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define WMEMCHR __wmemchr
+
+#define DEFAULT_IMPL_V1 "multiarch/wmemchr-sse2.S"
+#define DEFAULT_IMPL_V3 "multiarch/wmemchr-avx2.S"
+#define DEFAULT_IMPL_V4 "multiarch/wmemchr-evex.S"
+
+#include "isa-default-impl.h"
+
+libc_hidden_def (__wmemchr)
+weak_alias (__wmemchr, wmemchr)
+libc_hidden_weak (wmemchr)
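The same compile-time constant drives the ifunc side of the patch. The X86_ISA_CPU_FEATURE_USABLE_P / X86_ISA_CPU_FEATURES_ARCH_P checks used in ifunc-evex.h fold to constant-true once the build's baseline already implies the feature, and otherwise degrade to the ordinary runtime test. A rough model of the idea, under a hypothetical name (ISA_AWARE_USABLE_P is not glibc's literal definition, which derives the feature's ISA level internally rather than taking it as a parameter):

    /* Hypothetical model: FEATURE_LEVEL is the x86-64 ISA level that
       implies FEATURE.  At or above that build level the check is the
       constant 1 and the compiler prunes the fallback branch.  */
    #define ISA_AWARE_USABLE_P(cpu_features, FEATURE, FEATURE_LEVEL) \
      (MINIMUM_X86_ISA_LEVEL >= (FEATURE_LEVEL) \
       ? 1 : CPU_FEATURE_USABLE_P (cpu_features, FEATURE))

With the AVX2/BMI2 checks constant at x86-64-v3 and above, the trailing return OPTIMIZE (sse2) in IFUNC_SELECTOR is compiled out as unreachable, matching the comment in the selector, and the X86_IFUNC_IMPL_ADD_V{2,3,4} wrappers in ifunc-impl-list.c apply the same level test so that entries the build's baseline supersedes drop out of the list.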