Message ID | 20220712192910.351121-4-goldstein.w.n@gmail.com |
---|---|
State | New |
Series | [v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S |
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6, it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/multiarch/strrchr-sse2.S | 358 ++++++++++++++++++++++-
>  sysdeps/x86_64/multiarch/wcsrchr-sse2.S |  10 +-
>  sysdeps/x86_64/strrchr.S                | 364 +-----------------------
>  sysdeps/x86_64/wcsrchr.S                |  11 +-
>  4 files changed, 366 insertions(+), 377 deletions(-)

LGTM. Thanks.
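
For readers skimming the diff below: the new strrchr-sse2.S scans the string in 16-byte blocks, remembers the most recent block that contained the search character, and only resolves the exact position once a block containing the null terminator is found. The following is a rough scalar C sketch of that idea; the function name, the BLOCK constant and the use of GCC's __builtin_clz are illustrative stand-ins for the PCMPEQ/pmovmskb/bsr sequences and are not part of the patch.

#include <stdint.h>

#define BLOCK 16

static char *
strrchr_sketch (const char *s, int c)
{
  const char *last_block = 0;   /* Most recent block containing C.  */
  uint32_t last_mask = 0;       /* Its match bitmap.  */

  for (;; s += BLOCK)
    {
      uint32_t zero_mask = 0, char_mask = 0;
      /* The real code reads a whole aligned block even past the
         terminator; that is safe there because aligned 16-byte loads
         never cross a page.  This scalar stand-in just builds the
         same two bitmaps.  */
      for (int i = 0; i < BLOCK; i++)
        {
          if (s[i] == '\0')
            zero_mask |= (uint32_t) 1 << i;
          if (s[i] == (char) c)
            char_mask |= (uint32_t) 1 << i;
        }
      if (zero_mask == 0)
        {
          if (char_mask != 0)
            {
              last_block = s;
              last_mask = char_mask;
            }
          continue;
        }
      /* Terminator found: keep only matches at or before it.  This
         mirrors the `leal -1(%rcx); xorl %edx, %ecx; andl %ecx, %eax`
         sequence in the assembly.  */
      char_mask &= zero_mask ^ (zero_mask - 1);
      if (char_mask != 0)
        return (char *) s + (31 - __builtin_clz (char_mask));
      if (last_block != 0)
        return (char *) last_block + (31 - __builtin_clz (last_mask));
      return 0;
    }
}

The second loop in the assembly applies the same idea, but only starts tracking state again once a fresh match has been seen, which is why it can throw away all earlier work.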
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S index 866396e947..6ee7a5e33a 100644 --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S @@ -17,12 +17,358 @@ <https://www.gnu.org/licenses/>. */ #if IS_IN (libc) -# define STRRCHR __strrchr_sse2 +# ifndef STRRCHR +# define STRRCHR __strrchr_sse2 +# endif +#endif + +#include <sysdep.h> + +#ifdef USE_AS_WCSRCHR +# define PCMPEQ pcmpeqd +# define CHAR_SIZE 4 +# define PMINU pminud +#else +# define PCMPEQ pcmpeqb +# define CHAR_SIZE 1 +# define PMINU pminub +#endif + +#define PAGE_SIZE 4096 +#define VEC_SIZE 16 + + .text +ENTRY(STRRCHR) + movd %esi, %xmm0 + movq %rdi, %rax + andl $(PAGE_SIZE - 1), %eax +#ifndef USE_AS_WCSRCHR + punpcklbw %xmm0, %xmm0 + punpcklwd %xmm0, %xmm0 +#endif + pshufd $0, %xmm0, %xmm0 + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page) + +L(cross_page_continue): + movups (%rdi), %xmm1 + pxor %xmm2, %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %ecx + testl %ecx, %ecx + jz L(aligned_more) + + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + leal -1(%rcx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(ret0) + bsrl %eax, %eax + addq %rdi, %rax + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If + search CHAR is zero we are correct. Either way `andq + -CHAR_SIZE, %rax` gets the correct result. */ +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif +L(ret0): + ret + + /* Returns for first vec x1/x2 have hard coded backward search + path for earlier matches. */ + .p2align 4 +L(first_vec_x0_test): + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + testl %eax, %eax + jz L(ret0) + bsrl %eax, %eax + addq %r8, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(first_vec_x1): + PCMPEQ %xmm0, %xmm2 + pmovmskb %xmm2, %eax + leal -1(%rcx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(first_vec_x0_test) + bsrl %eax, %eax + leaq (VEC_SIZE)(%rdi, %rax), %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(first_vec_x1_test): + PCMPEQ %xmm0, %xmm2 + pmovmskb %xmm2, %eax + testl %eax, %eax + jz L(first_vec_x0_test) + bsrl %eax, %eax + leaq (VEC_SIZE)(%rdi, %rax), %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(first_vec_x2): + PCMPEQ %xmm0, %xmm3 + pmovmskb %xmm3, %eax + leal -1(%rcx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(first_vec_x1_test) + bsrl %eax, %eax + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(aligned_more): + /* Save original pointer if match was in VEC 0. */ + movq %rdi, %r8 + andq $-VEC_SIZE, %rdi + + movaps VEC_SIZE(%rdi), %xmm2 + pxor %xmm3, %xmm3 + PCMPEQ %xmm2, %xmm3 + pmovmskb %xmm3, %ecx + testl %ecx, %ecx + jnz L(first_vec_x1) + + movaps (VEC_SIZE * 2)(%rdi), %xmm3 + pxor %xmm4, %xmm4 + PCMPEQ %xmm3, %xmm4 + pmovmskb %xmm4, %ecx + testl %ecx, %ecx + jnz L(first_vec_x2) + + addq $VEC_SIZE, %rdi + /* Save pointer again before realigning. */ + movq %rdi, %rsi + andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +L(first_loop): + /* Do 2x VEC at a time. */ + movaps (VEC_SIZE * 2)(%rdi), %xmm4 + movaps (VEC_SIZE * 3)(%rdi), %xmm5 + /* Since SSE2 no pminud so wcsrchr needs seperate logic for + detecting zero. Note if this is found to be a bottleneck it + may be worth adding an SSE4.1 wcsrchr implementation. 
*/ +#ifdef USE_AS_WCSRCHR + movaps %xmm5, %xmm6 + pxor %xmm8, %xmm8 + + PCMPEQ %xmm8, %xmm5 + PCMPEQ %xmm4, %xmm8 + por %xmm5, %xmm8 +#else + movaps %xmm5, %xmm6 + PMINU %xmm4, %xmm5 +#endif + + movaps %xmm4, %xmm9 + PCMPEQ %xmm0, %xmm4 + PCMPEQ %xmm0, %xmm6 + movaps %xmm6, %xmm7 + por %xmm4, %xmm6 +#ifndef USE_AS_WCSRCHR + pxor %xmm8, %xmm8 + PCMPEQ %xmm5, %xmm8 +#endif + pmovmskb %xmm8, %ecx + pmovmskb %xmm6, %eax -# undef weak_alias -# define weak_alias(strrchr, rindex) -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(strrchr) + addq $(VEC_SIZE * 2), %rdi + /* Use `addl` 1) so we can undo it with `subl` and 2) it can + macro-fuse with `jz`. */ + addl %ecx, %eax + jz L(first_loop) + + /* Check if there is zero match. */ + testl %ecx, %ecx + jz L(second_loop_match) + + /* Check if there was a match in last iteration. */ + subl %ecx, %eax + jnz L(new_match) + +L(first_loop_old_match): + PCMPEQ %xmm0, %xmm2 + PCMPEQ %xmm0, %xmm3 + pmovmskb %xmm2, %ecx + pmovmskb %xmm3, %eax + addl %eax, %ecx + jz L(first_vec_x0_test) + /* NB: We could move this shift to before the branch and save a + bit of code size / performance on the fall through. The + branch leads to the null case which generally seems hotter + than char in first 3x VEC. */ + sall $16, %eax + orl %ecx, %eax + + bsrl %eax, %eax + addq %rsi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(new_match): + pxor %xmm6, %xmm6 + PCMPEQ %xmm9, %xmm6 + pmovmskb %xmm6, %eax + sall $16, %ecx + orl %eax, %ecx + + /* We can't reuse either of the old comparisons as since we mask + of zeros after first zero (instead of using the full + comparison) we can't gurantee no interference between match + after end of string and valid match. */ + pmovmskb %xmm4, %eax + pmovmskb %xmm7, %edx + sall $16, %edx + orl %edx, %eax + + leal -1(%ecx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(first_loop_old_match) + bsrl %eax, %eax + addq %rdi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + /* Save minimum state for getting most recent match. We can + throw out all previous work. */ + .p2align 4 +L(second_loop_match): + movq %rdi, %rsi + movaps %xmm4, %xmm2 + movaps %xmm7, %xmm3 + + .p2align 4 +L(second_loop): + movaps (VEC_SIZE * 2)(%rdi), %xmm4 + movaps (VEC_SIZE * 3)(%rdi), %xmm5 + /* Since SSE2 no pminud so wcsrchr needs seperate logic for + detecting zero. Note if this is found to be a bottleneck it + may be worth adding an SSE4.1 wcsrchr implementation. */ +#ifdef USE_AS_WCSRCHR + movaps %xmm5, %xmm6 + pxor %xmm8, %xmm8 + + PCMPEQ %xmm8, %xmm5 + PCMPEQ %xmm4, %xmm8 + por %xmm5, %xmm8 +#else + movaps %xmm5, %xmm6 + PMINU %xmm4, %xmm5 +#endif + + movaps %xmm4, %xmm9 + PCMPEQ %xmm0, %xmm4 + PCMPEQ %xmm0, %xmm6 + movaps %xmm6, %xmm7 + por %xmm4, %xmm6 +#ifndef USE_AS_WCSRCHR + pxor %xmm8, %xmm8 + PCMPEQ %xmm5, %xmm8 #endif -#include "../strrchr.S" + pmovmskb %xmm8, %ecx + pmovmskb %xmm6, %eax + + addq $(VEC_SIZE * 2), %rdi + /* Either null term or new occurence of CHAR. */ + addl %ecx, %eax + jz L(second_loop) + + /* No null term so much be new occurence of CHAR. 
*/ + testl %ecx, %ecx + jz L(second_loop_match) + + + subl %ecx, %eax + jnz L(second_loop_new_match) + +L(second_loop_old_match): + pmovmskb %xmm2, %ecx + pmovmskb %xmm3, %eax + sall $16, %eax + orl %ecx, %eax + bsrl %eax, %eax + addq %rsi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(second_loop_new_match): + pxor %xmm6, %xmm6 + PCMPEQ %xmm9, %xmm6 + pmovmskb %xmm6, %eax + sall $16, %ecx + orl %eax, %ecx + + /* We can't reuse either of the old comparisons as since we mask + of zeros after first zero (instead of using the full + comparison) we can't gurantee no interference between match + after end of string and valid match. */ + pmovmskb %xmm4, %eax + pmovmskb %xmm7, %edx + sall $16, %edx + orl %edx, %eax + + leal -1(%ecx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(second_loop_old_match) + bsrl %eax, %eax + addq %rdi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4,, 4 +L(cross_page): + movq %rdi, %rsi + andq $-VEC_SIZE, %rsi + movaps (%rsi), %xmm1 + pxor %xmm2, %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %edx + movl %edi, %ecx + andl $(VEC_SIZE - 1), %ecx + sarl %cl, %edx + jz L(cross_page_continue) + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + sarl %cl, %eax + leal -1(%rdx), %ecx + xorl %edx, %ecx + andl %ecx, %eax + jz L(ret1) + bsrl %eax, %eax + addq %rdi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif +L(ret1): + ret +END(STRRCHR) diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S index 69d2f3cdb1..d9259720f8 100644 --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S @@ -17,6 +17,12 @@ <https://www.gnu.org/licenses/>. */ #if IS_IN (libc) -# define STRRCHR __wcsrchr_sse2 +# ifndef STRRCHR +# define STRRCHR __wcsrchr_sse2 +# endif #endif -#include "../wcsrchr.S" + +#define USE_AS_WCSRCHR 1 +#define NO_PMINU 1 + +#include "strrchr-sse2.S" diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S index 4d7ba4ceb2..f39da60454 100644 --- a/sysdeps/x86_64/strrchr.S +++ b/sysdeps/x86_64/strrchr.S @@ -16,363 +16,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#ifndef STRRCHR -# define STRRCHR strrchr -#endif - -#ifdef USE_AS_WCSRCHR -# define PCMPEQ pcmpeqd -# define CHAR_SIZE 4 -# define PMINU pminud -#else -# define PCMPEQ pcmpeqb -# define CHAR_SIZE 1 -# define PMINU pminub -#endif - -#define PAGE_SIZE 4096 -#define VEC_SIZE 16 - - .text -ENTRY(STRRCHR) - movd %esi, %xmm0 - movq %rdi, %rax - andl $(PAGE_SIZE - 1), %eax -#ifndef USE_AS_WCSRCHR - punpcklbw %xmm0, %xmm0 - punpcklwd %xmm0, %xmm0 -#endif - pshufd $0, %xmm0, %xmm0 - cmpl $(PAGE_SIZE - VEC_SIZE), %eax - ja L(cross_page) - -L(cross_page_continue): - movups (%rdi), %xmm1 - pxor %xmm2, %xmm2 - PCMPEQ %xmm1, %xmm2 - pmovmskb %xmm2, %ecx - testl %ecx, %ecx - jz L(aligned_more) - - PCMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - leal -1(%rcx), %edx - xorl %edx, %ecx - andl %ecx, %eax - jz L(ret0) - bsrl %eax, %eax - addq %rdi, %rax - /* We are off by 3 for wcsrchr if search CHAR is non-zero. If - search CHAR is zero we are correct. Either way `andq - -CHAR_SIZE, %rax` gets the correct result. */ -#ifdef USE_AS_WCSRCHR - andq $-CHAR_SIZE, %rax -#endif -L(ret0): - ret - - /* Returns for first vec x1/x2 have hard coded backward search - path for earlier matches. 
*/ - .p2align 4 -L(first_vec_x0_test): - PCMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - testl %eax, %eax - jz L(ret0) - bsrl %eax, %eax - addq %r8, %rax -#ifdef USE_AS_WCSRCHR - andq $-CHAR_SIZE, %rax -#endif - ret - - .p2align 4 -L(first_vec_x1): - PCMPEQ %xmm0, %xmm2 - pmovmskb %xmm2, %eax - leal -1(%rcx), %edx - xorl %edx, %ecx - andl %ecx, %eax - jz L(first_vec_x0_test) - bsrl %eax, %eax - leaq (VEC_SIZE)(%rdi, %rax), %rax -#ifdef USE_AS_WCSRCHR - andq $-CHAR_SIZE, %rax -#endif - ret - - .p2align 4 -L(first_vec_x1_test): - PCMPEQ %xmm0, %xmm2 - pmovmskb %xmm2, %eax - testl %eax, %eax - jz L(first_vec_x0_test) - bsrl %eax, %eax - leaq (VEC_SIZE)(%rdi, %rax), %rax -#ifdef USE_AS_WCSRCHR - andq $-CHAR_SIZE, %rax -#endif - ret - - .p2align 4 -L(first_vec_x2): - PCMPEQ %xmm0, %xmm3 - pmovmskb %xmm3, %eax - leal -1(%rcx), %edx - xorl %edx, %ecx - andl %ecx, %eax - jz L(first_vec_x1_test) - bsrl %eax, %eax - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax -#ifdef USE_AS_WCSRCHR - andq $-CHAR_SIZE, %rax -#endif - ret - - .p2align 4 -L(aligned_more): - /* Save original pointer if match was in VEC 0. */ - movq %rdi, %r8 - andq $-VEC_SIZE, %rdi - - movaps VEC_SIZE(%rdi), %xmm2 - pxor %xmm3, %xmm3 - PCMPEQ %xmm2, %xmm3 - pmovmskb %xmm3, %ecx - testl %ecx, %ecx - jnz L(first_vec_x1) - - movaps (VEC_SIZE * 2)(%rdi), %xmm3 - pxor %xmm4, %xmm4 - PCMPEQ %xmm3, %xmm4 - pmovmskb %xmm4, %ecx - testl %ecx, %ecx - jnz L(first_vec_x2) - - addq $VEC_SIZE, %rdi - /* Save pointer again before realigning. */ - movq %rdi, %rsi - andq $-(VEC_SIZE * 2), %rdi - .p2align 4 -L(first_loop): - /* Do 2x VEC at a time. */ - movaps (VEC_SIZE * 2)(%rdi), %xmm4 - movaps (VEC_SIZE * 3)(%rdi), %xmm5 - /* Since SSE2 no pminud so wcsrchr needs seperate logic for - detecting zero. Note if this is found to be a bottleneck it - may be worth adding an SSE4.1 wcsrchr implementation. */ -#ifdef USE_AS_WCSRCHR - movaps %xmm5, %xmm6 - pxor %xmm8, %xmm8 - - PCMPEQ %xmm8, %xmm5 - PCMPEQ %xmm4, %xmm8 - por %xmm5, %xmm8 -#else - movaps %xmm5, %xmm6 - PMINU %xmm4, %xmm5 -#endif - - movaps %xmm4, %xmm9 - PCMPEQ %xmm0, %xmm4 - PCMPEQ %xmm0, %xmm6 - movaps %xmm6, %xmm7 - por %xmm4, %xmm6 -#ifndef USE_AS_WCSRCHR - pxor %xmm8, %xmm8 - PCMPEQ %xmm5, %xmm8 -#endif - pmovmskb %xmm8, %ecx - pmovmskb %xmm6, %eax - - addq $(VEC_SIZE * 2), %rdi - /* Use `addl` 1) so we can undo it with `subl` and 2) it can - macro-fuse with `jz`. */ - addl %ecx, %eax - jz L(first_loop) - - /* Check if there is zero match. */ - testl %ecx, %ecx - jz L(second_loop_match) - - /* Check if there was a match in last iteration. */ - subl %ecx, %eax - jnz L(new_match) - -L(first_loop_old_match): - PCMPEQ %xmm0, %xmm2 - PCMPEQ %xmm0, %xmm3 - pmovmskb %xmm2, %ecx - pmovmskb %xmm3, %eax - addl %eax, %ecx - jz L(first_vec_x0_test) - /* NB: We could move this shift to before the branch and save a - bit of code size / performance on the fall through. The - branch leads to the null case which generally seems hotter - than char in first 3x VEC. */ - sall $16, %eax - orl %ecx, %eax - - bsrl %eax, %eax - addq %rsi, %rax -#ifdef USE_AS_WCSRCHR - andq $-CHAR_SIZE, %rax -#endif - ret - - .p2align 4 -L(new_match): - pxor %xmm6, %xmm6 - PCMPEQ %xmm9, %xmm6 - pmovmskb %xmm6, %eax - sall $16, %ecx - orl %eax, %ecx - - /* We can't reuse either of the old comparisons as since we mask - of zeros after first zero (instead of using the full - comparison) we can't gurantee no interference between match - after end of string and valid match. 
*/ - pmovmskb %xmm4, %eax - pmovmskb %xmm7, %edx - sall $16, %edx - orl %edx, %eax - - leal -1(%ecx), %edx - xorl %edx, %ecx - andl %ecx, %eax - jz L(first_loop_old_match) - bsrl %eax, %eax - addq %rdi, %rax -#ifdef USE_AS_WCSRCHR - andq $-CHAR_SIZE, %rax -#endif - ret - - /* Save minimum state for getting most recent match. We can - throw out all previous work. */ - .p2align 4 -L(second_loop_match): - movq %rdi, %rsi - movaps %xmm4, %xmm2 - movaps %xmm7, %xmm3 - - .p2align 4 -L(second_loop): - movaps (VEC_SIZE * 2)(%rdi), %xmm4 - movaps (VEC_SIZE * 3)(%rdi), %xmm5 - /* Since SSE2 no pminud so wcsrchr needs seperate logic for - detecting zero. Note if this is found to be a bottleneck it - may be worth adding an SSE4.1 wcsrchr implementation. */ -#ifdef USE_AS_WCSRCHR - movaps %xmm5, %xmm6 - pxor %xmm8, %xmm8 - - PCMPEQ %xmm8, %xmm5 - PCMPEQ %xmm4, %xmm8 - por %xmm5, %xmm8 -#else - movaps %xmm5, %xmm6 - PMINU %xmm4, %xmm5 -#endif - - movaps %xmm4, %xmm9 - PCMPEQ %xmm0, %xmm4 - PCMPEQ %xmm0, %xmm6 - movaps %xmm6, %xmm7 - por %xmm4, %xmm6 -#ifndef USE_AS_WCSRCHR - pxor %xmm8, %xmm8 - PCMPEQ %xmm5, %xmm8 -#endif - - pmovmskb %xmm8, %ecx - pmovmskb %xmm6, %eax - - addq $(VEC_SIZE * 2), %rdi - /* Either null term or new occurence of CHAR. */ - addl %ecx, %eax - jz L(second_loop) - - /* No null term so much be new occurence of CHAR. */ - testl %ecx, %ecx - jz L(second_loop_match) - - - subl %ecx, %eax - jnz L(second_loop_new_match) - -L(second_loop_old_match): - pmovmskb %xmm2, %ecx - pmovmskb %xmm3, %eax - sall $16, %eax - orl %ecx, %eax - bsrl %eax, %eax - addq %rsi, %rax -#ifdef USE_AS_WCSRCHR - andq $-CHAR_SIZE, %rax -#endif - ret - - .p2align 4 -L(second_loop_new_match): - pxor %xmm6, %xmm6 - PCMPEQ %xmm9, %xmm6 - pmovmskb %xmm6, %eax - sall $16, %ecx - orl %eax, %ecx - - /* We can't reuse either of the old comparisons as since we mask - of zeros after first zero (instead of using the full - comparison) we can't gurantee no interference between match - after end of string and valid match. */ - pmovmskb %xmm4, %eax - pmovmskb %xmm7, %edx - sall $16, %edx - orl %edx, %eax - - leal -1(%ecx), %edx - xorl %edx, %ecx - andl %ecx, %eax - jz L(second_loop_old_match) - bsrl %eax, %eax - addq %rdi, %rax -#ifdef USE_AS_WCSRCHR - andq $-CHAR_SIZE, %rax -#endif - ret - - .p2align 4,, 4 -L(cross_page): - movq %rdi, %rsi - andq $-VEC_SIZE, %rsi - movaps (%rsi), %xmm1 - pxor %xmm2, %xmm2 - PCMPEQ %xmm1, %xmm2 - pmovmskb %xmm2, %edx - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - sarl %cl, %edx - jz L(cross_page_continue) - PCMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - sarl %cl, %eax - leal -1(%rdx), %ecx - xorl %edx, %ecx - andl %ecx, %eax - jz L(ret1) - bsrl %eax, %eax - addq %rdi, %rax -#ifdef USE_AS_WCSRCHR - andq $-CHAR_SIZE, %rax -#endif -L(ret1): - ret -END(STRRCHR) - -#ifndef USE_AS_WCSRCHR - weak_alias (STRRCHR, rindex) - libc_hidden_builtin_def (STRRCHR) -#endif +#define STRRCHR strrchr +#include "multiarch/strrchr-sse2.S" +weak_alias (strrchr, rindex) +libc_hidden_builtin_def (strrchr) diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S index 2b80efc5ef..1d4b1eb21c 100644 --- a/sysdeps/x86_64/wcsrchr.S +++ b/sysdeps/x86_64/wcsrchr.S @@ -16,12 +16,5 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ - -#define USE_AS_WCSRCHR 1 -#define NO_PMINU 1 - -#ifndef STRRCHR -# define STRRCHR wcsrchr -#endif - -#include "../strrchr.S" +#define STRRCHR wcsrchr +#include "multiarch/wcsrchr-sse2.S"
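
A side note on the wcsrchr variant built from this file: pmovmskb produces a byte-granular mask, so when the search character is non-zero the bsr result points at the last byte of the matching 4-byte wchar_t, i.e. 3 bytes past its start, as the comment in the new code notes. The `andq $-CHAR_SIZE, %rax` fix-up simply rounds the address back down. A minimal illustration, with a hypothetical helper name:

#include <stdint.h>

/* Round a byte address down to the start of the 4-byte wide
   character containing it -- what `andq $-CHAR_SIZE, %rax` does
   when CHAR_SIZE is 4.  */
static inline uintptr_t
round_to_wchar_start (uintptr_t byte_addr)
{
  return byte_addr & ~(uintptr_t) 3;    /* e.g. 0x1007 -> 0x1004 */
}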