Message ID | 20150624111855.GA15322@domone |
---|---|
State | New |
Headers | show |
On Wed, Jun 24, 2015 at 01:18:55PM +0200, Ondřej Bílka wrote: > Hi, > > As I wrote previous patches and was about to write strncasecmp I > realized that it would be easier to write and review them using macros > like the existing ones. > > So here is a condensed version of the previous patches. I will use that so > we don't have to write the same optimization twice in the future. > > Also with these the sse42 implementation could finally be removed. > > > * sysdeps/x86_64/locale-defines.sym: Add LOCALE_TOLOWER. > * sysdeps/x86_64/multiarch/Makefile (routines): > Add strcmp-avx2, strncmp-avx2, strcasecmp-avx2, > strncasecmp-avx2 > * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Update. > * sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Implement > strncasecmp, strncmp, strcasecmp. > * sysdeps/x86_64/multiarch/strcmp-sse42.S: Remove. > * sysdeps/x86_64/multiarch/strcmp-avx2.S: New file. > * sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S: Likewise. > * sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S: Likewise. > * sysdeps/x86_64/multiarch/strncase_l-sse2-unaligned.S: Likewise. > * sysdeps/x86_64/multiarch/strncase_l-avx2.S: Likewise. > * sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S: Likewise. > * sysdeps/x86_64/multiarch/strncmp-avx2.S: Likewise. 
> --- > sysdeps/x86_64/locale-defines.sym | 1 + > sysdeps/x86_64/multiarch/Makefile | 5 +- > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 37 +- > sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 5 + > .../x86_64/multiarch/strcasecmp_l-sse2-unaligned.S | 3 + > sysdeps/x86_64/multiarch/strcmp-avx2.S | 3 + > sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 469 ++++- > sysdeps/x86_64/multiarch/strcmp-sse42.S | 1792 -------------------- > sysdeps/x86_64/multiarch/strcmp.S | 87 +- > sysdeps/x86_64/multiarch/strncase_l-avx2.S | 6 + > .../x86_64/multiarch/strncase_l-sse2-unaligned.S | 4 + > sysdeps/x86_64/multiarch/strncmp-avx2.S | 4 + > sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S | 3 + > 13 files changed, 467 insertions(+), 1952 deletions(-) > create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S > create mode 100644 sysdeps/x86_64/multiarch/strcmp-avx2.S > delete mode 100644 sysdeps/x86_64/multiarch/strcmp-sse42.S > create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/strncase_l-sse2-unaligned.S > create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S > > diff --git a/sysdeps/x86_64/locale-defines.sym b/sysdeps/x86_64/locale-defines.sym > index aebff9a..804debb 100644 > --- a/sysdeps/x86_64/locale-defines.sym > +++ b/sysdeps/x86_64/locale-defines.sym > @@ -8,4 +8,5 @@ LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales) > LC_CTYPE > _NL_CTYPE_NONASCII_CASE > LOCALE_DATA_VALUES offsetof (struct __locale_data, values) > +LOCALE_TOLOWER offsetof (struct __locale_struct, __ctype_tolower) > SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0]) > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 679db2a..8094162 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ 
b/sysdeps/x86_64/multiarch/Makefile > @@ -7,12 +7,13 @@ endif > ifeq ($(subdir),string) > > sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ > - strcmp-sse2-unaligned strncmp-ssse3 \ > + strcmp-sse2-unaligned strncmp-sse2-unaligned strncmp-ssse3 \ > memcpy-ssse3 \ > memcpy-sse2-unaligned mempcpy-ssse3 \ > memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ > memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \ > memmove-ssse3-back strcasecmp_l-ssse3 \ > + strcasecmp_l-sse2-unaligned strncase_l-sse2-unaligned \ > strncase_l-ssse3 strcat-ssse3 strncat-ssse3\ > strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ > strcpy-sse2-unaligned strncpy-sse2-unaligned \ > @@ -29,7 +30,7 @@ CFLAGS-strspn-c.c += -msse4 > endif > > ifeq (yes,$(config-cflags-avx2)) > -sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 > +sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 strcmp-avx2 strncmp-avx2 strcasecmp_l-avx2 strncase_l-avx2 > endif > endif > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index b3dbe65..8c71030 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -94,20 +94,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */ > IFUNC_IMPL (i, name, strcasecmp, > - IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_AVX, > - __strcasecmp_avx) > - IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2, > - __strcasecmp_sse42) > + IFUNC_IMPL_ADD (array, i, strcasecmp, 1, > + __strcasecmp_sse2_unaligned) > + IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_AVX2, > + __strcasecmp_avx2) > IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3, > __strcasecmp_ssse3) > IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) > > /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. 
*/ > IFUNC_IMPL (i, name, strcasecmp_l, > - IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_AVX, > - __strcasecmp_l_avx) > - IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSE4_2, > - __strcasecmp_l_sse42) > + IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, > + __strcasecmp_sse2_unaligned_l) > IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSSE3, > __strcasecmp_l_ssse3) > IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, > @@ -130,7 +128,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/strcmp.S. */ > IFUNC_IMPL (i, name, strcmp, > - IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42) > + IFUNC_IMPL_ADD (array, i, strcmp, HAS_AVX2, __strcmp_avx2) > IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3) > IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) > @@ -150,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/strncase_l.S. */ > IFUNC_IMPL (i, name, strncasecmp, > - IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_AVX, > - __strncasecmp_avx) > - IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSE4_2, > - __strncasecmp_sse42) > + IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_AVX2, > + __strncasecmp_avx2) > + IFUNC_IMPL_ADD (array, i, strncasecmp, 1, > + __strncasecmp_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSSE3, > __strncasecmp_ssse3) > IFUNC_IMPL_ADD (array, i, strncasecmp, 1, > @@ -161,10 +159,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/strncase_l.S. 
*/ > IFUNC_IMPL (i, name, strncasecmp_l, > - IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_AVX, > - __strncasecmp_l_avx) > - IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_SSE4_2, > - __strncasecmp_l_sse42) > + IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_AVX2, > + __strncasecmp_avx2_l) > + IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, > + __strncasecmp_sse2_unaligned_l) > IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_SSSE3, > __strncasecmp_l_ssse3) > IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, > @@ -261,8 +259,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/strncmp.S. */ > IFUNC_IMPL (i, name, strncmp, > - IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2, > - __strncmp_sse42) > + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2_unaligned) > + IFUNC_IMPL_ADD (array, i, strncmp, HAS_AVX2, __strncmp_avx2) > + > IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSSE3, > __strncmp_ssse3) > IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S > new file mode 100644 > index 0000000..d10379f > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S > @@ -0,0 +1,5 @@ > +#define AS_STRCASECMP > +#define USE_AVX2 > +#define __strcasecmp_sse2_unaligned __strcasecmp_avx2 > +#define STRCMP __strcasecmp_avx2_l > +#include "strcmp-sse2-unaligned.S" > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S > new file mode 100644 > index 0000000..e2ed03f > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S > @@ -0,0 +1,3 @@ > +#define AS_STRCASECMP > +#define STRCMP __strcasecmp_sse2_unaligned_l > +#include "strcmp-sse2-unaligned.S" > diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S > new file mode 100644 > index 0000000..606df63 > --- /dev/null > +++ 
b/sysdeps/x86_64/multiarch/strcmp-avx2.S > @@ -0,0 +1,3 @@ > +#define USE_AVX2 > +#define STRCMP __strcmp_avx2 > +#include "strcmp-sse2-unaligned.S" > diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S > index 20b65fa..ef67fb0 100644 > --- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S > +++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S > @@ -18,29 +18,127 @@ > > #include "sysdep.h" > > -ENTRY ( __strcmp_sse2_unaligned) > - movl %edi, %eax > - xorl %edx, %edx > +#ifndef STRCMP > +# define STRCMP __strcmp_sse2_unaligned > +#endif > + > +#ifdef AS_STRCASECMP > +# include "locale-defines.h" > + > +# ifdef AS_STRNCMP > +ENTRY (__strncasecmp_sse2_unaligned) > + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax > + mov %fs:(%rax), %rcx > + // XXX 5 byte should be before the function > + /* 5-byte NOP. */ > + .byte 0x0f,0x1f,0x44,0x00,0x00 > + > +END (__strncasecmp_sse2_unaligned) > + > +ENTRY (STRCMP) > + test %rdx, %rdx > + je L(ret_zero) > + mov LOCALE_TOLOWER(%rcx), %r11 > +# else > +ENTRY (__strcasecmp_sse2_unaligned) > + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax > + mov %fs:(%rax), %rdx > + // XXX 5 byte should be before the function > + /* 5-byte NOP. 
*/ > + .byte 0x0f,0x1f,0x44,0x00,0x00 > + > +END (__strcasecmp_sse2_unaligned) > + > +ENTRY (STRCMP) > + mov LOCALE_TOLOWER(%rdx), %r11 > +# endif > + movzbl (%rdi), %eax > + movzbl (%rsi), %ecx > + movl (%r11,%rax,4), %eax > + subl (%r11,%rcx,4), %eax > + je L(next) > +L(return): > + ret > +L(next): > + test %ecx, %ecx > + je L(return) > + leaq 1(%rsi), %rsi > + leaq 1(%rdi), %rdi > +#ifdef AS_STRNCMP > + sub $1, %rdx > +#endif > + > +#else > +ENTRY (STRCMP) > +#endif > + > +#ifdef AS_STRNCMP > + lea -1(%rdx), %r10 > + test %rdx, %rdx > + je L(ret_zero) > +L(back_to_start): > + xor %rdx, %rdx > +#endif > + > pxor %xmm7, %xmm7 > - orl %esi, %eax > + movl %esi, %eax > + andl $4095, %eax > + cmpl $4032, %eax > + jg L(cross_page) > + > + movl %edi, %eax > andl $4095, %eax > cmpl $4032, %eax > jg L(cross_page) > +#ifdef AS_STRNCMP > + cmp $64, %r10 > + jae L(dont_set_mask) > + bts %r10, %rdx > +L(dont_set_mask): > +#endif > + > movdqu (%rdi), %xmm1 > movdqu (%rsi), %xmm0 > pcmpeqb %xmm1, %xmm0 > pminub %xmm1, %xmm0 > - pxor %xmm1, %xmm1 > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %eax > - testq %rax, %rax > + pcmpeqb %xmm7, %xmm0 > + pmovmskb %xmm0, %ecx > +#ifdef AS_STRNCMP > + or %dx, %cx > +#else > + test %ecx, %ecx > +#endif > je L(next_48_bytes) > -L(return): > - bsfq %rax, %rdx > +#ifdef AS_STRCASECMP > +L(caseloop1): > + bsf %ecx, %r9d > + movzbl (%rdi,%r9), %eax > + movzbl (%rsi,%r9), %r8d > + movl (%r11,%rax,4), %eax > + subl (%r11,%r8,4), %eax > + jne L(return) > + test %r8d, %r8d > + je L(return) > +# ifdef AS_STRNCMP > + cmp %r9, %r10 > + je L(return) > +# endif > + leaq -1(%rcx), %rax > + andq %rax, %rcx > + je L(next_48_bytes) > + jmp L(caseloop1) > +#else > + bsf %ecx, %edx > movzbl (%rdi, %rdx), %eax > movzbl (%rsi, %rdx), %edx > subl %edx, %eax > ret > +#endif > +#ifdef AS_STRNCMP > + L(ret_zero): > + xor %eax, %eax > + ret > +#endif > > .p2align 4 > L(next_48_bytes): > @@ -50,49 +148,108 @@ L(next_48_bytes): > pcmpeqb %xmm6, %xmm3 > movdqu 32(%rsi), 
%xmm2 > pminub %xmm6, %xmm3 > - pcmpeqb %xmm1, %xmm3 > + pcmpeqb %xmm7, %xmm3 > movdqu 48(%rdi), %xmm4 > pcmpeqb %xmm5, %xmm2 > - pmovmskb %xmm3, %edx > movdqu 48(%rsi), %xmm0 > pminub %xmm5, %xmm2 > - pcmpeqb %xmm1, %xmm2 > + pcmpeqb %xmm7, %xmm2 > pcmpeqb %xmm4, %xmm0 > - pmovmskb %xmm2, %eax > - salq $16, %rdx > - pminub %xmm4, %xmm0 > - pcmpeqb %xmm1, %xmm0 > + pmovmskb %xmm2, %eax > salq $32, %rax > +#ifdef AS_STRNCMP > + or %rdx, %rax > +#endif > + pmovmskb %xmm3, %edx > + sal $16, %edx > + pminub %xmm4, %xmm0 > + pcmpeqb %xmm7, %xmm0 > orq %rdx, %rax > - pmovmskb %xmm0, %ecx > - movq %rcx, %rdx > - salq $48, %rdx > - orq %rdx, %rax > + pmovmskb %xmm0, %ecx > + salq $48, %rcx > + orq %rax, %rcx > + je L(main_loop_header) > +#ifdef AS_STRCASECMP > +L(caseloop2): > + bsf %rcx, %r9 > + movzbl (%rdi,%r9), %eax > + movzbl (%rsi,%r9), %r8d > + movl (%r11,%rax,4), %eax > + subl (%r11,%r8,4), %eax > jne L(return) > + test %r8d, %r8d > + je L(return) > +# ifdef AS_STRNCMP > + cmp %r9, %r10 > + je L(return) > +# endif > + leaq -1(%rcx), %rax > + andq %rax, %rcx > + je L(main_loop_header) > + jmp L(caseloop2) > +#else > + bsf %rcx, %rdx > + movzbl (%rdi, %rdx), %eax > + movzbl (%rsi, %rdx), %edx > + subl %edx, %eax > + ret > +#endif > + > L(main_loop_header): > +#ifdef USE_AVX2 > + vpxor %xmm7, %xmm7, %xmm7 > +#endif > leaq 64(%rdi), %rdx > - movl $4096, %ecx > - pxor %xmm9, %xmm9 > andq $-64, %rdx > +# ifdef AS_STRNCMP > + addq %rdi, %r10 > + subq %rdx, %r10 > +# endif > subq %rdi, %rdx > leaq (%rdi, %rdx), %rax > addq %rsi, %rdx > - movq %rdx, %rsi > - andl $4095, %esi > - subq %rsi, %rcx > - shrq $6, %rcx > - movq %rcx, %rsi > - jmp L(loop_start) > + movl $4096, %esi > + mov %edx, %ecx > + andl $4095, %ecx > + sub %ecx, %esi > + shr $6, %esi > +#ifdef AS_STRNCMP > + mov %r10, %r9 > + addq %rdx, %r10 > + shr $6, %r9 > + cmp %r9, %rsi > + jb L(dont_set_page_bound) > + mov %r9, %rsi > +L(dont_set_page_bound): > +#endif > > .p2align 4 > L(loop): > + add $-1, %rsi > + ja 
L(loop_cross_page) > +L(back_to_loop): > +#ifdef USE_AVX2 > + vmovdqa (%rax), %ymm4 > + vmovdqa 32(%rax), %ymm5 > + vmovdqu (%rdx), %ymm0 > + vmovdqu 32(%rdx), %ymm1 > + vpcmpeqb %ymm4, %ymm0, %ymm0 > + vpminub %ymm4, %ymm0, %ymm0 > + vpcmpeqb %ymm5, %ymm1, %ymm1 > + vpminub %ymm5, %ymm1, %ymm1 > + vpminub %ymm0, %ymm1, %ymm2 > + vpcmpeqb %ymm7, %ymm2, %ymm2 > addq $64, %rax > addq $64, %rdx > -L(loop_start): > - testq %rsi, %rsi > - leaq -1(%rsi), %rsi > - je L(loop_cross_page) > -L(back_to_loop): > + vpmovmskb %ymm2, %edi > + test %edi, %edi > + je L(loop) > + shl $32, %rdi > + vpcmpeqb %ymm7, %ymm0, %ymm0 > + vpmovmskb %ymm0, %ecx > + or %rdi, %rcx > + vzeroupper > +#else > movdqu (%rdx), %xmm0 > movdqu 16(%rdx), %xmm1 > movdqa (%rax), %xmm2 > @@ -104,61 +261,99 @@ L(back_to_loop): > movdqu 48(%rdx), %xmm6 > pminub %xmm3, %xmm1 > movdqa 32(%rax), %xmm2 > - pminub %xmm1, %xmm0 > movdqa 48(%rax), %xmm3 > pcmpeqb %xmm2, %xmm5 > pcmpeqb %xmm3, %xmm6 > + addq $64, %rax > pminub %xmm2, %xmm5 > pminub %xmm3, %xmm6 > - pminub %xmm5, %xmm0 > - pminub %xmm6, %xmm0 > - pcmpeqb %xmm7, %xmm0 > - pmovmskb %xmm0, %ecx > + addq $64, %rdx > + pminub %xmm5, %xmm6 > + pminub %xmm1, %xmm6 > + pminub %xmm0, %xmm6 > + pcmpeqb %xmm7, %xmm6 > + pmovmskb %xmm6, %ecx > testl %ecx, %ecx > je L(loop) > - pcmpeqb %xmm7, %xmm5 > - movdqu (%rdx), %xmm0 > - pcmpeqb %xmm7, %xmm1 > - movdqa (%rax), %xmm2 > - pcmpeqb %xmm2, %xmm0 > - pminub %xmm2, %xmm0 > - pcmpeqb %xmm7, %xmm6 > pcmpeqb %xmm7, %xmm0 > - pmovmskb %xmm1, %ecx > - pmovmskb %xmm5, %r8d > - pmovmskb %xmm0, %edi > - salq $16, %rcx > + pcmpeqb %xmm7, %xmm1 > + pcmpeqb %xmm7, %xmm5 > + pmovmskb %xmm0, %edi > + pmovmskb %xmm1, %r9d > + pmovmskb %xmm5, %r8d > + salq $48, %rcx > salq $32, %r8 > - pmovmskb %xmm6, %esi > orq %r8, %rcx > orq %rdi, %rcx > - salq $48, %rsi > - orq %rsi, %rcx > + sal $16, %r9d > + orq %r9, %rcx > +#endif > +#ifdef AS_STRCASECMP > +L(caseloop3): > + bsf %rcx, %r9 > + movzbl -64(%rax,%r9), %edi > + movzbl 
-64(%rdx,%r9), %r8d > + movl (%r11,%rdi,4), %edi > + subl (%r11,%r8,4), %edi > + jne L(return2) > + test %r8d, %r8d > + je L(return2) > + leaq -1(%rcx), %rdi > + andq %rdi, %rcx > + je L(loop) > + jmp L(caseloop3) > +L(return2): > + mov %rdi, %rax > + ret > +#else > bsfq %rcx, %rcx > - movzbl (%rax, %rcx), %eax > - movzbl (%rdx, %rcx), %edx > + movzbl -64(%rax, %rcx), %eax > + movzbl -64(%rdx, %rcx), %edx > subl %edx, %eax > ret > +#endif > > .p2align 4 > L(loop_cross_page): > - xor %r10, %r10 > - movq %rdx, %r9 > - and $63, %r9 > - subq %r9, %r10 > - > - movdqa (%rdx, %r10), %xmm0 > - movdqa 16(%rdx, %r10), %xmm1 > - movdqu (%rax, %r10), %xmm2 > - movdqu 16(%rax, %r10), %xmm3 > +#ifdef AS_STRNCMP > + mov %r10, %r9 > + sub %rdx, %r9 > + cmp $64, %r9 > + jb L(prepare_back_to_start) > +#endif > + > + mov %edx, %ecx > + and $63, %ecx > + neg %rcx > +#ifdef USE_AVX2 > + vmovdqu (%rax, %rcx), %ymm4 > + vmovdqu 32(%rax, %rcx), %ymm5 > + vmovdqa (%rdx, %rcx), %ymm0 > + vmovdqa 32(%rdx, %rcx), %ymm1 > + vpcmpeqb %ymm4, %ymm0, %ymm0 > + vpminub %ymm4, %ymm0, %ymm0 > + vpcmpeqb %ymm5, %ymm1, %ymm1 > + vpminub %ymm5, %ymm1, %ymm1 > + vpminub %ymm0, %ymm1, %ymm2 > + vpcmpeqb %ymm7, %ymm2, %ymm2 > + vpmovmskb %ymm2, %esi > + shl $32, %rsi > + vpcmpeqb %ymm7, %ymm0, %ymm0 > + vpmovmskb %ymm0, %edi > + or %rsi, %rdi > +#else > + movdqa (%rdx, %rcx), %xmm0 > + movdqa 16(%rdx, %rcx), %xmm1 > + movdqu (%rax, %rcx), %xmm2 > + movdqu 16(%rax, %rcx), %xmm3 > pcmpeqb %xmm2, %xmm0 > - movdqa 32(%rdx, %r10), %xmm5 > + movdqa 32(%rdx, %rcx), %xmm5 > pcmpeqb %xmm3, %xmm1 > pminub %xmm2, %xmm0 > - movdqa 48(%rdx, %r10), %xmm6 > + movdqa 48(%rdx, %rcx), %xmm6 > pminub %xmm3, %xmm1 > - movdqu 32(%rax, %r10), %xmm2 > - movdqu 48(%rax, %r10), %xmm3 > + movdqu 32(%rax, %rcx), %xmm2 > + movdqu 48(%rax, %rcx), %xmm3 > pcmpeqb %xmm2, %xmm5 > pcmpeqb %xmm3, %xmm6 > pminub %xmm2, %xmm5 > @@ -169,41 +364,143 @@ L(loop_cross_page): > pcmpeqb %xmm7, %xmm5 > pcmpeqb %xmm7, %xmm6 > > - pmovmskb %xmm1, %ecx 
> - pmovmskb %xmm5, %r8d > - pmovmskb %xmm0, %edi > - salq $16, %rcx > + pmovmskb %xmm1, %ecx > + pmovmskb %xmm5, %r8d > + pmovmskb %xmm0, %edi > + sal $16, %ecx > salq $32, %r8 > - pmovmskb %xmm6, %esi > + pmovmskb %xmm6, %esi > orq %r8, %rdi > orq %rcx, %rdi > salq $48, %rsi > orq %rsi, %rdi > - movq %r9, %rcx > - movq $63, %rsi > +#endif > + mov %edx, %ecx > + mov $63, %esi > +#ifdef AS_STRNCMP > + shr $6, %r9 > + sub $1, %r9 > + cmp %r9, %rsi > + jb L(dont_set_bound2) > + mov %r9, %rsi > +L(dont_set_bound2): > +#endif > shrq %cl, %rdi > test %rdi, %rdi > je L(back_to_loop) > +#ifdef USE_AVX2 > + vzeroupper > +#endif > + > +#ifdef AS_STRCASECMP > + mov %rdi, %rcx > +L(caseloop4): > + bsf %rcx, %r9 > + movzbl (%rax,%r9), %edi > + movzbl (%rdx,%r9), %r8d > + movl (%r11,%rdi,4), %edi > + subl (%r11,%r8,4), %edi > + jne L(return2) > + test %r8d, %r8d > + je L(return2) > + leaq -1(%rcx), %rdi > + andq %rdi, %rcx > + je L(back_to_loop) > + jmp L(caseloop4) > +#else > bsfq %rdi, %rcx > movzbl (%rax, %rcx), %eax > movzbl (%rdx, %rcx), %edx > subl %edx, %eax > ret > +#endif > +#ifdef AS_STRNCMP > +L(prepare_back_to_start): > +# ifdef USE_AVX2 > + vzeroupper > +# endif > + mov %r9, %r10 > + mov %rdx, %rsi > + mov %rax, %rdi > + jmp L(back_to_start) > +#endif > > + > +L(cross_page): > + xorl %edx, %edx > .p2align 4 > L(cross_page_loop): > - cmpb %cl, %al > - jne L(different) > - addq $1, %rdx > - cmpq $64, %rdx > - je L(main_loop_header) > -L(cross_page): > movzbl (%rdi, %rdx), %eax > movzbl (%rsi, %rdx), %ecx > - testb %al, %al > - jne L(cross_page_loop) > - xorl %eax, %eax > -L(different): > +#ifdef AS_STRCASECMP > + movl (%r11,%rax,4), %eax > + subl (%r11,%rcx,4), %eax > +#else > + subl %ecx, %eax > +#endif > + jne L(different) > +#ifdef AS_STRNCMP > + cmp %rdx, %r10 > + je L(different) > +#endif > + test %ecx, %ecx > + je L(different) > + > + movzbl 1(%rdi, %rdx), %eax > + movzbl 1(%rsi, %rdx), %ecx > +#ifdef AS_STRCASECMP > + movl (%r11,%rax,4), %eax > + subl 
(%r11,%rcx,4), %eax > +#else > subl %ecx, %eax > +#endif > + jne L(different) > +#ifdef AS_STRNCMP > + lea 1(%rdx), %r9 > + cmp %r9, %r10 > + je L(different) > +#endif > + test %ecx, %ecx > + je L(different) > + > + movzbl 2(%rdi, %rdx), %eax > + movzbl 2(%rsi, %rdx), %ecx > +#ifdef AS_STRCASECMP > + movl (%r11,%rax,4), %eax > + subl (%r11,%rcx,4), %eax > +#else > + subl %ecx, %eax > +#endif > + jne L(different) > +#ifdef AS_STRNCMP > + lea 2(%rdx), %r9 > + cmp %r9, %r10 > + je L(different) > +#endif > + test %ecx, %ecx > + je L(different) > + > + movzbl 3(%rdi, %rdx), %eax > + movzbl 3(%rsi, %rdx), %ecx > +#ifdef AS_STRCASECMP > + movl (%r11,%rax,4), %eax > + subl (%r11,%rcx,4), %eax > +#else > + subl %ecx, %eax > +#endif > + jne L(different) > +#ifdef AS_STRNCMP > + lea 3(%rdx), %r9 > + cmp %r9, %r10 > + je L(different) > +#endif > + test %ecx, %ecx > + je L(different) > + > + add $4, %edx > + cmp $64, %edx > + je L(main_loop_header) > + jmp L(cross_page_loop) > +L(different): > ret > -END (__strcmp_sse2_unaligned) > +END (STRCMP) > diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S > deleted file mode 100644 > index 4dff0a5..0000000 > --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S > +++ /dev/null > @@ -1,1792 +0,0 @@ > -/* strcmp with SSE4.2 > - Copyright (C) 2009-2015 Free Software Foundation, Inc. > - Contributed by Intel Corporation. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. 
> - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <http://www.gnu.org/licenses/>. */ > - > - > -/* We use 0x1a: > - _SIDD_SBYTE_OPS > - | _SIDD_CMP_EQUAL_EACH > - | _SIDD_NEGATIVE_POLARITY > - | _SIDD_LEAST_SIGNIFICANT > - on pcmpistri to find out if two 16byte data elements are the same > - and the offset of the first different byte. There are 4 cases: > - > - 1. Both 16byte data elements are valid and identical. > - 2. Both 16byte data elements have EOS and identical. > - 3. Both 16byte data elements are valid and they differ at offset X. > - 4. At least one 16byte data element has EOS at offset X. Two 16byte > - data elements must differ at or before offset X. > - > - Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: > - > - case ECX CFlag ZFlag SFlag > - 1 16 0 0 0 > - 2 16 0 1 1 > - 3 X 1 0 0 > - 4 0 <= X 1 0/1 0/1 > - > - We exit from the loop for cases 2, 3 and 4 with jbe which branches > - when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for > - case 2. */ > - > - /* Put all SSE 4.2 functions together. */ > - .section .text.SECTION,"ax",@progbits > - .align 16 > - .type STRCMP_SSE42, @function > - .globl STRCMP_SSE42 > - .hidden STRCMP_SSE42 > -#ifdef USE_AS_STRCASECMP_L > -ENTRY (GLABEL(__strcasecmp)) > - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax > - mov %fs:(%rax),%RDX_LP > - > - // XXX 5 byte should be before the function > - /* 5-byte NOP. */ > - .byte 0x0f,0x1f,0x44,0x00,0x00 > -END (GLABEL(__strcasecmp)) > - /* FALLTHROUGH to strcasecmp_l. */ > -#endif > -#ifdef USE_AS_STRNCASECMP_L > -ENTRY (GLABEL(__strncasecmp)) > - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax > - mov %fs:(%rax),%RCX_LP > - > - // XXX 5 byte should be before the function > - /* 5-byte NOP. */ > - .byte 0x0f,0x1f,0x44,0x00,0x00 > -END (GLABEL(__strncasecmp)) > - /* FALLTHROUGH to strncasecmp_l. 
*/ > -#endif > - > - > -#ifdef USE_AVX > -# define movdqa vmovdqa > -# define movdqu vmovdqu > -# define pmovmskb vpmovmskb > -# define pcmpistri vpcmpistri > -# define psubb vpsubb > -# define pcmpeqb vpcmpeqb > -# define psrldq vpsrldq > -# define pslldq vpslldq > -# define palignr vpalignr > -# define pxor vpxor > -# define D(arg) arg, arg > -#else > -# define D(arg) arg > -#endif > - > -STRCMP_SSE42: > - cfi_startproc > - CALL_MCOUNT > - > -/* > - * This implementation uses SSE to compare up to 16 bytes at a time. > - */ > -#ifdef USE_AS_STRCASECMP_L > - /* We have to fall back on the C implementation for locales > - with encodings not matching ASCII for single bytes. */ > -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 > - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP > -# else > - mov (%rdx), %RAX_LP > -# endif > - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) > - jne __strcasecmp_l_nonascii > -#endif > -#ifdef USE_AS_STRNCASECMP_L > - /* We have to fall back on the C implementation for locales > - with encodings not matching ASCII for single bytes. */ > -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 > - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP > -# else > - mov (%rcx), %RAX_LP > -# endif > - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) > - jne __strncasecmp_l_nonascii > -#endif > - > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - test %rdx, %rdx > - je LABEL(strcmp_exitz) > - cmp $1, %rdx > - je LABEL(Byte0) > - mov %rdx, %r11 > -#endif > - mov %esi, %ecx > - mov %edi, %eax > -/* Use 64bit AND here to avoid long NOP padding. 
*/ > - and $0x3f, %rcx /* rsi alignment in cache line */ > - and $0x3f, %rax /* rdi alignment in cache line */ > -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > - .section .rodata.cst16,"aM",@progbits,16 > - .align 16 > -LABEL(belowupper): > - .quad 0x4040404040404040 > - .quad 0x4040404040404040 > -LABEL(topupper): > -# ifdef USE_AVX > - .quad 0x5a5a5a5a5a5a5a5a > - .quad 0x5a5a5a5a5a5a5a5a > -# else > - .quad 0x5b5b5b5b5b5b5b5b > - .quad 0x5b5b5b5b5b5b5b5b > -# endif > -LABEL(touppermask): > - .quad 0x2020202020202020 > - .quad 0x2020202020202020 > - .previous > - movdqa LABEL(belowupper)(%rip), %xmm4 > -# define UCLOW_reg %xmm4 > - movdqa LABEL(topupper)(%rip), %xmm5 > -# define UCHIGH_reg %xmm5 > - movdqa LABEL(touppermask)(%rip), %xmm6 > -# define LCQWORD_reg %xmm6 > -#endif > - cmp $0x30, %ecx > - ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ > - cmp $0x30, %eax > - ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ > - movdqu (%rdi), %xmm1 > - movdqu (%rsi), %xmm2 > -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > -# ifdef USE_AVX > -# define TOLOWER(reg1, reg2) \ > - vpcmpgtb UCLOW_reg, reg1, %xmm7; \ > - vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ > - vpcmpgtb UCLOW_reg, reg2, %xmm9; \ > - vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ > - vpandn %xmm7, %xmm8, %xmm8; \ > - vpandn %xmm9, %xmm10, %xmm10; \ > - vpand LCQWORD_reg, %xmm8, %xmm8; \ > - vpand LCQWORD_reg, %xmm10, %xmm10; \ > - vpor reg1, %xmm8, reg1; \ > - vpor reg2, %xmm10, reg2 > -# else > -# define TOLOWER(reg1, reg2) \ > - movdqa reg1, %xmm7; \ > - movdqa UCHIGH_reg, %xmm8; \ > - movdqa reg2, %xmm9; \ > - movdqa UCHIGH_reg, %xmm10; \ > - pcmpgtb UCLOW_reg, %xmm7; \ > - pcmpgtb reg1, %xmm8; \ > - pcmpgtb UCLOW_reg, %xmm9; \ > - pcmpgtb reg2, %xmm10; \ > - pand %xmm8, %xmm7; \ > - pand %xmm10, %xmm9; \ > - pand LCQWORD_reg, %xmm7; \ > - pand LCQWORD_reg, %xmm9; \ > - por %xmm7, reg1; \ > - por %xmm9, reg2 > -# endif > - TOLOWER (%xmm1, 
%xmm2) > -#else > -# define TOLOWER(reg1, reg2) > -#endif > - pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ > - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ > - pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ > - psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ > - pmovmskb %xmm1, %edx > - sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ > - jnz LABEL(less16bytes)/* If not, find different value or null char */ > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz)/* finish comparison */ > -#endif > - add $16, %rsi /* prepare to search next 16 bytes */ > - add $16, %rdi /* prepare to search next 16 bytes */ > - > - /* > - * Determine source and destination string offsets from 16-byte > - * alignment. Use relative offset difference between the two to > - * determine which case below to use. > - */ > - .p2align 4 > -LABEL(crosscache): > - and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ > - and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ > - mov $0xffff, %edx /* for equivalent offset */ > - xor %r8d, %r8d > - and $0xf, %ecx /* offset of rsi */ > - and $0xf, %eax /* offset of rdi */ > - pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ > - cmp %eax, %ecx > - je LABEL(ashr_0) /* rsi and rdi relative offset same */ > - ja LABEL(bigger) > - mov %edx, %r8d /* r8d is offset flag for exit tail */ > - xchg %ecx, %eax > - xchg %rsi, %rdi > -LABEL(bigger): > - movdqa (%rdi), %xmm2 > - movdqa (%rsi), %xmm1 > - lea 15(%rax), %r9 > - sub %rcx, %r9 > - lea LABEL(unaligned_table)(%rip), %r10 > - movslq (%r10, %r9,4), %r9 > - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? 
*/ > - lea (%r10, %r9), %r10 > - jmp *%r10 /* jump to corresponding case */ > - > -/* > - * The following cases will be handled by ashr_0 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(0~15) n(0~15) 15(15+ n-n) ashr_0 > - */ > - .p2align 4 > -LABEL(ashr_0): > - > - movdqa (%rsi), %xmm1 > - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ > -#else > - movdqa (%rdi), %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ > -#endif > - psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ > - pmovmskb %xmm1, %r9d > - shr %cl, %edx /* adjust 0xffff for offset */ > - shr %cl, %r9d /* adjust for 16-byte offset */ > - sub %r9d, %edx > - /* > - * edx must be the same with r9d if in left byte (16-rcx) is equal to > - * the start from (16-rax) and no null char was seen. > - */ > - jne LABEL(less32bytes) /* mismatch or null char */ > - UPDATE_STRNCMP_COUNTER > - mov $16, %rcx > - mov $16, %r9 > - > - /* > - * Now both strings are aligned at 16-byte boundary. Loop over strings > - * checking 32-bytes per iteration. 
> - */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - .p2align 4 > -LABEL(ashr_0_use): > - movdqa (%rdi,%rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - lea 16(%rdx), %rdx > - jbe LABEL(ashr_0_exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - movdqa (%rdi,%rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - lea 16(%rdx), %rdx > - jbe LABEL(ashr_0_exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - jmp LABEL(ashr_0_use) > - > - > - .p2align 4 > -LABEL(ashr_0_exit_use): > - jnc LABEL(strcmp_exitz) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub %rcx, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - lea -16(%rdx, %rcx), %rcx > - movzbl (%rdi, %rcx), %eax > - movzbl (%rsi, %rcx), %edx > -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx > - movl (%rcx,%rax,4), %eax > - movl (%rcx,%rdx,4), %edx > -#endif > - sub %edx, %eax > - ret > - > - > - > -/* > - * The following cases will be handled by ashr_1 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(15) n -15 0(15 +(n-15) - n) ashr_1 > - */ > - .p2align 4 > -LABEL(ashr_1): > - pslldq $15, D(%xmm2) /* shift first string to align with second */ > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ > - psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ > - pmovmskb %xmm2, %r9d > - shr %cl, %edx /* adjust 0xffff for 
offset */ > - shr %cl, %r9d /* adjust for 16-byte offset */ > - sub %r9d, %edx > - jnz LABEL(less32bytes) /* mismatch or null char seen */ > - movdqa (%rdi), %xmm3 > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads*/ > - mov $1, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 1(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_1_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_1_use) > - > -LABEL(nibble_ashr_1_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $1, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_1_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $1, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_1_use) > - > - .p2align 4 > -LABEL(nibble_ashr_1_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $1, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $14, %ecx > - ja LABEL(nibble_ashr_1_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_2 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 > - */ > - .p2align 4 > -LABEL(ashr_2): > - pslldq $14, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, D(%xmm2) > - psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $2, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 2(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_2_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_2_use) > - > -LABEL(nibble_ashr_2_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $2, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_2_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $2, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa 
(%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_2_use) > - > - .p2align 4 > -LABEL(nibble_ashr_2_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $2, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $13, %ecx > - ja LABEL(nibble_ashr_2_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_3 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 > - */ > - .p2align 4 > -LABEL(ashr_3): > - pslldq $13, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, D(%xmm2) > - psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $3, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. 
> - */ > - lea 3(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > -LABEL(loop_ashr_3_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_3_use) > - > -LABEL(nibble_ashr_3_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $3, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_3_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $3, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_3_use) > - > - .p2align 4 > -LABEL(nibble_ashr_3_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $3, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $12, %ecx > - ja LABEL(nibble_ashr_3_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_4 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 > - */ > - .p2align 4 > -LABEL(ashr_4): > - pslldq $12, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, D(%xmm2) > - 
psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $4, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 4(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_4_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_4_use) > - > -LABEL(nibble_ashr_4_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $4, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_4_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $4, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_4_use) > - > - .p2align 4 > -LABEL(nibble_ashr_4_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $4, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $11, %ecx > - ja LABEL(nibble_ashr_4_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_5 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 > - */ > - .p2align 4 > -LABEL(ashr_5): > - pslldq $11, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, D(%xmm2) > - psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $5, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 5(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_5_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_5_use) > - > -LABEL(nibble_ashr_5_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $5, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_5_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - > - palignr $5, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - 
movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_5_use) > - > - .p2align 4 > -LABEL(nibble_ashr_5_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $5, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $10, %ecx > - ja LABEL(nibble_ashr_5_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_6 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 > - */ > - .p2align 4 > -LABEL(ashr_6): > - pslldq $10, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, D(%xmm2) > - psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $6, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. 
> - */ > - lea 6(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_6_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_6_use) > - > -LABEL(nibble_ashr_6_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $6, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_6_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $6, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_6_use) > - > - .p2align 4 > -LABEL(nibble_ashr_6_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $6, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $9, %ecx > - ja LABEL(nibble_ashr_6_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_7 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 > - */ > - .p2align 4 > -LABEL(ashr_7): > - pslldq $9, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, 
D(%xmm2) > - psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $7, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 7(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_7_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_7_use) > - > -LABEL(nibble_ashr_7_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $7, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_7_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $7, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_7_use) > - > - .p2align 4 > -LABEL(nibble_ashr_7_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $7, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $8, %ecx > - ja LABEL(nibble_ashr_7_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_8 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 > - */ > - .p2align 4 > -LABEL(ashr_8): > - pslldq $8, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, D(%xmm2) > - psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $8, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 8(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_8_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_8_use) > - > -LABEL(nibble_ashr_8_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $8, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_8_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $8, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa 
(%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_8_use) > - > - .p2align 4 > -LABEL(nibble_ashr_8_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $8, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $7, %ecx > - ja LABEL(nibble_ashr_8_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_9 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 > - */ > - .p2align 4 > -LABEL(ashr_9): > - pslldq $7, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, D(%xmm2) > - psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $9, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. 
> - */ > - lea 9(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_9_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_9_use) > - > -LABEL(nibble_ashr_9_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - > - palignr $9, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_9_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $9, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_9_use) > - > - .p2align 4 > -LABEL(nibble_ashr_9_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $9, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $6, %ecx > - ja LABEL(nibble_ashr_9_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_10 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 > - */ > - .p2align 4 > -LABEL(ashr_10): > - pslldq $6, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > - 
pcmpeqb %xmm1, D(%xmm2) > - psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $10, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 10(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_10_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_10_use) > - > -LABEL(nibble_ashr_10_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $10, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_10_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $10, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_10_use) > - > - .p2align 4 > -LABEL(nibble_ashr_10_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $10, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if 
defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $5, %ecx > - ja LABEL(nibble_ashr_10_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_11 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 > - */ > - .p2align 4 > -LABEL(ashr_11): > - pslldq $5, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, D(%xmm2) > - psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $11, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 11(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_11_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_11_use) > - > -LABEL(nibble_ashr_11_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $11, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_11_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $11, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri 
$0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_11_use) > - > - .p2align 4 > -LABEL(nibble_ashr_11_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $11, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $4, %ecx > - ja LABEL(nibble_ashr_11_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_12 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 > - */ > - .p2align 4 > -LABEL(ashr_12): > - pslldq $4, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, D(%xmm2) > - psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $12, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. 
> - */ > - lea 12(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_12_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_12_use) > - > -LABEL(nibble_ashr_12_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $12, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_12_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $12, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_12_use) > - > - .p2align 4 > -LABEL(nibble_ashr_12_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $12, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $3, %ecx > - ja LABEL(nibble_ashr_12_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_13 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 > - */ > - .p2align 4 > -LABEL(ashr_13): > - pslldq $3, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > 
- pcmpeqb %xmm1, D(%xmm2) > - psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $13, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 13(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_13_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_13_use) > - > -LABEL(nibble_ashr_13_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $13, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_13_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $13, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_13_use) > - > - .p2align 4 > -LABEL(nibble_ashr_13_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $13, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if 
defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $2, %ecx > - ja LABEL(nibble_ashr_13_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_14 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 > - */ > - .p2align 4 > -LABEL(ashr_14): > - pslldq $2, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, D(%xmm2) > - psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $14, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 14(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_14_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_14_use) > - > -LABEL(nibble_ashr_14_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $14, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_14_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $14, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - 
pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_14_use) > - > - .p2align 4 > -LABEL(nibble_ashr_14_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $14, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $1, %ecx > - ja LABEL(nibble_ashr_14_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_15 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 > - */ > - .p2align 4 > -LABEL(ashr_15): > - pslldq $1, D(%xmm2) > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, D(%xmm2) > - psubb %xmm0, D(%xmm2) > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $15, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. 
> - */ > - lea 15(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_15_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_15_use) > - > -LABEL(nibble_ashr_15_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $15, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_15_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $15, -16(%rdi, %rdx), D(%xmm0) > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_15_use) > - > - .p2align 4 > -LABEL(nibble_ashr_15_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $15, D(%xmm0) > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $0, %ecx > - ja LABEL(nibble_ashr_15_restart_use) > - > -LABEL(nibble_ashr_exit_use): > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - .p2align 4 > -LABEL(exit_use): > - jnc 
LABEL(strcmp_exitz) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub %rcx, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add %rcx, %rdx > - lea -16(%rdi, %r9), %rdi > - movzbl (%rdi, %rdx), %eax > - movzbl (%rsi, %rdx), %edx > - test %r8d, %r8d > - jz LABEL(ret_use) > - xchg %eax, %edx > -LABEL(ret_use): > -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx > - movl (%rcx,%rdx,4), %edx > - movl (%rcx,%rax,4), %eax > -#endif > - > - sub %edx, %eax > - ret > - > -LABEL(less32bytes): > - lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ > - lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ > - test %r8d, %r8d > - jz LABEL(ret) > - xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ > - > - .p2align 4 > -LABEL(ret): > -LABEL(less16bytes): > - bsf %rdx, %rdx /* find and store bit index in %rdx */ > - > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub %rdx, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - movzbl (%rsi, %rdx), %ecx > - movzbl (%rdi, %rdx), %eax > - > -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx > - movl (%rdx,%rcx,4), %ecx > - movl (%rdx,%rax,4), %eax > -#endif > - > - sub %ecx, %eax > - ret > - > -LABEL(strcmp_exitz): > - xor %eax, %eax > - ret > - > - .p2align 4 > - // XXX Same as code above > -LABEL(Byte0): > - movzx (%rsi), %ecx > - movzx (%rdi), %eax > - > -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx > - movl (%rdx,%rcx,4), %ecx > - movl (%rdx,%rax,4), %eax > -#endif > - > - sub %ecx, %eax > - ret > - cfi_endproc > - .size STRCMP_SSE42, .-STRCMP_SSE42 > - > -#undef UCLOW_reg > -#undef UCHIGH_reg > -#undef LCQWORD_reg > -#undef TOLOWER > - > - /* Put all SSE 4.2 functions together. 
*/ > - .section .rodata.SECTION,"a",@progbits > - .p2align 3 > -LABEL(unaligned_table): > - .int LABEL(ashr_1) - LABEL(unaligned_table) > - .int LABEL(ashr_2) - LABEL(unaligned_table) > - .int LABEL(ashr_3) - LABEL(unaligned_table) > - .int LABEL(ashr_4) - LABEL(unaligned_table) > - .int LABEL(ashr_5) - LABEL(unaligned_table) > - .int LABEL(ashr_6) - LABEL(unaligned_table) > - .int LABEL(ashr_7) - LABEL(unaligned_table) > - .int LABEL(ashr_8) - LABEL(unaligned_table) > - .int LABEL(ashr_9) - LABEL(unaligned_table) > - .int LABEL(ashr_10) - LABEL(unaligned_table) > - .int LABEL(ashr_11) - LABEL(unaligned_table) > - .int LABEL(ashr_12) - LABEL(unaligned_table) > - .int LABEL(ashr_13) - LABEL(unaligned_table) > - .int LABEL(ashr_14) - LABEL(unaligned_table) > - .int LABEL(ashr_15) - LABEL(unaligned_table) > - .int LABEL(ashr_0) - LABEL(unaligned_table) > - > -#undef LABEL > -#undef GLABEL > -#undef SECTION > -#undef movdqa > -#undef movdqu > -#undef pmovmskb > -#undef pcmpistri > -#undef psubb > -#undef pcmpeqb > -#undef psrldq > -#undef pslldq > -#undef palignr > -#undef pxor > -#undef D > diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S > index f50f26c..63aa62e 100644 > --- a/sysdeps/x86_64/multiarch/strcmp.S > +++ b/sysdeps/x86_64/multiarch/strcmp.S > @@ -31,8 +31,8 @@ > test %r9, %r9; \ > je LABEL(strcmp_exitz); \ > mov %r9, %r11 > - > -# define STRCMP_SSE42 __strncmp_sse42 > +# define STRCMP_AVX2 __strncmp_avx2 > +# define STRCMP_SSE2_UNALIGNED __strncmp_sse2_unaligned > # define STRCMP_SSSE3 __strncmp_ssse3 > # define STRCMP_SSE2 __strncmp_sse2 > # define __GI_STRCMP __GI_strncmp > @@ -40,9 +40,8 @@ > # include "locale-defines.h" > > # define UPDATE_STRNCMP_COUNTER > - > -# define STRCMP_AVX __strcasecmp_l_avx > -# define STRCMP_SSE42 __strcasecmp_l_sse42 > +# define STRCMP_AVX2 __strcasecmp_avx2_l > +# define STRCMP_SSE2_UNALIGNED __strcasecmp_sse2_unaligned_l > # define STRCMP_SSSE3 __strcasecmp_l_ssse3 > # define STRCMP_SSE2 
__strcasecmp_l_sse2 > # define __GI_STRCMP __GI___strcasecmp_l > @@ -60,8 +59,8 @@ > je LABEL(strcmp_exitz); \ > mov %r9, %r11 > > -# define STRCMP_AVX __strncasecmp_l_avx > -# define STRCMP_SSE42 __strncasecmp_l_sse42 > +# define STRCMP_AVX2 __strncasecmp_avx2_l > +# define STRCMP_SSE2_UNALIGNED __strncasecmp_sse2_unaligned_l > # define STRCMP_SSSE3 __strncasecmp_l_ssse3 > # define STRCMP_SSE2 __strncasecmp_l_sse2 > # define __GI_STRCMP __GI___strncasecmp_l > @@ -69,8 +68,9 @@ > # define USE_AS_STRCMP > # define UPDATE_STRNCMP_COUNTER > # ifndef STRCMP > +# define STRCMP_AVX2 __strcmp_avx2 > +# define STRCMP_SSE2_UNALIGNED __strcmp_sse2_unaligned > # define STRCMP strcmp > -# define STRCMP_SSE42 __strcmp_sse42 > # define STRCMP_SSSE3 __strcmp_ssse3 > # define STRCMP_SSE2 __strcmp_sse2 > # define __GI_STRCMP __GI_strcmp > @@ -89,17 +89,16 @@ ENTRY(STRCMP) > jne 1f > call __init_cpu_features > 1: > -#ifdef USE_AS_STRCMP > - leaq __strcmp_sse2_unaligned(%rip), %rax > +# ifdef HAVE_AVX2_SUPPORT > + > + leaq STRCMP_AVX2(%rip), %rax > + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) > + jnz 3f > +# endif > + leaq STRCMP_SSE2_UNALIGNED(%rip), %rax > testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) > jnz 3f > -#else > - testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip) > - jnz 2f > - leaq STRCMP_SSE42(%rip), %rax > - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) > - jnz 3f > -#endif > + > 2: leaq STRCMP_SSSE3(%rip), %rax > testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) > jnz 3f > @@ -115,21 +114,22 @@ ENTRY(__strcasecmp) > jne 1f > call __init_cpu_features > 1: > -# ifdef HAVE_AVX_SUPPORT > - leaq __strcasecmp_avx(%rip), %rax > - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) > +# ifdef HAVE_AVX2_SUPPORT > + > + leaq __strcasecmp_avx2(%rip), %rax > + testl 
$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) > jnz 3f > # endif > - testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip) > - jnz 2f > - leaq __strcasecmp_sse42(%rip), %rax > - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) > - jnz 3f > + leaq __strcasecmp_sse2_unaligned(%rip), %rax > + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) > + jnz 3f > + > 2: leaq __strcasecmp_ssse3(%rip), %rax > testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) > jnz 3f > leaq __strcasecmp_sse2(%rip), %rax > 3: ret > + > END(__strcasecmp) > weak_alias (__strcasecmp, strcasecmp) > # endif > @@ -141,45 +141,26 @@ ENTRY(__strncasecmp) > jne 1f > call __init_cpu_features > 1: > -# ifdef HAVE_AVX_SUPPORT > - leaq __strncasecmp_avx(%rip), %rax > - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) > +# ifdef HAVE_AVX2_SUPPORT > + > + leaq __strncasecmp_avx2(%rip), %rax > + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) > jnz 3f > # endif > - testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip) > - jnz 2f > - leaq __strncasecmp_sse42(%rip), %rax > - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) > - jnz 3f > + leaq __strncasecmp_sse2_unaligned(%rip), %rax > + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) > + jnz 3f > + > 2: leaq __strncasecmp_ssse3(%rip), %rax > testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) > jnz 3f > leaq __strncasecmp_sse2(%rip), %rax > 3: ret > + > END(__strncasecmp) > weak_alias (__strncasecmp, strncasecmp) > # endif > > -# undef LABEL > -# define LABEL(l) .L##l##_sse42 > -# define GLABEL(l) l##_sse42 > -# define SECTION sse4.2 > -# include "strcmp-sse42.S" > - > - > -# ifdef HAVE_AVX_SUPPORT > -# if defined USE_AS_STRCASECMP_L || 
defined USE_AS_STRNCASECMP_L > -# define LABEL(l) .L##l##_avx > -# define GLABEL(l) l##_avx > -# define USE_AVX 1 > -# undef STRCMP_SSE42 > -# define STRCMP_SSE42 STRCMP_AVX > -# define SECTION avx > -# include "strcmp-sse42.S" > -# endif > -# endif > - > - > # undef ENTRY > # define ENTRY(name) \ > .type STRCMP_SSE2, @function; \ > diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S > new file mode 100644 > index 0000000..809b966 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S > @@ -0,0 +1,6 @@ > +#define AS_STRCASECMP > +#define AS_STRNCMP > +#define USE_AVX2 > +#define __strncasecmp_sse2_unaligned __strncasecmp_avx2 > +#define STRCMP __strncasecmp_avx2_l > +#include "strcmp-sse2-unaligned.S" > diff --git a/sysdeps/x86_64/multiarch/strncase_l-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncase_l-sse2-unaligned.S > new file mode 100644 > index 0000000..a372ed4 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strncase_l-sse2-unaligned.S > @@ -0,0 +1,4 @@ > +#define AS_STRCASECMP > +#define AS_STRNCMP > +#define STRCMP __strncasecmp_sse2_unaligned_l > +#include "strcmp-sse2-unaligned.S" > diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S > new file mode 100644 > index 0000000..2d9a032 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S > @@ -0,0 +1,4 @@ > +#define USE_AVX2 > +#define AS_STRNCMP > +#define STRCMP __strncmp_avx2 > +#include "strcmp-sse2-unaligned.S" > diff --git a/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S > new file mode 100644 > index 0000000..7f9a5fd > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S > @@ -0,0 +1,3 @@ > +#define AS_STRNCMP > +#define STRCMP __strncmp_sse2_unaligned > +#include "strcmp-sse2-unaligned.S" > -- > 1.8.4.rc3
diff --git a/sysdeps/x86_64/locale-defines.sym b/sysdeps/x86_64/locale-defines.sym index aebff9a..804debb 100644 --- a/sysdeps/x86_64/locale-defines.sym +++ b/sysdeps/x86_64/locale-defines.sym @@ -8,4 +8,5 @@ LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales) LC_CTYPE _NL_CTYPE_NONASCII_CASE LOCALE_DATA_VALUES offsetof (struct __locale_data, values) +LOCALE_TOLOWER offsetof (struct __locale_struct, __ctype_tolower) SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0]) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 679db2a..8094162 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -7,12 +7,13 @@ endif ifeq ($(subdir),string) sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ - strcmp-sse2-unaligned strncmp-ssse3 \ + strcmp-sse2-unaligned strncmp-sse2-unaligned strncmp-ssse3 \ memcpy-ssse3 \ memcpy-sse2-unaligned mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \ memmove-ssse3-back strcasecmp_l-ssse3 \ + strcasecmp_l-sse2-unaligned strncase_l-sse2-unaligned \ strncase_l-ssse3 strcat-ssse3 strncat-ssse3\ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ @@ -29,7 +30,7 @@ CFLAGS-strspn-c.c += -msse4 endif ifeq (yes,$(config-cflags-avx2)) -sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 +sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 strcmp-avx2 strncmp-avx2 strcasecmp_l-avx2 strncase_l-avx2 endif endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index b3dbe65..8c71030 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -94,20 +94,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. 
*/ IFUNC_IMPL (i, name, strcasecmp, - IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_AVX, - __strcasecmp_avx) - IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2, - __strcasecmp_sse42) + IFUNC_IMPL_ADD (array, i, strcasecmp, 1, + __strcasecmp_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_AVX2, + __strcasecmp_avx2) IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3, __strcasecmp_ssse3) IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */ IFUNC_IMPL (i, name, strcasecmp_l, - IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_AVX, - __strcasecmp_l_avx) - IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSE4_2, - __strcasecmp_l_sse42) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, + __strcasecmp_sse2_unaligned_l) IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSSE3, __strcasecmp_l_ssse3) IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, @@ -130,7 +128,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strcmp.S. */ IFUNC_IMPL (i, name, strcmp, - IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42) + IFUNC_IMPL_ADD (array, i, strcmp, HAS_AVX2, __strcmp_avx2) IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) @@ -150,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strncase_l.S. 
*/ IFUNC_IMPL (i, name, strncasecmp, - IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_AVX, - __strncasecmp_avx) - IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSE4_2, - __strncasecmp_sse42) + IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_AVX2, + __strncasecmp_avx2) + IFUNC_IMPL_ADD (array, i, strncasecmp, 1, + __strncasecmp_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncasecmp, HAS_SSSE3, __strncasecmp_ssse3) IFUNC_IMPL_ADD (array, i, strncasecmp, 1, @@ -161,10 +159,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strncase_l.S. */ IFUNC_IMPL (i, name, strncasecmp_l, - IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_AVX, - __strncasecmp_l_avx) - IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_SSE4_2, - __strncasecmp_l_sse42) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_AVX2, + __strncasecmp_avx2_l) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, + __strncasecmp_sse2_unaligned_l) IFUNC_IMPL_ADD (array, i, strncasecmp_l, HAS_SSSE3, __strncasecmp_l_ssse3) IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, @@ -261,8 +259,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strncmp.S. 
*/ IFUNC_IMPL (i, name, strncmp, - IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2, - __strncmp_sse42) + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strncmp, HAS_AVX2, __strncmp_avx2) + IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSSE3, __strncmp_ssse3) IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S new file mode 100644 index 0000000..d10379f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S @@ -0,0 +1,5 @@ +#define AS_STRCASECMP +#define USE_AVX2 +#define __strcasecmp_sse2_unaligned __strcasecmp_avx2 +#define STRCMP __strcasecmp_avx2_l +#include "strcmp-sse2-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S new file mode 100644 index 0000000..e2ed03f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define AS_STRCASECMP +#define STRCMP __strcasecmp_sse2_unaligned_l +#include "strcmp-sse2-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S new file mode 100644 index 0000000..606df63 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S @@ -0,0 +1,3 @@ +#define USE_AVX2 +#define STRCMP __strcmp_avx2 +#include "strcmp-sse2-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S index 20b65fa..ef67fb0 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S @@ -18,29 +18,127 @@ #include "sysdep.h" -ENTRY ( __strcmp_sse2_unaligned) - movl %edi, %eax - xorl %edx, %edx +#ifndef STRCMP +# define STRCMP __strcmp_sse2_unaligned +#endif + +#ifdef AS_STRCASECMP +# include "locale-defines.h" + +# ifdef AS_STRNCMP +ENTRY (__strncasecmp_sse2_unaligned) + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax + 
mov %fs:(%rax), %rcx + // XXX 5 byte should be before the function + /* 5-byte NOP. */ + .byte 0x0f,0x1f,0x44,0x00,0x00 + +END (__strncasecmp_sse2_unaligned) + +ENTRY (STRCMP) + test %rdx, %rdx + je L(ret_zero) + mov LOCALE_TOLOWER(%rcx), %r11 +# else +ENTRY (__strcasecmp_sse2_unaligned) + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax + mov %fs:(%rax), %rdx + // XXX 5 byte should be before the function + /* 5-byte NOP. */ + .byte 0x0f,0x1f,0x44,0x00,0x00 + +END (__strcasecmp_sse2_unaligned) + +ENTRY (STRCMP) + mov LOCALE_TOLOWER(%rdx), %r11 +# endif + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + movl (%r11,%rax,4), %eax + subl (%r11,%rcx,4), %eax + je L(next) +L(return): + ret +L(next): + test %ecx, %ecx + je L(return) + leaq 1(%rsi), %rsi + leaq 1(%rdi), %rdi +#ifdef AS_STRNCMP + sub $1, %rdx +#endif + +#else +ENTRY (STRCMP) +#endif + +#ifdef AS_STRNCMP + lea -1(%rdx), %r10 + test %rdx, %rdx + je L(ret_zero) +L(back_to_start): + xor %rdx, %rdx +#endif + pxor %xmm7, %xmm7 - orl %esi, %eax + movl %esi, %eax + andl $4095, %eax + cmpl $4032, %eax + jg L(cross_page) + + movl %edi, %eax andl $4095, %eax cmpl $4032, %eax jg L(cross_page) +#ifdef AS_STRNCMP + cmp $64, %r10 + jae L(dont_set_mask) + bts %r10, %rdx +L(dont_set_mask): +#endif + movdqu (%rdi), %xmm1 movdqu (%rsi), %xmm0 pcmpeqb %xmm1, %xmm0 pminub %xmm1, %xmm0 - pxor %xmm1, %xmm1 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - testq %rax, %rax + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %ecx +#ifdef AS_STRNCMP + or %dx, %cx +#else + test %ecx, %ecx +#endif je L(next_48_bytes) -L(return): - bsfq %rax, %rdx +#ifdef AS_STRCASECMP +L(caseloop1): + bsf %ecx, %r9d + movzbl (%rdi,%r9), %eax + movzbl (%rsi,%r9), %r8d + movl (%r11,%rax,4), %eax + subl (%r11,%r8,4), %eax + jne L(return) + test %r8d, %r8d + je L(return) +# ifdef AS_STRNCMP + cmp %r9, %r10 + je L(return) +# endif + leaq -1(%rcx), %rax + andq %rax, %rcx + je L(next_48_bytes) + jmp L(caseloop1) +#else + bsf %ecx, %edx movzbl (%rdi, %rdx), %eax movzbl (%rsi, 
%rdx), %edx subl %edx, %eax ret +#endif +#ifdef AS_STRNCMP + L(ret_zero): + xor %eax, %eax + ret +#endif .p2align 4 L(next_48_bytes): @@ -50,49 +148,108 @@ L(next_48_bytes): pcmpeqb %xmm6, %xmm3 movdqu 32(%rsi), %xmm2 pminub %xmm6, %xmm3 - pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm7, %xmm3 movdqu 48(%rdi), %xmm4 pcmpeqb %xmm5, %xmm2 - pmovmskb %xmm3, %edx movdqu 48(%rsi), %xmm0 pminub %xmm5, %xmm2 - pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm7, %xmm2 pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm2, %eax - salq $16, %rdx - pminub %xmm4, %xmm0 - pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %eax salq $32, %rax +#ifdef AS_STRNCMP + or %rdx, %rax +#endif + pmovmskb %xmm3, %edx + sal $16, %edx + pminub %xmm4, %xmm0 + pcmpeqb %xmm7, %xmm0 orq %rdx, %rax - pmovmskb %xmm0, %ecx - movq %rcx, %rdx - salq $48, %rdx - orq %rdx, %rax + pmovmskb %xmm0, %ecx + salq $48, %rcx + orq %rax, %rcx + je L(main_loop_header) +#ifdef AS_STRCASECMP +L(caseloop2): + bsf %rcx, %r9 + movzbl (%rdi,%r9), %eax + movzbl (%rsi,%r9), %r8d + movl (%r11,%rax,4), %eax + subl (%r11,%r8,4), %eax jne L(return) + test %r8d, %r8d + je L(return) +# ifdef AS_STRNCMP + cmp %r9, %r10 + je L(return) +# endif + leaq -1(%rcx), %rax + andq %rax, %rcx + je L(main_loop_header) + jmp L(caseloop2) +#else + bsf %rcx, %rdx + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax + ret +#endif + L(main_loop_header): +#ifdef USE_AVX2 + vpxor %xmm7, %xmm7, %xmm7 +#endif leaq 64(%rdi), %rdx - movl $4096, %ecx - pxor %xmm9, %xmm9 andq $-64, %rdx +# ifdef AS_STRNCMP + addq %rdi, %r10 + subq %rdx, %r10 +# endif subq %rdi, %rdx leaq (%rdi, %rdx), %rax addq %rsi, %rdx - movq %rdx, %rsi - andl $4095, %esi - subq %rsi, %rcx - shrq $6, %rcx - movq %rcx, %rsi - jmp L(loop_start) + movl $4096, %esi + mov %edx, %ecx + andl $4095, %ecx + sub %ecx, %esi + shr $6, %esi +#ifdef AS_STRNCMP + mov %r10, %r9 + addq %rdx, %r10 + shr $6, %r9 + cmp %r9, %rsi + jb L(dont_set_page_bound) + mov %r9, %rsi +L(dont_set_page_bound): +#endif .p2align 4 L(loop): + add 
$-1, %rsi + ja L(loop_cross_page) +L(back_to_loop): +#ifdef USE_AVX2 + vmovdqa (%rax), %ymm4 + vmovdqa 32(%rax), %ymm5 + vmovdqu (%rdx), %ymm0 + vmovdqu 32(%rdx), %ymm1 + vpcmpeqb %ymm4, %ymm0, %ymm0 + vpminub %ymm4, %ymm0, %ymm0 + vpcmpeqb %ymm5, %ymm1, %ymm1 + vpminub %ymm5, %ymm1, %ymm1 + vpminub %ymm0, %ymm1, %ymm2 + vpcmpeqb %ymm7, %ymm2, %ymm2 addq $64, %rax addq $64, %rdx -L(loop_start): - testq %rsi, %rsi - leaq -1(%rsi), %rsi - je L(loop_cross_page) -L(back_to_loop): + vpmovmskb %ymm2, %edi + test %edi, %edi + je L(loop) + shl $32, %rdi + vpcmpeqb %ymm7, %ymm0, %ymm0 + vpmovmskb %ymm0, %ecx + or %rdi, %rcx + vzeroupper +#else movdqu (%rdx), %xmm0 movdqu 16(%rdx), %xmm1 movdqa (%rax), %xmm2 @@ -104,61 +261,99 @@ L(back_to_loop): movdqu 48(%rdx), %xmm6 pminub %xmm3, %xmm1 movdqa 32(%rax), %xmm2 - pminub %xmm1, %xmm0 movdqa 48(%rax), %xmm3 pcmpeqb %xmm2, %xmm5 pcmpeqb %xmm3, %xmm6 + addq $64, %rax pminub %xmm2, %xmm5 pminub %xmm3, %xmm6 - pminub %xmm5, %xmm0 - pminub %xmm6, %xmm0 - pcmpeqb %xmm7, %xmm0 - pmovmskb %xmm0, %ecx + addq $64, %rdx + pminub %xmm5, %xmm6 + pminub %xmm1, %xmm6 + pminub %xmm0, %xmm6 + pcmpeqb %xmm7, %xmm6 + pmovmskb %xmm6, %ecx testl %ecx, %ecx je L(loop) - pcmpeqb %xmm7, %xmm5 - movdqu (%rdx), %xmm0 - pcmpeqb %xmm7, %xmm1 - movdqa (%rax), %xmm2 - pcmpeqb %xmm2, %xmm0 - pminub %xmm2, %xmm0 - pcmpeqb %xmm7, %xmm6 pcmpeqb %xmm7, %xmm0 - pmovmskb %xmm1, %ecx - pmovmskb %xmm5, %r8d - pmovmskb %xmm0, %edi - salq $16, %rcx + pcmpeqb %xmm7, %xmm1 + pcmpeqb %xmm7, %xmm5 + pmovmskb %xmm0, %edi + pmovmskb %xmm1, %r9d + pmovmskb %xmm5, %r8d + salq $48, %rcx salq $32, %r8 - pmovmskb %xmm6, %esi orq %r8, %rcx orq %rdi, %rcx - salq $48, %rsi - orq %rsi, %rcx + sal $16, %r9d + orq %r9, %rcx +#endif +#ifdef AS_STRCASECMP +L(caseloop3): + bsf %rcx, %r9 + movzbl -64(%rax,%r9), %edi + movzbl -64(%rdx,%r9), %r8d + movl (%r11,%rdi,4), %edi + subl (%r11,%r8,4), %edi + jne L(return2) + test %r8d, %r8d + je L(return2) + leaq -1(%rcx), %rdi + andq %rdi, %rcx + 
je L(loop) + jmp L(caseloop3) +L(return2): + mov %rdi, %rax + ret +#else bsfq %rcx, %rcx - movzbl (%rax, %rcx), %eax - movzbl (%rdx, %rcx), %edx + movzbl -64(%rax, %rcx), %eax + movzbl -64(%rdx, %rcx), %edx subl %edx, %eax ret +#endif .p2align 4 L(loop_cross_page): - xor %r10, %r10 - movq %rdx, %r9 - and $63, %r9 - subq %r9, %r10 - - movdqa (%rdx, %r10), %xmm0 - movdqa 16(%rdx, %r10), %xmm1 - movdqu (%rax, %r10), %xmm2 - movdqu 16(%rax, %r10), %xmm3 +#ifdef AS_STRNCMP + mov %r10, %r9 + sub %rdx, %r9 + cmp $64, %r9 + jb L(prepare_back_to_start) +#endif + + mov %edx, %ecx + and $63, %ecx + neg %rcx +#ifdef USE_AVX2 + vmovdqu (%rax, %rcx), %ymm4 + vmovdqu 32(%rax, %rcx), %ymm5 + vmovdqa (%rdx, %rcx), %ymm0 + vmovdqa 32(%rdx, %rcx), %ymm1 + vpcmpeqb %ymm4, %ymm0, %ymm0 + vpminub %ymm4, %ymm0, %ymm0 + vpcmpeqb %ymm5, %ymm1, %ymm1 + vpminub %ymm5, %ymm1, %ymm1 + vpminub %ymm0, %ymm1, %ymm2 + vpcmpeqb %ymm7, %ymm2, %ymm2 + vpmovmskb %ymm2, %esi + shl $32, %rsi + vpcmpeqb %ymm7, %ymm0, %ymm0 + vpmovmskb %ymm0, %edi + or %rsi, %rdi +#else + movdqa (%rdx, %rcx), %xmm0 + movdqa 16(%rdx, %rcx), %xmm1 + movdqu (%rax, %rcx), %xmm2 + movdqu 16(%rax, %rcx), %xmm3 pcmpeqb %xmm2, %xmm0 - movdqa 32(%rdx, %r10), %xmm5 + movdqa 32(%rdx, %rcx), %xmm5 pcmpeqb %xmm3, %xmm1 pminub %xmm2, %xmm0 - movdqa 48(%rdx, %r10), %xmm6 + movdqa 48(%rdx, %rcx), %xmm6 pminub %xmm3, %xmm1 - movdqu 32(%rax, %r10), %xmm2 - movdqu 48(%rax, %r10), %xmm3 + movdqu 32(%rax, %rcx), %xmm2 + movdqu 48(%rax, %rcx), %xmm3 pcmpeqb %xmm2, %xmm5 pcmpeqb %xmm3, %xmm6 pminub %xmm2, %xmm5 @@ -169,41 +364,143 @@ L(loop_cross_page): pcmpeqb %xmm7, %xmm5 pcmpeqb %xmm7, %xmm6 - pmovmskb %xmm1, %ecx - pmovmskb %xmm5, %r8d - pmovmskb %xmm0, %edi - salq $16, %rcx + pmovmskb %xmm1, %ecx + pmovmskb %xmm5, %r8d + pmovmskb %xmm0, %edi + sal $16, %ecx salq $32, %r8 - pmovmskb %xmm6, %esi + pmovmskb %xmm6, %esi orq %r8, %rdi orq %rcx, %rdi salq $48, %rsi orq %rsi, %rdi - movq %r9, %rcx - movq $63, %rsi +#endif + mov %edx, %ecx + mov 
$63, %esi +#ifdef AS_STRNCMP + shr $6, %r9 + sub $1, %r9 + cmp %r9, %rsi + jb L(dont_set_bound2) + mov %r9, %rsi +L(dont_set_bound2): +#endif shrq %cl, %rdi test %rdi, %rdi je L(back_to_loop) +#ifdef USE_AVX2 + vzeroupper +#endif + +#ifdef AS_STRCASECMP + mov %rdi, %rcx +L(caseloop4): + bsf %rcx, %r9 + movzbl (%rax,%r9), %edi + movzbl (%rdx,%r9), %r8d + movl (%r11,%rdi,4), %edi + subl (%r11,%r8,4), %edi + jne L(return2) + test %r8d, %r8d + je L(return2) + leaq -1(%rcx), %rdi + andq %rdi, %rcx + je L(back_to_loop) + jmp L(caseloop4) +#else bsfq %rdi, %rcx movzbl (%rax, %rcx), %eax movzbl (%rdx, %rcx), %edx subl %edx, %eax ret +#endif +#ifdef AS_STRNCMP +L(prepare_back_to_start): +# ifdef USE_AVX2 + vzeroupper +# endif + mov %r9, %r10 + mov %rdx, %rsi + mov %rax, %rdi + jmp L(back_to_start) +#endif + +L(cross_page): + xorl %edx, %edx .p2align 4 L(cross_page_loop): - cmpb %cl, %al - jne L(different) - addq $1, %rdx - cmpq $64, %rdx - je L(main_loop_header) -L(cross_page): movzbl (%rdi, %rdx), %eax movzbl (%rsi, %rdx), %ecx - testb %al, %al - jne L(cross_page_loop) - xorl %eax, %eax -L(different): +#ifdef AS_STRCASECMP + movl (%r11,%rax,4), %eax + subl (%r11,%rcx,4), %eax +#else + subl %ecx, %eax +#endif + jne L(different) +#ifdef AS_STRNCMP + cmp %rdx, %r10 + je L(different) +#endif + test %ecx, %ecx + je L(different) + + movzbl 1(%rdi, %rdx), %eax + movzbl 1(%rsi, %rdx), %ecx +#ifdef AS_STRCASECMP + movl (%r11,%rax,4), %eax + subl (%r11,%rcx,4), %eax +#else subl %ecx, %eax +#endif + jne L(different) +#ifdef AS_STRNCMP + lea 1(%rdx), %r9 + cmp %r9, %r10 + je L(different) +#endif + test %ecx, %ecx + je L(different) + + movzbl 2(%rdi, %rdx), %eax + movzbl 2(%rsi, %rdx), %ecx +#ifdef AS_STRCASECMP + movl (%r11,%rax,4), %eax + subl (%r11,%rcx,4), %eax +#else + subl %ecx, %eax +#endif + jne L(different) +#ifdef AS_STRNCMP + lea 2(%rdx), %r9 + cmp %r9, %r10 + je L(different) +#endif + test %ecx, %ecx + je L(different) + + movzbl 3(%rdi, %rdx), %eax + movzbl 3(%rsi, %rdx), 
%ecx +#ifdef AS_STRCASECMP + movl (%r11,%rax,4), %eax + subl (%r11,%rcx,4), %eax +#else + subl %ecx, %eax +#endif + jne L(different) +#ifdef AS_STRNCMP + lea 3(%rdx), %r9 + cmp %r9, %r10 + je L(different) +#endif + test %ecx, %ecx + je L(different) + + add $4, %edx + cmp $64, %edx + je L(main_loop_header) + jmp L(cross_page_loop) +L(different): ret -END (__strcmp_sse2_unaligned) +END (STRCMP) diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S deleted file mode 100644 index 4dff0a5..0000000 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S +++ /dev/null @@ -1,1792 +0,0 @@ -/* strcmp with SSE4.2 - Copyright (C) 2009-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - - -/* We use 0x1a: - _SIDD_SBYTE_OPS - | _SIDD_CMP_EQUAL_EACH - | _SIDD_NEGATIVE_POLARITY - | _SIDD_LEAST_SIGNIFICANT - on pcmpistri to find out if two 16byte data elements are the same - and the offset of the first different byte. There are 4 cases: - - 1. Both 16byte data elements are valid and identical. - 2. Both 16byte data elements have EOS and identical. - 3. Both 16byte data elements are valid and they differ at offset X. - 4. At least one 16byte data element has EOS at offset X. 
Two 16byte - data elements must differ at or before offset X. - - Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: - - case ECX CFlag ZFlag SFlag - 1 16 0 0 0 - 2 16 0 1 1 - 3 X 1 0 0 - 4 0 <= X 1 0/1 0/1 - - We exit from the loop for cases 2, 3 and 4 with jbe which branches - when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for - case 2. */ - - /* Put all SSE 4.2 functions together. */ - .section .text.SECTION,"ax",@progbits - .align 16 - .type STRCMP_SSE42, @function - .globl STRCMP_SSE42 - .hidden STRCMP_SSE42 -#ifdef USE_AS_STRCASECMP_L -ENTRY (GLABEL(__strcasecmp)) - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax - mov %fs:(%rax),%RDX_LP - - // XXX 5 byte should be before the function - /* 5-byte NOP. */ - .byte 0x0f,0x1f,0x44,0x00,0x00 -END (GLABEL(__strcasecmp)) - /* FALLTHROUGH to strcasecmp_l. */ -#endif -#ifdef USE_AS_STRNCASECMP_L -ENTRY (GLABEL(__strncasecmp)) - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax - mov %fs:(%rax),%RCX_LP - - // XXX 5 byte should be before the function - /* 5-byte NOP. */ - .byte 0x0f,0x1f,0x44,0x00,0x00 -END (GLABEL(__strncasecmp)) - /* FALLTHROUGH to strncasecmp_l. */ -#endif - - -#ifdef USE_AVX -# define movdqa vmovdqa -# define movdqu vmovdqu -# define pmovmskb vpmovmskb -# define pcmpistri vpcmpistri -# define psubb vpsubb -# define pcmpeqb vpcmpeqb -# define psrldq vpsrldq -# define pslldq vpslldq -# define palignr vpalignr -# define pxor vpxor -# define D(arg) arg, arg -#else -# define D(arg) arg -#endif - -STRCMP_SSE42: - cfi_startproc - CALL_MCOUNT - -/* - * This implementation uses SSE to compare up to 16 bytes at a time. - */ -#ifdef USE_AS_STRCASECMP_L - /* We have to fall back on the C implementation for locales - with encodings not matching ASCII for single bytes. 
*/ -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP -# else - mov (%rdx), %RAX_LP -# endif - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) - jne __strcasecmp_l_nonascii -#endif -#ifdef USE_AS_STRNCASECMP_L - /* We have to fall back on the C implementation for locales - with encodings not matching ASCII for single bytes. */ -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP -# else - mov (%rcx), %RAX_LP -# endif - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) - jne __strncasecmp_l_nonascii -#endif - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - test %rdx, %rdx - je LABEL(strcmp_exitz) - cmp $1, %rdx - je LABEL(Byte0) - mov %rdx, %r11 -#endif - mov %esi, %ecx - mov %edi, %eax -/* Use 64bit AND here to avoid long NOP padding. */ - and $0x3f, %rcx /* rsi alignment in cache line */ - and $0x3f, %rax /* rdi alignment in cache line */ -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L - .section .rodata.cst16,"aM",@progbits,16 - .align 16 -LABEL(belowupper): - .quad 0x4040404040404040 - .quad 0x4040404040404040 -LABEL(topupper): -# ifdef USE_AVX - .quad 0x5a5a5a5a5a5a5a5a - .quad 0x5a5a5a5a5a5a5a5a -# else - .quad 0x5b5b5b5b5b5b5b5b - .quad 0x5b5b5b5b5b5b5b5b -# endif -LABEL(touppermask): - .quad 0x2020202020202020 - .quad 0x2020202020202020 - .previous - movdqa LABEL(belowupper)(%rip), %xmm4 -# define UCLOW_reg %xmm4 - movdqa LABEL(topupper)(%rip), %xmm5 -# define UCHIGH_reg %xmm5 - movdqa LABEL(touppermask)(%rip), %xmm6 -# define LCQWORD_reg %xmm6 -#endif - cmp $0x30, %ecx - ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ - cmp $0x30, %eax - ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ - movdqu (%rdi), %xmm1 - movdqu (%rsi), %xmm2 -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L -# ifdef USE_AVX -# define TOLOWER(reg1, reg2) \ 
- vpcmpgtb UCLOW_reg, reg1, %xmm7; \ - vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ - vpcmpgtb UCLOW_reg, reg2, %xmm9; \ - vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ - vpandn %xmm7, %xmm8, %xmm8; \ - vpandn %xmm9, %xmm10, %xmm10; \ - vpand LCQWORD_reg, %xmm8, %xmm8; \ - vpand LCQWORD_reg, %xmm10, %xmm10; \ - vpor reg1, %xmm8, reg1; \ - vpor reg2, %xmm10, reg2 -# else -# define TOLOWER(reg1, reg2) \ - movdqa reg1, %xmm7; \ - movdqa UCHIGH_reg, %xmm8; \ - movdqa reg2, %xmm9; \ - movdqa UCHIGH_reg, %xmm10; \ - pcmpgtb UCLOW_reg, %xmm7; \ - pcmpgtb reg1, %xmm8; \ - pcmpgtb UCLOW_reg, %xmm9; \ - pcmpgtb reg2, %xmm10; \ - pand %xmm8, %xmm7; \ - pand %xmm10, %xmm9; \ - pand LCQWORD_reg, %xmm7; \ - pand LCQWORD_reg, %xmm9; \ - por %xmm7, reg1; \ - por %xmm9, reg2 -# endif - TOLOWER (%xmm1, %xmm2) -#else -# define TOLOWER(reg1, reg2) -#endif - pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ - pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ - psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ - pmovmskb %xmm1, %edx - sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ - jnz LABEL(less16bytes)/* If not, find different value or null char */ -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz)/* finish comparison */ -#endif - add $16, %rsi /* prepare to search next 16 bytes */ - add $16, %rdi /* prepare to search next 16 bytes */ - - /* - * Determine source and destination string offsets from 16-byte - * alignment. Use relative offset difference between the two to - * determine which case below to use. 
- */ - .p2align 4 -LABEL(crosscache): - and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ - and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ - mov $0xffff, %edx /* for equivalent offset */ - xor %r8d, %r8d - and $0xf, %ecx /* offset of rsi */ - and $0xf, %eax /* offset of rdi */ - pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ - cmp %eax, %ecx - je LABEL(ashr_0) /* rsi and rdi relative offset same */ - ja LABEL(bigger) - mov %edx, %r8d /* r8d is offset flag for exit tail */ - xchg %ecx, %eax - xchg %rsi, %rdi -LABEL(bigger): - movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - lea 15(%rax), %r9 - sub %rcx, %r9 - lea LABEL(unaligned_table)(%rip), %r10 - movslq (%r10, %r9,4), %r9 - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ - lea (%r10, %r9), %r10 - jmp *%r10 /* jump to corresponding case */ - -/* - * The following cases will be handled by ashr_0 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(0~15) n(0~15) 15(15+ n-n) ashr_0 - */ - .p2align 4 -LABEL(ashr_0): - - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ -#else - movdqa (%rdi), %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ -#endif - psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ - pmovmskb %xmm1, %r9d - shr %cl, %edx /* adjust 0xffff for offset */ - shr %cl, %r9d /* adjust for 16-byte offset */ - sub %r9d, %edx - /* - * edx must be the same with r9d if in left byte (16-rcx) is equal to - * the start from (16-rax) and no null char was seen. - */ - jne LABEL(less32bytes) /* mismatch or null char */ - UPDATE_STRNCMP_COUNTER - mov $16, %rcx - mov $16, %r9 - - /* - * Now both strings are aligned at 16-byte boundary. Loop over strings - * checking 32-bytes per iteration. 
- */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - .p2align 4 -LABEL(ashr_0_use): - movdqa (%rdi,%rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - lea 16(%rdx), %rdx - jbe LABEL(ashr_0_exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - movdqa (%rdi,%rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - lea 16(%rdx), %rdx - jbe LABEL(ashr_0_exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - jmp LABEL(ashr_0_use) - - - .p2align 4 -LABEL(ashr_0_exit_use): - jnc LABEL(strcmp_exitz) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub %rcx, %r11 - jbe LABEL(strcmp_exitz) -#endif - lea -16(%rdx, %rcx), %rcx - movzbl (%rdi, %rcx), %eax - movzbl (%rsi, %rcx), %edx -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx - movl (%rcx,%rax,4), %eax - movl (%rcx,%rdx,4), %edx -#endif - sub %edx, %eax - ret - - - -/* - * The following cases will be handled by ashr_1 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(15) n -15 0(15 +(n-15) - n) ashr_1 - */ - .p2align 4 -LABEL(ashr_1): - pslldq $15, D(%xmm2) /* shift first string to align with second */ - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ - psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ - pmovmskb %xmm2, %r9d - shr %cl, %edx /* adjust 0xffff for offset */ - shr %cl, %r9d /* adjust for 16-byte offset */ - sub %r9d, %edx - jnz LABEL(less32bytes) /* mismatch or null char seen */ - 
movdqa (%rdi), %xmm3 - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads*/ - mov $1, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 1(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_1_use): - add $16, %r10 - jg LABEL(nibble_ashr_1_use) - -LABEL(nibble_ashr_1_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $1, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_1_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $1, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_1_use) - - .p2align 4 -LABEL(nibble_ashr_1_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $1, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $14, %ecx - ja LABEL(nibble_ashr_1_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_2 - * rcx(offset of rsi) rax(offset of rdi) relative 
offset corresponding case - * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 - */ - .p2align 4 -LABEL(ashr_2): - pslldq $14, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $2, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 2(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_2_use): - add $16, %r10 - jg LABEL(nibble_ashr_2_use) - -LABEL(nibble_ashr_2_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $2, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_2_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $2, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_2_use) - - .p2align 4 -LABEL(nibble_ashr_2_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $2, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined 
USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $13, %ecx - ja LABEL(nibble_ashr_2_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_3 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 - */ - .p2align 4 -LABEL(ashr_3): - pslldq $13, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $3, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 3(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - -LABEL(loop_ashr_3_use): - add $16, %r10 - jg LABEL(nibble_ashr_3_use) - -LABEL(nibble_ashr_3_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $3, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_3_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $3, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined 
USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_3_use) - - .p2align 4 -LABEL(nibble_ashr_3_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $3, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $12, %ecx - ja LABEL(nibble_ashr_3_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_4 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 - */ - .p2align 4 -LABEL(ashr_4): - pslldq $12, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $4, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 4(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_4_use): - add $16, %r10 - jg LABEL(nibble_ashr_4_use) - -LABEL(nibble_ashr_4_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $4, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_4_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $4, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_4_use) - - .p2align 4 -LABEL(nibble_ashr_4_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $4, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $11, %ecx - ja LABEL(nibble_ashr_4_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_5 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 - */ - .p2align 4 -LABEL(ashr_5): - pslldq $11, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa 
(%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $5, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 5(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_5_use): - add $16, %r10 - jg LABEL(nibble_ashr_5_use) - -LABEL(nibble_ashr_5_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $5, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_5_use) - - movdqa (%rdi, %rdx), %xmm0 - - palignr $5, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_5_use) - - .p2align 4 -LABEL(nibble_ashr_5_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $5, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $10, %ecx - ja LABEL(nibble_ashr_5_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_6 - * rcx(offset of rsi) rax(offset of rdi) relative 
offset corresponding case - * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 - */ - .p2align 4 -LABEL(ashr_6): - pslldq $10, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $6, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 6(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_6_use): - add $16, %r10 - jg LABEL(nibble_ashr_6_use) - -LABEL(nibble_ashr_6_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $6, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_6_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $6, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_6_use) - - .p2align 4 -LABEL(nibble_ashr_6_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $6, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if 
defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $9, %ecx - ja LABEL(nibble_ashr_6_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_7 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 - */ - .p2align 4 -LABEL(ashr_7): - pslldq $9, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $7, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 7(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_7_use): - add $16, %r10 - jg LABEL(nibble_ashr_7_use) - -LABEL(nibble_ashr_7_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $7, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_7_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $7, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if 
defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_7_use) - - .p2align 4 -LABEL(nibble_ashr_7_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $7, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $8, %ecx - ja LABEL(nibble_ashr_7_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_8 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 - */ - .p2align 4 -LABEL(ashr_8): - pslldq $8, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $8, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 8(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_8_use): - add $16, %r10 - jg LABEL(nibble_ashr_8_use) - -LABEL(nibble_ashr_8_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $8, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_8_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $8, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_8_use) - - .p2align 4 -LABEL(nibble_ashr_8_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $8, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $7, %ecx - ja LABEL(nibble_ashr_8_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_9 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 - */ - .p2align 4 -LABEL(ashr_9): - pslldq $7, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa 
(%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $9, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 9(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_9_use): - add $16, %r10 - jg LABEL(nibble_ashr_9_use) - -LABEL(nibble_ashr_9_restart_use): - movdqa (%rdi, %rdx), %xmm0 - - palignr $9, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_9_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $9, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_9_use) - - .p2align 4 -LABEL(nibble_ashr_9_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $9, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $6, %ecx - ja LABEL(nibble_ashr_9_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_10 - * rcx(offset of rsi) rax(offset of rdi) relative 
offset corresponding case - * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 - */ - .p2align 4 -LABEL(ashr_10): - pslldq $6, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $10, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 10(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_10_use): - add $16, %r10 - jg LABEL(nibble_ashr_10_use) - -LABEL(nibble_ashr_10_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $10, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_10_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $10, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_10_use) - - .p2align 4 -LABEL(nibble_ashr_10_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $10, D(%xmm0) - pcmpistri $0x3a,%xmm0, 
%xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $5, %ecx - ja LABEL(nibble_ashr_10_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_11 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 - */ - .p2align 4 -LABEL(ashr_11): - pslldq $5, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $11, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 11(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_11_use): - add $16, %r10 - jg LABEL(nibble_ashr_11_use) - -LABEL(nibble_ashr_11_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $11, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_11_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $11, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif 
- jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_11_use) - - .p2align 4 -LABEL(nibble_ashr_11_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $11, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $4, %ecx - ja LABEL(nibble_ashr_11_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_12 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 - */ - .p2align 4 -LABEL(ashr_12): - pslldq $4, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $12, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 12(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_12_use): - add $16, %r10 - jg LABEL(nibble_ashr_12_use) - -LABEL(nibble_ashr_12_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $12, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_12_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $12, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_12_use) - - .p2align 4 -LABEL(nibble_ashr_12_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $12, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $3, %ecx - ja LABEL(nibble_ashr_12_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_13 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 - */ - .p2align 4 -LABEL(ashr_13): - pslldq $3, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz 
LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $13, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 13(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_13_use): - add $16, %r10 - jg LABEL(nibble_ashr_13_use) - -LABEL(nibble_ashr_13_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $13, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_13_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $13, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_13_use) - - .p2align 4 -LABEL(nibble_ashr_13_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $13, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $2, %ecx - ja LABEL(nibble_ashr_13_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_14 - * 
rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 - */ - .p2align 4 -LABEL(ashr_14): - pslldq $2, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $14, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 14(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_14_use): - add $16, %r10 - jg LABEL(nibble_ashr_14_use) - -LABEL(nibble_ashr_14_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $14, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_14_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $14, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_14_use) - - .p2align 4 -LABEL(nibble_ashr_14_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - 
psrldq $14, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $1, %ecx - ja LABEL(nibble_ashr_14_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_15 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 - */ - .p2align 4 -LABEL(ashr_15): - pslldq $1, D(%xmm2) - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $15, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 15(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - - sub $0x1000, %r10 /* subtract 4K pagesize */ - - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_15_use): - add $16, %r10 - jg LABEL(nibble_ashr_15_use) - -LABEL(nibble_ashr_15_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $15, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_15_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $15, -16(%rdi, %rdx), D(%xmm0) -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER 
(%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_15_use) - - .p2align 4 -LABEL(nibble_ashr_15_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $15, D(%xmm0) - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $0, %ecx - ja LABEL(nibble_ashr_15_restart_use) - -LABEL(nibble_ashr_exit_use): -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - .p2align 4 -LABEL(exit_use): - jnc LABEL(strcmp_exitz) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub %rcx, %r11 - jbe LABEL(strcmp_exitz) -#endif - add %rcx, %rdx - lea -16(%rdi, %r9), %rdi - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %edx - test %r8d, %r8d - jz LABEL(ret_use) - xchg %eax, %edx -LABEL(ret_use): -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx - movl (%rcx,%rdx,4), %edx - movl (%rcx,%rax,4), %eax -#endif - - sub %edx, %eax - ret - -LABEL(less32bytes): - lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ - lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ - test %r8d, %r8d - jz LABEL(ret) - xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ - - .p2align 4 -LABEL(ret): -LABEL(less16bytes): - bsf %rdx, %rdx /* find and store bit index in %rdx */ - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub %rdx, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzbl (%rsi, %rdx), %ecx - movzbl (%rdi, %rdx), %eax - -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L - leaq 
_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx - movl (%rdx,%rcx,4), %ecx - movl (%rdx,%rax,4), %eax -#endif - - sub %ecx, %eax - ret - -LABEL(strcmp_exitz): - xor %eax, %eax - ret - - .p2align 4 - // XXX Same as code above -LABEL(Byte0): - movzx (%rsi), %ecx - movzx (%rdi), %eax - -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx - movl (%rdx,%rcx,4), %ecx - movl (%rdx,%rax,4), %eax -#endif - - sub %ecx, %eax - ret - cfi_endproc - .size STRCMP_SSE42, .-STRCMP_SSE42 - -#undef UCLOW_reg -#undef UCHIGH_reg -#undef LCQWORD_reg -#undef TOLOWER - - /* Put all SSE 4.2 functions together. */ - .section .rodata.SECTION,"a",@progbits - .p2align 3 -LABEL(unaligned_table): - .int LABEL(ashr_1) - LABEL(unaligned_table) - .int LABEL(ashr_2) - LABEL(unaligned_table) - .int LABEL(ashr_3) - LABEL(unaligned_table) - .int LABEL(ashr_4) - LABEL(unaligned_table) - .int LABEL(ashr_5) - LABEL(unaligned_table) - .int LABEL(ashr_6) - LABEL(unaligned_table) - .int LABEL(ashr_7) - LABEL(unaligned_table) - .int LABEL(ashr_8) - LABEL(unaligned_table) - .int LABEL(ashr_9) - LABEL(unaligned_table) - .int LABEL(ashr_10) - LABEL(unaligned_table) - .int LABEL(ashr_11) - LABEL(unaligned_table) - .int LABEL(ashr_12) - LABEL(unaligned_table) - .int LABEL(ashr_13) - LABEL(unaligned_table) - .int LABEL(ashr_14) - LABEL(unaligned_table) - .int LABEL(ashr_15) - LABEL(unaligned_table) - .int LABEL(ashr_0) - LABEL(unaligned_table) - -#undef LABEL -#undef GLABEL -#undef SECTION -#undef movdqa -#undef movdqu -#undef pmovmskb -#undef pcmpistri -#undef psubb -#undef pcmpeqb -#undef psrldq -#undef pslldq -#undef palignr -#undef pxor -#undef D diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S index f50f26c..63aa62e 100644 --- a/sysdeps/x86_64/multiarch/strcmp.S +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -31,8 +31,8 @@ test %r9, %r9; \ je LABEL(strcmp_exitz); \ mov %r9, %r11 - -# define STRCMP_SSE42 __strncmp_sse42 +# define 
STRCMP_AVX2 __strncmp_avx2 +# define STRCMP_SSE2_UNALIGNED __strncmp_sse2_unaligned # define STRCMP_SSSE3 __strncmp_ssse3 # define STRCMP_SSE2 __strncmp_sse2 # define __GI_STRCMP __GI_strncmp @@ -40,9 +40,8 @@ # include "locale-defines.h" # define UPDATE_STRNCMP_COUNTER - -# define STRCMP_AVX __strcasecmp_l_avx -# define STRCMP_SSE42 __strcasecmp_l_sse42 +# define STRCMP_AVX2 __strcasecmp_avx2_l +# define STRCMP_SSE2_UNALIGNED __strcasecmp_sse2_unaligned_l # define STRCMP_SSSE3 __strcasecmp_l_ssse3 # define STRCMP_SSE2 __strcasecmp_l_sse2 # define __GI_STRCMP __GI___strcasecmp_l @@ -60,8 +59,8 @@ je LABEL(strcmp_exitz); \ mov %r9, %r11 -# define STRCMP_AVX __strncasecmp_l_avx -# define STRCMP_SSE42 __strncasecmp_l_sse42 +# define STRCMP_AVX2 __strncasecmp_avx2_l +# define STRCMP_SSE2_UNALIGNED __strncasecmp_sse2_unaligned_l # define STRCMP_SSSE3 __strncasecmp_l_ssse3 # define STRCMP_SSE2 __strncasecmp_l_sse2 # define __GI_STRCMP __GI___strncasecmp_l @@ -69,8 +68,9 @@ # define USE_AS_STRCMP # define UPDATE_STRNCMP_COUNTER # ifndef STRCMP +# define STRCMP_AVX2 __strcmp_avx2 +# define STRCMP_SSE2_UNALIGNED __strcmp_sse2_unaligned # define STRCMP strcmp -# define STRCMP_SSE42 __strcmp_sse42 # define STRCMP_SSSE3 __strcmp_ssse3 # define STRCMP_SSE2 __strcmp_sse2 # define __GI_STRCMP __GI_strcmp @@ -89,17 +89,16 @@ ENTRY(STRCMP) jne 1f call __init_cpu_features 1: -#ifdef USE_AS_STRCMP - leaq __strcmp_sse2_unaligned(%rip), %rax +# ifdef HAVE_AVX2_SUPPORT + + leaq STRCMP_AVX2(%rip), %rax + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) + jnz 3f +# endif + leaq STRCMP_SSE2_UNALIGNED(%rip), %rax testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) jnz 3f -#else - testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip) - jnz 2f - leaq STRCMP_SSE42(%rip), %rax - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jnz 3f -#endif + 2: leaq 
STRCMP_SSSE3(%rip), %rax testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) jnz 3f @@ -115,21 +114,22 @@ ENTRY(__strcasecmp) jne 1f call __init_cpu_features 1: -# ifdef HAVE_AVX_SUPPORT - leaq __strcasecmp_avx(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) +# ifdef HAVE_AVX2_SUPPORT + + leaq __strcasecmp_avx2(%rip), %rax + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jnz 3f # endif - testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip) - jnz 2f - leaq __strcasecmp_sse42(%rip), %rax - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jnz 3f + leaq __strcasecmp_sse2_unaligned(%rip), %rax + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) + jnz 3f + 2: leaq __strcasecmp_ssse3(%rip), %rax testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) jnz 3f leaq __strcasecmp_sse2(%rip), %rax 3: ret + END(__strcasecmp) weak_alias (__strcasecmp, strcasecmp) # endif @@ -141,45 +141,26 @@ ENTRY(__strncasecmp) jne 1f call __init_cpu_features 1: -# ifdef HAVE_AVX_SUPPORT - leaq __strncasecmp_avx(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) +# ifdef HAVE_AVX2_SUPPORT + + leaq __strncasecmp_avx2(%rip), %rax + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jnz 3f # endif - testl $bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip) - jnz 2f - leaq __strncasecmp_sse42(%rip), %rax - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jnz 3f + leaq __strncasecmp_sse2_unaligned(%rip), %rax + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) + jnz 3f + 2: leaq __strncasecmp_ssse3(%rip), %rax testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) jnz 3f leaq __strncasecmp_sse2(%rip), %rax 3: ret + 
END(__strncasecmp) weak_alias (__strncasecmp, strncasecmp) # endif -# undef LABEL -# define LABEL(l) .L##l##_sse42 -# define GLABEL(l) l##_sse42 -# define SECTION sse4.2 -# include "strcmp-sse42.S" - - -# ifdef HAVE_AVX_SUPPORT -# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L -# define LABEL(l) .L##l##_avx -# define GLABEL(l) l##_avx -# define USE_AVX 1 -# undef STRCMP_SSE42 -# define STRCMP_SSE42 STRCMP_AVX -# define SECTION avx -# include "strcmp-sse42.S" -# endif -# endif - - # undef ENTRY # define ENTRY(name) \ .type STRCMP_SSE2, @function; \ diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S new file mode 100644 index 0000000..809b966 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S @@ -0,0 +1,6 @@ +#define AS_STRCASECMP +#define AS_STRNCMP +#define USE_AVX2 +#define __strncasecmp_sse2_unaligned __strncasecmp_avx2 +#define STRCMP __strncasecmp_avx2_l +#include "strcmp-sse2-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/strncase_l-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncase_l-sse2-unaligned.S new file mode 100644 index 0000000..a372ed4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncase_l-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define AS_STRCASECMP +#define AS_STRNCMP +#define STRCMP __strncasecmp_sse2_unaligned_l +#include "strcmp-sse2-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S new file mode 100644 index 0000000..2d9a032 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S @@ -0,0 +1,4 @@ +#define USE_AVX2 +#define AS_STRNCMP +#define STRCMP __strncmp_avx2 +#include "strcmp-sse2-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S new file mode 100644 index 0000000..7f9a5fd --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define AS_STRNCMP +#define STRCMP __strncmp_sse2_unaligned 
+#include "strcmp-sse2-unaligned.S"