Message ID | 20220712192808.335531-4-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v1,1/4] x86: Rename STRCASECMP_NONASCII macro to STRCASECMP_L_NONASCII | expand |
On Tue, Jul 12, 2022 at 12:28 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > This commit doesn't affect libc.so.6, it's just housekeeping to prepare > for adding explicit ISA level support. > > Tested build on x86_64 and x86_32 with/without multiarch. > --- > .../x86_64/multiarch/strcasecmp_l-sse4_2.S | 3 +- > sysdeps/x86_64/multiarch/strcmp-sse42.S | 1782 ----------------- > sysdeps/x86_64/multiarch/strcmp-sse4_2.S | 1763 +++++++++++++++- > sysdeps/x86_64/multiarch/strncase_l-sse4_2.S | 3 +- > sysdeps/x86_64/multiarch/strncmp-sse4_2.S | 7 +- > 5 files changed, 1766 insertions(+), 1792 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/strcmp-sse42.S > > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S > index 411ab7d283..ac03b95756 100644 > --- a/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S > +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S > @@ -16,6 +16,5 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#define STRCMP_SSE42 __strcasecmp_l_sse42 > #define USE_AS_STRCASECMP_L > -#include "strcmp-sse42.S" > +#include "strcmp-sse4_2.S" > diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S > deleted file mode 100644 > index 60313c647a..0000000000 > --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S > +++ /dev/null > @@ -1,1782 +0,0 @@ > -/* strcmp with SSE4.2 > - Copyright (C) 2009-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. 
> - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. */ > - > -#include <sysdep.h> > - > -#ifndef STRCMP_SSE42 > -# define STRCMP_SSE42 __strcmp_sse42 > -#endif > - > -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > -# include "locale-defines.h" > -#endif > - > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > -/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz > - if the new counter > the old one or is 0. */ > -# define UPDATE_STRNCMP_COUNTER \ > - /* calculate left number to compare */ \ > - lea -16(%rcx, %r11), %r9; \ > - cmp %r9, %r11; \ > - jb LABEL(strcmp_exitz); \ > - test %r9, %r9; \ > - je LABEL(strcmp_exitz); \ > - mov %r9, %r11 > -#else > -# define UPDATE_STRNCMP_COUNTER > -#endif > - > -#define SECTION sse4.2 > -#define GLABEL(l) l##_sse42 > - > -#define LABEL(l) .L##l > - > -/* We use 0x1a: > - _SIDD_SBYTE_OPS > - | _SIDD_CMP_EQUAL_EACH > - | _SIDD_NEGATIVE_POLARITY > - | _SIDD_LEAST_SIGNIFICANT > - on pcmpistri to find out if two 16byte data elements are the same > - and the offset of the first different byte. There are 4 cases: > - > - 1. Both 16byte data elements are valid and identical. > - 2. Both 16byte data elements have EOS and identical. > - 3. Both 16byte data elements are valid and they differ at offset X. > - 4. At least one 16byte data element has EOS at offset X. Two 16byte > - data elements must differ at or before offset X. 
> - > - Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: > - > - case ECX CFlag ZFlag SFlag > - 1 16 0 0 0 > - 2 16 0 1 1 > - 3 X 1 0 0 > - 4 0 <= X 1 0/1 0/1 > - > - We exit from the loop for cases 2, 3 and 4 with jbe which branches > - when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for > - case 2. */ > - > - /* Put all SSE 4.2 functions together. */ > - .section .text.SECTION,"ax",@progbits > - .align 16 > - .type STRCMP_SSE42, @function > - .globl STRCMP_SSE42 > -#ifdef USE_AS_STRCASECMP_L > -ENTRY (GLABEL(__strcasecmp)) > - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax > - mov %fs:(%rax),%RDX_LP > - > - /* Either 1 or 5 bytes (dependeing if CET is enabled). */ > - .p2align 4 > -END (GLABEL(__strcasecmp)) > - /* FALLTHROUGH to strcasecmp_l. */ > -#endif > -#ifdef USE_AS_STRNCASECMP_L > -ENTRY (GLABEL(__strncasecmp)) > - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax > - mov %fs:(%rax),%RCX_LP > - > - /* Either 1 or 5 bytes (dependeing if CET is enabled). */ > - .p2align 4 > -END (GLABEL(__strncasecmp)) > - /* FALLTHROUGH to strncasecmp_l. */ > -#endif > - > - > -#define arg arg > - > -STRCMP_SSE42: > - cfi_startproc > - _CET_ENDBR > - CALL_MCOUNT > - > -/* > - * This implementation uses SSE to compare up to 16 bytes at a time. > - */ > -#ifdef USE_AS_STRCASECMP_L > - /* We have to fall back on the C implementation for locales > - with encodings not matching ASCII for single bytes. */ > -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 > - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP > -# else > - mov (%rdx), %RAX_LP > -# endif > - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) > - jne __strcasecmp_l_nonascii > -#endif > -#ifdef USE_AS_STRNCASECMP_L > - /* We have to fall back on the C implementation for locales > - with encodings not matching ASCII for single bytes. 
*/ > -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 > - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP > -# else > - mov (%rcx), %RAX_LP > -# endif > - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) > - jne __strncasecmp_l_nonascii > -#endif > - > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - test %RDX_LP, %RDX_LP > - je LABEL(strcmp_exitz) > - cmp $1, %RDX_LP > - je LABEL(Byte0) > - mov %RDX_LP, %R11_LP > -#endif > - mov %esi, %ecx > - mov %edi, %eax > -/* Use 64bit AND here to avoid long NOP padding. */ > - and $0x3f, %rcx /* rsi alignment in cache line */ > - and $0x3f, %rax /* rdi alignment in cache line */ > -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > - .section .rodata.cst16,"aM",@progbits,16 > - .align 16 > -LABEL(lcase_min): > - .quad 0x3f3f3f3f3f3f3f3f > - .quad 0x3f3f3f3f3f3f3f3f > -LABEL(lcase_max): > - .quad 0x9999999999999999 > - .quad 0x9999999999999999 > -LABEL(case_add): > - .quad 0x2020202020202020 > - .quad 0x2020202020202020 > - .previous > - movdqa LABEL(lcase_min)(%rip), %xmm4 > -# define LCASE_MIN_reg %xmm4 > - movdqa LABEL(lcase_max)(%rip), %xmm5 > -# define LCASE_MAX_reg %xmm5 > - movdqa LABEL(case_add)(%rip), %xmm6 > -# define CASE_ADD_reg %xmm6 > -#endif > - cmp $0x30, %ecx > - ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ > - cmp $0x30, %eax > - ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ > - movdqu (%rdi), %xmm1 > - movdqu (%rsi), %xmm2 > -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > -# define TOLOWER(reg1, reg2) \ > - movdqa LCASE_MIN_reg, %xmm7; \ > - movdqa LCASE_MIN_reg, %xmm8; \ > - paddb reg1, %xmm7; \ > - paddb reg2, %xmm8; \ > - pcmpgtb LCASE_MAX_reg, %xmm7; \ > - pcmpgtb LCASE_MAX_reg, %xmm8; \ > - pandn CASE_ADD_reg, %xmm7; \ > - pandn CASE_ADD_reg, %xmm8; \ > - paddb %xmm7, reg1; \ > - paddb %xmm8, reg2 > - > - TOLOWER (%xmm1, %xmm2) > -#else > -# define TOLOWER(reg1, reg2) > -#endif > - 
pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ > - pcmpeqb %xmm1, %xmm0 /* Any null chars? */ > - pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ > - psubb %xmm0, %xmm1 /* packed sub of comparison results*/ > - pmovmskb %xmm1, %edx > - sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ > - jnz LABEL(less16bytes)/* If not, find different value or null char */ > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz)/* finish comparison */ > -#endif > - add $16, %rsi /* prepare to search next 16 bytes */ > - add $16, %rdi /* prepare to search next 16 bytes */ > - > - /* > - * Determine source and destination string offsets from 16-byte > - * alignment. Use relative offset difference between the two to > - * determine which case below to use. > - */ > - .p2align 4 > -LABEL(crosscache): > - and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ > - and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ > - mov $0xffff, %edx /* for equivalent offset */ > - xor %r8d, %r8d > - and $0xf, %ecx /* offset of rsi */ > - and $0xf, %eax /* offset of rdi */ > - pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ > - cmp %eax, %ecx > - je LABEL(ashr_0) /* rsi and rdi relative offset same */ > - ja LABEL(bigger) > - mov %edx, %r8d /* r8d is offset flag for exit tail */ > - xchg %ecx, %eax > - xchg %rsi, %rdi > -LABEL(bigger): > - movdqa (%rdi), %xmm2 > - movdqa (%rsi), %xmm1 > - lea 15(%rax), %r9 > - sub %rcx, %r9 > - lea LABEL(unaligned_table)(%rip), %r10 > - movslq (%r10, %r9,4), %r9 > - pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ > - lea (%r10, %r9), %r10 > - _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ > - > -/* > - * The following cases will be handled by ashr_0 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(0~15) n(0~15) 15(15+ n-n) ashr_0 > - */ > - .p2align 4 > -LABEL(ashr_0): > - > - movdqa (%rsi), %xmm1 > - pcmpeqb %xmm1, %xmm0 /* Any null chars? */ > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ > -#else > - movdqa (%rdi), %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ > -#endif > - psubb %xmm0, %xmm1 /* packed sub of comparison results*/ > - pmovmskb %xmm1, %r9d > - shr %cl, %edx /* adjust 0xffff for offset */ > - shr %cl, %r9d /* adjust for 16-byte offset */ > - sub %r9d, %edx > - /* > - * edx must be the same with r9d if in left byte (16-rcx) is equal to > - * the start from (16-rax) and no null char was seen. > - */ > - jne LABEL(less32bytes) /* mismatch or null char */ > - UPDATE_STRNCMP_COUNTER > - mov $16, %rcx > - mov $16, %r9 > - > - /* > - * Now both strings are aligned at 16-byte boundary. Loop over strings > - * checking 32-bytes per iteration. 
> - */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - .p2align 4 > -LABEL(ashr_0_use): > - movdqa (%rdi,%rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - lea 16(%rdx), %rdx > - jbe LABEL(ashr_0_exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - movdqa (%rdi,%rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - lea 16(%rdx), %rdx > - jbe LABEL(ashr_0_exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - jmp LABEL(ashr_0_use) > - > - > - .p2align 4 > -LABEL(ashr_0_exit_use): > - jnc LABEL(strcmp_exitz) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub %rcx, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - lea -16(%rdx, %rcx), %rcx > - movzbl (%rdi, %rcx), %eax > - movzbl (%rsi, %rcx), %edx > -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx > - movl (%rcx,%rax,4), %eax > - movl (%rcx,%rdx,4), %edx > -#endif > - sub %edx, %eax > - ret > - > - > - > -/* > - * The following cases will be handled by ashr_1 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(15) n -15 0(15 +(n-15) - n) ashr_1 > - */ > - .p2align 4 > -LABEL(ashr_1): > - pslldq $15, %xmm2 /* shift first string to align with second */ > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ > - psubb %xmm0, %xmm2 /* packed sub of comparison results*/ > - pmovmskb %xmm2, %r9d > - shr %cl, %edx /* adjust 0xffff for offset 
*/ > - shr %cl, %r9d /* adjust for 16-byte offset */ > - sub %r9d, %edx > - jnz LABEL(less32bytes) /* mismatch or null char seen */ > - movdqa (%rdi), %xmm3 > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads*/ > - mov $1, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 1(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_1_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_1_use) > - > -LABEL(nibble_ashr_1_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $1, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_1_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $1, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_1_use) > - > - .p2align 4 > -LABEL(nibble_ashr_1_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $1, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > 
- cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $14, %ecx > - ja LABEL(nibble_ashr_1_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_2 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 > - */ > - .p2align 4 > -LABEL(ashr_2): > - pslldq $14, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, %xmm2 > - psubb %xmm0, %xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $2, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 2(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_2_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_2_use) > - > -LABEL(nibble_ashr_2_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $2, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_2_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $2, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, 
%xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_2_use) > - > - .p2align 4 > -LABEL(nibble_ashr_2_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $2, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $13, %ecx > - ja LABEL(nibble_ashr_2_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_3 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 > - */ > - .p2align 4 > -LABEL(ashr_3): > - pslldq $13, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, %xmm2 > - psubb %xmm0, %xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $3, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. 
> - */ > - lea 3(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > -LABEL(loop_ashr_3_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_3_use) > - > -LABEL(nibble_ashr_3_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $3, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_3_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $3, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_3_use) > - > - .p2align 4 > -LABEL(nibble_ashr_3_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $3, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $12, %ecx > - ja LABEL(nibble_ashr_3_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_4 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 > - */ > - .p2align 4 > -LABEL(ashr_4): > - pslldq $12, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, %xmm2 > - psubb %xmm0, 
%xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $4, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 4(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_4_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_4_use) > - > -LABEL(nibble_ashr_4_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $4, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_4_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $4, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_4_use) > - > - .p2align 4 > -LABEL(nibble_ashr_4_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $4, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, 
%rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $11, %ecx > - ja LABEL(nibble_ashr_4_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_5 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 > - */ > - .p2align 4 > -LABEL(ashr_5): > - pslldq $11, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, %xmm2 > - psubb %xmm0, %xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $5, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 5(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_5_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_5_use) > - > -LABEL(nibble_ashr_5_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $5, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_5_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - > - palignr $5, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) 
> - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_5_use) > - > - .p2align 4 > -LABEL(nibble_ashr_5_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $5, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $10, %ecx > - ja LABEL(nibble_ashr_5_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_6 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 > - */ > - .p2align 4 > -LABEL(ashr_6): > - pslldq $10, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, %xmm2 > - psubb %xmm0, %xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $6, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. 
> - */ > - lea 6(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_6_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_6_use) > - > -LABEL(nibble_ashr_6_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $6, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_6_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $6, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_6_use) > - > - .p2align 4 > -LABEL(nibble_ashr_6_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $6, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $9, %ecx > - ja LABEL(nibble_ashr_6_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_7 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 > - */ > - .p2align 4 > -LABEL(ashr_7): > - pslldq $9, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, %xmm2 > - 
psubb %xmm0, %xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $7, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 7(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_7_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_7_use) > - > -LABEL(nibble_ashr_7_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $7, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_7_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $7, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_7_use) > - > - .p2align 4 > -LABEL(nibble_ashr_7_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $7, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - 
cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $8, %ecx > - ja LABEL(nibble_ashr_7_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_8 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 > - */ > - .p2align 4 > -LABEL(ashr_8): > - pslldq $8, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, %xmm2 > - psubb %xmm0, %xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $8, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 8(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_8_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_8_use) > - > -LABEL(nibble_ashr_8_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $8, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_8_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $8, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, 
%xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_8_use) > - > - .p2align 4 > -LABEL(nibble_ashr_8_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $8, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $7, %ecx > - ja LABEL(nibble_ashr_8_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_9 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 > - */ > - .p2align 4 > -LABEL(ashr_9): > - pslldq $7, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, %xmm2 > - psubb %xmm0, %xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $9, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. 
> - */ > - lea 9(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_9_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_9_use) > - > -LABEL(nibble_ashr_9_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - > - palignr $9, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_9_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $9, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_9_use) > - > - .p2align 4 > -LABEL(nibble_ashr_9_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $9, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $6, %ecx > - ja LABEL(nibble_ashr_9_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_10 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 > - */ > - .p2align 4 > -LABEL(ashr_10): > - pslldq $6, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, 
%xmm2 > - psubb %xmm0, %xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $10, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 10(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_10_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_10_use) > - > -LABEL(nibble_ashr_10_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $10, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_10_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $10, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_10_use) > - > - .p2align 4 > -LABEL(nibble_ashr_10_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $10, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $5, %ecx > - ja LABEL(nibble_ashr_10_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_11 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 > - */ > - .p2align 4 > -LABEL(ashr_11): > - pslldq $5, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, %xmm2 > - psubb %xmm0, %xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $11, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 11(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_11_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_11_use) > - > -LABEL(nibble_ashr_11_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $11, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_11_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $11, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa 
(%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_11_use) > - > - .p2align 4 > -LABEL(nibble_ashr_11_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $11, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $4, %ecx > - ja LABEL(nibble_ashr_11_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_12 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 > - */ > - .p2align 4 > -LABEL(ashr_12): > - pslldq $4, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, %xmm2 > - psubb %xmm0, %xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $12, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. 
> - */ > - lea 12(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_12_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_12_use) > - > -LABEL(nibble_ashr_12_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $12, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_12_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $12, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_12_use) > - > - .p2align 4 > -LABEL(nibble_ashr_12_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $12, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $3, %ecx > - ja LABEL(nibble_ashr_12_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_13 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 > - */ > - .p2align 4 > -LABEL(ashr_13): > - pslldq $3, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb 
%xmm1, %xmm2 > - psubb %xmm0, %xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $13, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 13(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_13_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_13_use) > - > -LABEL(nibble_ashr_13_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $13, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_13_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $13, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_13_use) > - > - .p2align 4 > -LABEL(nibble_ashr_13_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $13, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP 
|| defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $2, %ecx > - ja LABEL(nibble_ashr_13_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_14 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 > - */ > - .p2align 4 > -LABEL(ashr_14): > - pslldq $2, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, %xmm2 > - psubb %xmm0, %xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $14, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. > - */ > - lea 14(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_14_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_14_use) > - > -LABEL(nibble_ashr_14_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $14, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_14_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $14, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > 
-#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_14_use) > - > - .p2align 4 > -LABEL(nibble_ashr_14_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $14, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $1, %ecx > - ja LABEL(nibble_ashr_14_restart_use) > - > - jmp LABEL(nibble_ashr_exit_use) > - > -/* > - * The following cases will be handled by ashr_15 > - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > - * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 > - */ > - .p2align 4 > -LABEL(ashr_15): > - pslldq $1, %xmm2 > - TOLOWER (%xmm1, %xmm2) > - pcmpeqb %xmm1, %xmm2 > - psubb %xmm0, %xmm2 > - pmovmskb %xmm2, %r9d > - shr %cl, %edx > - shr %cl, %r9d > - sub %r9d, %edx > - jnz LABEL(less32bytes) > - > - movdqa (%rdi), %xmm3 > - > - UPDATE_STRNCMP_COUNTER > - > - mov $16, %rcx /* index for loads */ > - mov $15, %r9d /* byte position left over from less32bytes case */ > - /* > - * Setup %r10 value allows us to detect crossing a page boundary. > - * When %r10 goes positive we have crossed a page boundary and > - * need to do a nibble. 
> - */ > - lea 15(%rdi), %r10 > - and $0xfff, %r10 /* offset into 4K page */ > - > - sub $0x1000, %r10 /* subtract 4K pagesize */ > - > - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > - > - .p2align 4 > -LABEL(loop_ashr_15_use): > - add $16, %r10 > - jg LABEL(nibble_ashr_15_use) > - > -LABEL(nibble_ashr_15_restart_use): > - movdqa (%rdi, %rdx), %xmm0 > - palignr $15, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - > - add $16, %rdx > - add $16, %r10 > - jg LABEL(nibble_ashr_15_use) > - > - movdqa (%rdi, %rdx), %xmm0 > - palignr $15, -16(%rdi, %rdx), %xmm0 > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - jbe LABEL(exit_use) > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub $16, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add $16, %rdx > - jmp LABEL(loop_ashr_15_use) > - > - .p2align 4 > -LABEL(nibble_ashr_15_use): > - sub $0x1000, %r10 > - movdqa -16(%rdi, %rdx), %xmm0 > - psrldq $15, %xmm0 > - pcmpistri $0x3a,%xmm0, %xmm0 > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - cmp %r11, %rcx > - jae LABEL(nibble_ashr_exit_use) > -#endif > - cmp $0, %ecx > - ja LABEL(nibble_ashr_15_restart_use) > - > -LABEL(nibble_ashr_exit_use): > -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > -#else > - movdqa (%rsi,%rdx), %xmm1 > - TOLOWER (%xmm0, %xmm1) > - pcmpistri $0x1a, %xmm1, %xmm0 > -#endif > - .p2align 4 > -LABEL(exit_use): > - jnc LABEL(strcmp_exitz) > -#if 
defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub %rcx, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - add %rcx, %rdx > - lea -16(%rdi, %r9), %rdi > - movzbl (%rdi, %rdx), %eax > - movzbl (%rsi, %rdx), %edx > - test %r8d, %r8d > - jz LABEL(ret_use) > - xchg %eax, %edx > -LABEL(ret_use): > -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx > - movl (%rcx,%rdx,4), %edx > - movl (%rcx,%rax,4), %eax > -#endif > - > - sub %edx, %eax > - ret > - > -LABEL(less32bytes): > - lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ > - lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ > - test %r8d, %r8d > - jz LABEL(ret) > - xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ > - > - .p2align 4 > -LABEL(ret): > -LABEL(less16bytes): > - bsf %rdx, %rdx /* find and store bit index in %rdx */ > - > -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > - sub %rdx, %r11 > - jbe LABEL(strcmp_exitz) > -#endif > - movzbl (%rsi, %rdx), %ecx > - movzbl (%rdi, %rdx), %eax > - > -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx > - movl (%rdx,%rcx,4), %ecx > - movl (%rdx,%rax,4), %eax > -#endif > - > - sub %ecx, %eax > - ret > - > -LABEL(strcmp_exitz): > - xor %eax, %eax > - ret > - > - .p2align 4 > - // XXX Same as code above > -LABEL(Byte0): > - movzbl (%rsi), %ecx > - movzbl (%rdi), %eax > - > -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx > - movl (%rdx,%rcx,4), %ecx > - movl (%rdx,%rax,4), %eax > -#endif > - > - sub %ecx, %eax > - ret > - cfi_endproc > - .size STRCMP_SSE42, .-STRCMP_SSE42 > - > -#undef UCLOW_reg > -#undef UCHIGH_reg > -#undef LCQWORD_reg > -#undef TOLOWER > - > - /* Put all SSE 4.2 functions together. 
*/ > - .section .rodata.SECTION,"a",@progbits > - .p2align 3 > -LABEL(unaligned_table): > - .int LABEL(ashr_1) - LABEL(unaligned_table) > - .int LABEL(ashr_2) - LABEL(unaligned_table) > - .int LABEL(ashr_3) - LABEL(unaligned_table) > - .int LABEL(ashr_4) - LABEL(unaligned_table) > - .int LABEL(ashr_5) - LABEL(unaligned_table) > - .int LABEL(ashr_6) - LABEL(unaligned_table) > - .int LABEL(ashr_7) - LABEL(unaligned_table) > - .int LABEL(ashr_8) - LABEL(unaligned_table) > - .int LABEL(ashr_9) - LABEL(unaligned_table) > - .int LABEL(ashr_10) - LABEL(unaligned_table) > - .int LABEL(ashr_11) - LABEL(unaligned_table) > - .int LABEL(ashr_12) - LABEL(unaligned_table) > - .int LABEL(ashr_13) - LABEL(unaligned_table) > - .int LABEL(ashr_14) - LABEL(unaligned_table) > - .int LABEL(ashr_15) - LABEL(unaligned_table) > - .int LABEL(ashr_0) - LABEL(unaligned_table) > - > -#undef LABEL > -#undef GLABEL > -#undef SECTION > -#undef movdqa > -#undef movdqu > -#undef pmovmskb > -#undef pcmpistri > -#undef psubb > -#undef pcmpeqb > -#undef psrldq > -#undef pslldq > -#undef palignr > -#undef pxor > -#undef D > diff --git a/sysdeps/x86_64/multiarch/strcmp-sse4_2.S b/sysdeps/x86_64/multiarch/strcmp-sse4_2.S > index 2c916bafa0..963e208ccb 100644 > --- a/sysdeps/x86_64/multiarch/strcmp-sse4_2.S > +++ b/sysdeps/x86_64/multiarch/strcmp-sse4_2.S > @@ -17,5 +17,1766 @@ > <https://www.gnu.org/licenses/>. */ > > #if IS_IN (libc) > -# include "strcmp-sse42.S" > +# include <sysdep.h> > + > +# define STRCMP_ISA _sse42 > +# include "strcmp-naming.h" > + > +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > +# include "locale-defines.h" > +# endif > + > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz > + if the new counter > the old one or is 0. 
*/ > +# define UPDATE_STRNCMP_COUNTER \ > + /* calculate left number to compare */ \ > + lea -16(%rcx, %r11), %r9; \ > + cmp %r9, %r11; \ > + jb LABEL(strcmp_exitz); \ > + test %r9, %r9; \ > + je LABEL(strcmp_exitz); \ > + mov %r9, %r11 > +# else > +# define UPDATE_STRNCMP_COUNTER > +# endif > + > +# define SECTION sse4.2 > + > +# define LABEL(l) .L##l > + > +/* We use 0x1a: > + _SIDD_SBYTE_OPS > + | _SIDD_CMP_EQUAL_EACH > + | _SIDD_NEGATIVE_POLARITY > + | _SIDD_LEAST_SIGNIFICANT > + on pcmpistri to find out if two 16byte data elements are the same > + and the offset of the first different byte. There are 4 cases: > + > + 1. Both 16byte data elements are valid and identical. > + 2. Both 16byte data elements have EOS and are identical. > + 3. Both 16byte data elements are valid and they differ at offset X. > + 4. At least one 16byte data element has EOS at offset X. Two 16byte > + data elements must differ at or before offset X. > + > + Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: > + > + case ECX CFlag ZFlag SFlag > + 1 16 0 0 0 > + 2 16 0 1 1 > + 3 X 1 0 0 > + 4 0 <= X 1 0/1 0/1 > + > + We exit from the loop for cases 2, 3 and 4 with jbe which branches > + when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for > + case 2. */ > + > + /* Put all SSE 4.2 functions together. */ > + .section .text.SECTION,"ax",@progbits > + .align 16 > + .type STRCMP, @function > + .globl STRCMP > +# ifdef USE_AS_STRCASECMP_L > +ENTRY (STRCASECMP) > + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax > + mov %fs:(%rax),%RDX_LP > + > + /* Either 1 or 5 bytes (depending on whether CET is enabled). */ > + .p2align 4 > +END (STRCASECMP) > + /* FALLTHROUGH to strcasecmp_l. */ > +# endif > +# ifdef USE_AS_STRNCASECMP_L > +ENTRY (STRCASECMP) > + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax > + mov %fs:(%rax),%RCX_LP > + > + /* Either 1 or 5 bytes (depending on whether CET is enabled). */ > + .p2align 4 > +END (STRCASECMP) > + /* FALLTHROUGH to strncasecmp_l. 
*/ > +# endif > + > + > +# define arg arg > + > +STRCMP: > + cfi_startproc > + _CET_ENDBR > + CALL_MCOUNT > + > +/* > + * This implementation uses SSE to compare up to 16 bytes at a time. > + */ > +# ifdef USE_AS_STRCASECMP_L > + /* We have to fall back on the C implementation for locales > + with encodings not matching ASCII for single bytes. */ > +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 > + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP > +# else > + mov (%rdx), %RAX_LP > +# endif > + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) > + jne __strcasecmp_l_nonascii > +# endif > +# ifdef USE_AS_STRNCASECMP_L > + /* We have to fall back on the C implementation for locales > + with encodings not matching ASCII for single bytes. */ > +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 > + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP > +# else > + mov (%rcx), %RAX_LP > +# endif > + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) > + jne __strncasecmp_l_nonascii > +# endif > + > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + test %RDX_LP, %RDX_LP > + je LABEL(strcmp_exitz) > + cmp $1, %RDX_LP > + je LABEL(Byte0) > + mov %RDX_LP, %R11_LP > +# endif > + mov %esi, %ecx > + mov %edi, %eax > +/* Use 64bit AND here to avoid long NOP padding. 
*/ > + and $0x3f, %rcx /* rsi alignment in cache line */ > + and $0x3f, %rax /* rdi alignment in cache line */ > +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > + .section .rodata.cst16,"aM",@progbits,16 > + .align 16 > +LABEL(lcase_min): > + .quad 0x3f3f3f3f3f3f3f3f > + .quad 0x3f3f3f3f3f3f3f3f > +LABEL(lcase_max): > + .quad 0x9999999999999999 > + .quad 0x9999999999999999 > +LABEL(case_add): > + .quad 0x2020202020202020 > + .quad 0x2020202020202020 > + .previous > + movdqa LABEL(lcase_min)(%rip), %xmm4 > +# define LCASE_MIN_reg %xmm4 > + movdqa LABEL(lcase_max)(%rip), %xmm5 > +# define LCASE_MAX_reg %xmm5 > + movdqa LABEL(case_add)(%rip), %xmm6 > +# define CASE_ADD_reg %xmm6 > +# endif > + cmp $0x30, %ecx > + ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ > + cmp $0x30, %eax > + ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ > + movdqu (%rdi), %xmm1 > + movdqu (%rsi), %xmm2 > +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > +# define TOLOWER(reg1, reg2) \ > + movdqa LCASE_MIN_reg, %xmm7; \ > + movdqa LCASE_MIN_reg, %xmm8; \ > + paddb reg1, %xmm7; \ > + paddb reg2, %xmm8; \ > + pcmpgtb LCASE_MAX_reg, %xmm7; \ > + pcmpgtb LCASE_MAX_reg, %xmm8; \ > + pandn CASE_ADD_reg, %xmm7; \ > + pandn CASE_ADD_reg, %xmm8; \ > + paddb %xmm7, reg1; \ > + paddb %xmm8, reg2 > + > + TOLOWER (%xmm1, %xmm2) > +# else > +# define TOLOWER(reg1, reg2) > +# endif > + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ > + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ > + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ > + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ > + pmovmskb %xmm1, %edx > + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ > + jnz LABEL(less16bytes)/* If not, find different value or null char */ > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz)/* finish comparison */ > +# endif > + add $16, %rsi /* prepare to search next 16 bytes */ > + add $16, %rdi /* prepare to search next 16 bytes */ > + > + /* > + * Determine source and destination string offsets from 16-byte > + * alignment. Use relative offset difference between the two to > + * determine which case below to use. > + */ > + .p2align 4 > +LABEL(crosscache): > + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ > + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ > + mov $0xffff, %edx /* for equivalent offset */ > + xor %r8d, %r8d > + and $0xf, %ecx /* offset of rsi */ > + and $0xf, %eax /* offset of rdi */ > + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ > + cmp %eax, %ecx > + je LABEL(ashr_0) /* rsi and rdi relative offset same */ > + ja LABEL(bigger) > + mov %edx, %r8d /* r8d is offset flag for exit tail */ > + xchg %ecx, %eax > + xchg %rsi, %rdi > +LABEL(bigger): > + movdqa (%rdi), %xmm2 > + movdqa (%rsi), %xmm1 > + lea 15(%rax), %r9 > + sub %rcx, %r9 > + lea LABEL(unaligned_table)(%rip), %r10 > + movslq (%r10, %r9,4), %r9 > + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ > + lea (%r10, %r9), %r10 > + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ > + > +/* > + * The following cases will be handled by ashr_0 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 > + */ > + .p2align 4 > +LABEL(ashr_0): > + > + movdqa (%rsi), %xmm1 > + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ > +# else > + movdqa (%rdi), %xmm2 > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ > +# endif > + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ > + pmovmskb %xmm1, %r9d > + shr %cl, %edx /* adjust 0xffff for offset */ > + shr %cl, %r9d /* adjust for 16-byte offset */ > + sub %r9d, %edx > + /* > + * edx must be the same with r9d if in left byte (16-rcx) is equal to > + * the start from (16-rax) and no null char was seen. > + */ > + jne LABEL(less32bytes) /* mismatch or null char */ > + UPDATE_STRNCMP_COUNTER > + mov $16, %rcx > + mov $16, %r9 > + > + /* > + * Now both strings are aligned at 16-byte boundary. Loop over strings > + * checking 32-bytes per iteration. > + */ > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + .p2align 4 > +LABEL(ashr_0_use): > + movdqa (%rdi,%rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + lea 16(%rdx), %rdx > + jbe LABEL(ashr_0_exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + movdqa (%rdi,%rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + lea 16(%rdx), %rdx > + jbe LABEL(ashr_0_exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + jmp LABEL(ashr_0_use) > + > + > + .p2align 4 > +LABEL(ashr_0_exit_use): > + jnc LABEL(strcmp_exitz) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub 
%rcx, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + lea -16(%rdx, %rcx), %rcx > + movzbl (%rdi, %rcx), %eax > + movzbl (%rsi, %rcx), %edx > +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx > + movl (%rcx,%rax,4), %eax > + movl (%rcx,%rdx,4), %edx > +# endif > + sub %edx, %eax > + ret > + > + > + > +/* > + * The following cases will be handled by ashr_1 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(15) n -15 0(15 +(n-15) - n) ashr_1 > + */ > + .p2align 4 > +LABEL(ashr_1): > + pslldq $15, %xmm2 /* shift first string to align with second */ > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ > + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ > + pmovmskb %xmm2, %r9d > + shr %cl, %edx /* adjust 0xffff for offset */ > + shr %cl, %r9d /* adjust for 16-byte offset */ > + sub %r9d, %edx > + jnz LABEL(less32bytes) /* mismatch or null char seen */ > + movdqa (%rdi), %xmm3 > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads*/ > + mov $1, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. 
> + */ > + lea 1(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_1_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_1_use) > + > +LABEL(nibble_ashr_1_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $1, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_1_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $1, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_1_use) > + > + .p2align 4 > +LABEL(nibble_ashr_1_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $1, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $14, %ecx > + ja LABEL(nibble_ashr_1_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_2 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 > + */ > + .p2align 4 > +LABEL(ashr_2): > + pslldq $14, %xmm2 > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb 
%xmm1, %xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + movdqa (%rdi), %xmm3 > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $2, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. > + */ > + lea 2(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_2_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_2_use) > + > +LABEL(nibble_ashr_2_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $2, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_2_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $2, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_2_use) > + > + .p2align 4 > +LABEL(nibble_ashr_2_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $2, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $13, %ecx > + ja LABEL(nibble_ashr_2_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_3 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 > + */ > + .p2align 4 > +LABEL(ashr_3): > + pslldq $13, %xmm2 > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb %xmm1, %xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + movdqa (%rdi), %xmm3 > + > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $3, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. > + */ > + lea 3(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > +LABEL(loop_ashr_3_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_3_use) > + > +LABEL(nibble_ashr_3_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $3, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_3_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $3, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + 
TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_3_use) > + > + .p2align 4 > +LABEL(nibble_ashr_3_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $3, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $12, %ecx > + ja LABEL(nibble_ashr_3_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_4 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 > + */ > + .p2align 4 > +LABEL(ashr_4): > + pslldq $12, %xmm2 > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb %xmm1, %xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + movdqa (%rdi), %xmm3 > + > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $4, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. 
> + */ > + lea 4(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_4_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_4_use) > + > +LABEL(nibble_ashr_4_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $4, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_4_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $4, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_4_use) > + > + .p2align 4 > +LABEL(nibble_ashr_4_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $4, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $11, %ecx > + ja LABEL(nibble_ashr_4_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_5 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 > + */ > + .p2align 4 > +LABEL(ashr_5): > + pslldq $11, %xmm2 > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb 
%xmm1, %xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + movdqa (%rdi), %xmm3 > + > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $5, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. > + */ > + lea 5(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_5_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_5_use) > + > +LABEL(nibble_ashr_5_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $5, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_5_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + > + palignr $5, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_5_use) > + > + .p2align 4 > +LABEL(nibble_ashr_5_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $5, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined USE_AS_STRNCMP || 
defined USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $10, %ecx > + ja LABEL(nibble_ashr_5_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_6 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 > + */ > + .p2align 4 > +LABEL(ashr_6): > + pslldq $10, %xmm2 > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb %xmm1, %xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + movdqa (%rdi), %xmm3 > + > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $6, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. > + */ > + lea 6(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_6_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_6_use) > + > +LABEL(nibble_ashr_6_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $6, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_6_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $6, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + 
movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_6_use) > + > + .p2align 4 > +LABEL(nibble_ashr_6_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $6, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $9, %ecx > + ja LABEL(nibble_ashr_6_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_7 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 > + */ > + .p2align 4 > +LABEL(ashr_7): > + pslldq $9, %xmm2 > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb %xmm1, %xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + movdqa (%rdi), %xmm3 > + > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $7, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. 
> + */ > + lea 7(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_7_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_7_use) > + > +LABEL(nibble_ashr_7_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $7, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_7_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $7, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_7_use) > + > + .p2align 4 > +LABEL(nibble_ashr_7_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $7, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $8, %ecx > + ja LABEL(nibble_ashr_7_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_8 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 > + */ > + .p2align 4 > +LABEL(ashr_8): > + pslldq $8, %xmm2 > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb %xmm1, 
%xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + movdqa (%rdi), %xmm3 > + > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $8, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. > + */ > + lea 8(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_8_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_8_use) > + > +LABEL(nibble_ashr_8_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $8, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_8_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $8, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_8_use) > + > + .p2align 4 > +LABEL(nibble_ashr_8_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $8, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $7, %ecx > + ja LABEL(nibble_ashr_8_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_9 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 > + */ > + .p2align 4 > +LABEL(ashr_9): > + pslldq $7, %xmm2 > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb %xmm1, %xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + movdqa (%rdi), %xmm3 > + > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $9, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. > + */ > + lea 9(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_9_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_9_use) > + > +LABEL(nibble_ashr_9_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + > + palignr $9, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_9_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $9, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa 
(%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_9_use) > + > + .p2align 4 > +LABEL(nibble_ashr_9_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $9, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $6, %ecx > + ja LABEL(nibble_ashr_9_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_10 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 > + */ > + .p2align 4 > +LABEL(ashr_10): > + pslldq $6, %xmm2 > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb %xmm1, %xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + movdqa (%rdi), %xmm3 > + > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $10, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. 
> + */ > + lea 10(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_10_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_10_use) > + > +LABEL(nibble_ashr_10_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $10, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_10_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $10, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_10_use) > + > + .p2align 4 > +LABEL(nibble_ashr_10_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $10, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $5, %ecx > + ja LABEL(nibble_ashr_10_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_11 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 > + */ > + .p2align 4 > +LABEL(ashr_11): > + pslldq $5, %xmm2 > + TOLOWER (%xmm1, %xmm2) > 
+ pcmpeqb %xmm1, %xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + movdqa (%rdi), %xmm3 > + > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $11, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. > + */ > + lea 11(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_11_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_11_use) > + > +LABEL(nibble_ashr_11_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $11, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_11_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $11, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_11_use) > + > + .p2align 4 > +LABEL(nibble_ashr_11_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $11, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined 
USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $4, %ecx > + ja LABEL(nibble_ashr_11_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_12 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 > + */ > + .p2align 4 > +LABEL(ashr_12): > + pslldq $4, %xmm2 > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb %xmm1, %xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + movdqa (%rdi), %xmm3 > + > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $12, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. > + */ > + lea 12(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_12_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_12_use) > + > +LABEL(nibble_ashr_12_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $12, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_12_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $12, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, 
(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_12_use) > + > + .p2align 4 > +LABEL(nibble_ashr_12_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $12, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $3, %ecx > + ja LABEL(nibble_ashr_12_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_13 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 > + */ > + .p2align 4 > +LABEL(ashr_13): > + pslldq $3, %xmm2 > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb %xmm1, %xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + movdqa (%rdi), %xmm3 > + > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $13, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. 
> + */ > + lea 13(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_13_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_13_use) > + > +LABEL(nibble_ashr_13_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $13, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_13_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $13, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_13_use) > + > + .p2align 4 > +LABEL(nibble_ashr_13_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $13, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $2, %ecx > + ja LABEL(nibble_ashr_13_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_14 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 > + */ > + .p2align 4 > +LABEL(ashr_14): > + pslldq $2, %xmm2 > + TOLOWER (%xmm1, 
%xmm2) > + pcmpeqb %xmm1, %xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + movdqa (%rdi), %xmm3 > + > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $14, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. > + */ > + lea 14(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_14_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_14_use) > + > +LABEL(nibble_ashr_14_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $14, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_14_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $14, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_14_use) > + > + .p2align 4 > +LABEL(nibble_ashr_14_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $14, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > 
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $1, %ecx > + ja LABEL(nibble_ashr_14_restart_use) > + > + jmp LABEL(nibble_ashr_exit_use) > + > +/* > + * The following cases will be handled by ashr_15 > + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case > + * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 > + */ > + .p2align 4 > +LABEL(ashr_15): > + pslldq $1, %xmm2 > + TOLOWER (%xmm1, %xmm2) > + pcmpeqb %xmm1, %xmm2 > + psubb %xmm0, %xmm2 > + pmovmskb %xmm2, %r9d > + shr %cl, %edx > + shr %cl, %r9d > + sub %r9d, %edx > + jnz LABEL(less32bytes) > + > + movdqa (%rdi), %xmm3 > + > + UPDATE_STRNCMP_COUNTER > + > + mov $16, %rcx /* index for loads */ > + mov $15, %r9d /* byte position left over from less32bytes case */ > + /* > + * Setup %r10 value allows us to detect crossing a page boundary. > + * When %r10 goes positive we have crossed a page boundary and > + * need to do a nibble. 
> + */ > + lea 15(%rdi), %r10 > + and $0xfff, %r10 /* offset into 4K page */ > + > + sub $0x1000, %r10 /* subtract 4K pagesize */ > + > + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ > + > + .p2align 4 > +LABEL(loop_ashr_15_use): > + add $16, %r10 > + jg LABEL(nibble_ashr_15_use) > + > +LABEL(nibble_ashr_15_restart_use): > + movdqa (%rdi, %rdx), %xmm0 > + palignr $15, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + > + add $16, %rdx > + add $16, %r10 > + jg LABEL(nibble_ashr_15_use) > + > + movdqa (%rdi, %rdx), %xmm0 > + palignr $15, -16(%rdi, %rdx), %xmm0 > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + jbe LABEL(exit_use) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub $16, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add $16, %rdx > + jmp LABEL(loop_ashr_15_use) > + > + .p2align 4 > +LABEL(nibble_ashr_15_use): > + sub $0x1000, %r10 > + movdqa -16(%rdi, %rdx), %xmm0 > + psrldq $15, %xmm0 > + pcmpistri $0x3a,%xmm0, %xmm0 > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + cmp %r11, %rcx > + jae LABEL(nibble_ashr_exit_use) > +# endif > + cmp $0, %ecx > + ja LABEL(nibble_ashr_15_restart_use) > + > +LABEL(nibble_ashr_exit_use): > +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L > + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 > +# else > + movdqa (%rsi,%rdx), %xmm1 > + TOLOWER (%xmm0, %xmm1) > + pcmpistri $0x1a, %xmm1, %xmm0 > +# endif > + .p2align 4 > +LABEL(exit_use): > + jnc 
LABEL(strcmp_exitz) > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub %rcx, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + add %rcx, %rdx > + lea -16(%rdi, %r9), %rdi > + movzbl (%rdi, %rdx), %eax > + movzbl (%rsi, %rdx), %edx > + test %r8d, %r8d > + jz LABEL(ret_use) > + xchg %eax, %edx > +LABEL(ret_use): > +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx > + movl (%rcx,%rdx,4), %edx > + movl (%rcx,%rax,4), %eax > +# endif > + > + sub %edx, %eax > + ret > + > +LABEL(less32bytes): > + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ > + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ > + test %r8d, %r8d > + jz LABEL(ret) > + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ > + > + .p2align 4 > +LABEL(ret): > +LABEL(less16bytes): > + bsf %rdx, %rdx /* find and store bit index in %rdx */ > + > +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L > + sub %rdx, %r11 > + jbe LABEL(strcmp_exitz) > +# endif > + movzbl (%rsi, %rdx), %ecx > + movzbl (%rdi, %rdx), %eax > + > +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx > + movl (%rdx,%rcx,4), %ecx > + movl (%rdx,%rax,4), %eax > +# endif > + > + sub %ecx, %eax > + ret > + > +LABEL(strcmp_exitz): > + xor %eax, %eax > + ret > + > + .p2align 4 > + // XXX Same as code above > +LABEL(Byte0): > + movzbl (%rsi), %ecx > + movzbl (%rdi), %eax > + > +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx > + movl (%rdx,%rcx,4), %ecx > + movl (%rdx,%rax,4), %eax > +# endif > + > + sub %ecx, %eax > + ret > + cfi_endproc > + .size STRCMP, .-STRCMP > + > +# undef UCLOW_reg > +# undef UCHIGH_reg > +# undef LCQWORD_reg > +# undef TOLOWER > + > + /* Put all SSE 4.2 functions together. 
*/ > + .section .rodata.SECTION,"a",@progbits > + .p2align 3 > +LABEL(unaligned_table): > + .int LABEL(ashr_1) - LABEL(unaligned_table) > + .int LABEL(ashr_2) - LABEL(unaligned_table) > + .int LABEL(ashr_3) - LABEL(unaligned_table) > + .int LABEL(ashr_4) - LABEL(unaligned_table) > + .int LABEL(ashr_5) - LABEL(unaligned_table) > + .int LABEL(ashr_6) - LABEL(unaligned_table) > + .int LABEL(ashr_7) - LABEL(unaligned_table) > + .int LABEL(ashr_8) - LABEL(unaligned_table) > + .int LABEL(ashr_9) - LABEL(unaligned_table) > + .int LABEL(ashr_10) - LABEL(unaligned_table) > + .int LABEL(ashr_11) - LABEL(unaligned_table) > + .int LABEL(ashr_12) - LABEL(unaligned_table) > + .int LABEL(ashr_13) - LABEL(unaligned_table) > + .int LABEL(ashr_14) - LABEL(unaligned_table) > + .int LABEL(ashr_15) - LABEL(unaligned_table) > + .int LABEL(ashr_0) - LABEL(unaligned_table) > + > +# undef LABEL > +# undef GLABEL > +# undef SECTION > +# undef movdqa > +# undef movdqu > +# undef pmovmskb > +# undef pcmpistri > +# undef psubb > +# undef pcmpeqb > +# undef psrldq > +# undef pslldq > +# undef palignr > +# undef pxor > +# undef D > #endif > diff --git a/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S b/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S > index 08e23548c3..1ce5c4e93f 100644 > --- a/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S > +++ b/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S > @@ -16,6 +16,5 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#define STRCMP_SSE42 __strncasecmp_l_sse42 > #define USE_AS_STRNCASECMP_L > -#include "strcmp-sse42.S" > +#include "strcmp-sse4_2.S" > diff --git a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S > index 310a6dbe77..2a02f0c2a6 100644 > --- a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S > +++ b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S > @@ -16,8 +16,5 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. 
*/ > > -#if IS_IN (libc) > -# define STRCMP_SSE42 __strncmp_sse42 > -# define USE_AS_STRNCMP > -# include "strcmp-sse42.S" > -#endif > +#define USE_AS_STRNCMP > +#include "strcmp-sse4_2.S" > -- > 2.34.1 > LGTM. Thanks.
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S index 411ab7d283..ac03b95756 100644 --- a/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S @@ -16,6 +16,5 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#define STRCMP_SSE42 __strcasecmp_l_sse42 #define USE_AS_STRCASECMP_L -#include "strcmp-sse42.S" +#include "strcmp-sse4_2.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S deleted file mode 100644 index 60313c647a..0000000000 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S +++ /dev/null @@ -1,1782 +0,0 @@ -/* strcmp with SSE4.2 - Copyright (C) 2009-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#ifndef STRCMP_SSE42 -# define STRCMP_SSE42 __strcmp_sse42 -#endif - -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L -# include "locale-defines.h" -#endif - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L -/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz - if the new counter > the old one or is 0. 
*/ -# define UPDATE_STRNCMP_COUNTER \ - /* calculate left number to compare */ \ - lea -16(%rcx, %r11), %r9; \ - cmp %r9, %r11; \ - jb LABEL(strcmp_exitz); \ - test %r9, %r9; \ - je LABEL(strcmp_exitz); \ - mov %r9, %r11 -#else -# define UPDATE_STRNCMP_COUNTER -#endif - -#define SECTION sse4.2 -#define GLABEL(l) l##_sse42 - -#define LABEL(l) .L##l - -/* We use 0x1a: - _SIDD_SBYTE_OPS - | _SIDD_CMP_EQUAL_EACH - | _SIDD_NEGATIVE_POLARITY - | _SIDD_LEAST_SIGNIFICANT - on pcmpistri to find out if two 16byte data elements are the same - and the offset of the first different byte. There are 4 cases: - - 1. Both 16byte data elements are valid and identical. - 2. Both 16byte data elements have EOS and identical. - 3. Both 16byte data elements are valid and they differ at offset X. - 4. At least one 16byte data element has EOS at offset X. Two 16byte - data elements must differ at or before offset X. - - Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: - - case ECX CFlag ZFlag SFlag - 1 16 0 0 0 - 2 16 0 1 1 - 3 X 1 0 0 - 4 0 <= X 1 0/1 0/1 - - We exit from the loop for cases 2, 3 and 4 with jbe which branches - when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for - case 2. */ - - /* Put all SSE 4.2 functions together. */ - .section .text.SECTION,"ax",@progbits - .align 16 - .type STRCMP_SSE42, @function - .globl STRCMP_SSE42 -#ifdef USE_AS_STRCASECMP_L -ENTRY (GLABEL(__strcasecmp)) - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax - mov %fs:(%rax),%RDX_LP - - /* Either 1 or 5 bytes (dependeing if CET is enabled). */ - .p2align 4 -END (GLABEL(__strcasecmp)) - /* FALLTHROUGH to strcasecmp_l. */ -#endif -#ifdef USE_AS_STRNCASECMP_L -ENTRY (GLABEL(__strncasecmp)) - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax - mov %fs:(%rax),%RCX_LP - - /* Either 1 or 5 bytes (dependeing if CET is enabled). */ - .p2align 4 -END (GLABEL(__strncasecmp)) - /* FALLTHROUGH to strncasecmp_l. 
*/ -#endif - - -#define arg arg - -STRCMP_SSE42: - cfi_startproc - _CET_ENDBR - CALL_MCOUNT - -/* - * This implementation uses SSE to compare up to 16 bytes at a time. - */ -#ifdef USE_AS_STRCASECMP_L - /* We have to fall back on the C implementation for locales - with encodings not matching ASCII for single bytes. */ -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP -# else - mov (%rdx), %RAX_LP -# endif - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) - jne __strcasecmp_l_nonascii -#endif -#ifdef USE_AS_STRNCASECMP_L - /* We have to fall back on the C implementation for locales - with encodings not matching ASCII for single bytes. */ -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP -# else - mov (%rcx), %RAX_LP -# endif - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) - jne __strncasecmp_l_nonascii -#endif - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - test %RDX_LP, %RDX_LP - je LABEL(strcmp_exitz) - cmp $1, %RDX_LP - je LABEL(Byte0) - mov %RDX_LP, %R11_LP -#endif - mov %esi, %ecx - mov %edi, %eax -/* Use 64bit AND here to avoid long NOP padding. 
*/ - and $0x3f, %rcx /* rsi alignment in cache line */ - and $0x3f, %rax /* rdi alignment in cache line */ -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L - .section .rodata.cst16,"aM",@progbits,16 - .align 16 -LABEL(lcase_min): - .quad 0x3f3f3f3f3f3f3f3f - .quad 0x3f3f3f3f3f3f3f3f -LABEL(lcase_max): - .quad 0x9999999999999999 - .quad 0x9999999999999999 -LABEL(case_add): - .quad 0x2020202020202020 - .quad 0x2020202020202020 - .previous - movdqa LABEL(lcase_min)(%rip), %xmm4 -# define LCASE_MIN_reg %xmm4 - movdqa LABEL(lcase_max)(%rip), %xmm5 -# define LCASE_MAX_reg %xmm5 - movdqa LABEL(case_add)(%rip), %xmm6 -# define CASE_ADD_reg %xmm6 -#endif - cmp $0x30, %ecx - ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ - cmp $0x30, %eax - ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ - movdqu (%rdi), %xmm1 - movdqu (%rsi), %xmm2 -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L -# define TOLOWER(reg1, reg2) \ - movdqa LCASE_MIN_reg, %xmm7; \ - movdqa LCASE_MIN_reg, %xmm8; \ - paddb reg1, %xmm7; \ - paddb reg2, %xmm8; \ - pcmpgtb LCASE_MAX_reg, %xmm7; \ - pcmpgtb LCASE_MAX_reg, %xmm8; \ - pandn CASE_ADD_reg, %xmm7; \ - pandn CASE_ADD_reg, %xmm8; \ - paddb %xmm7, reg1; \ - paddb %xmm8, reg2 - - TOLOWER (%xmm1, %xmm2) -#else -# define TOLOWER(reg1, reg2) -#endif - pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ - pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ - pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ - psubb %xmm0, %xmm1 /* packed sub of comparison results*/ - pmovmskb %xmm1, %edx - sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ - jnz LABEL(less16bytes)/* If not, find different value or null char */ -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz)/* finish comparison */ -#endif - add $16, %rsi /* prepare to search next 16 bytes */ - add $16, %rdi /* prepare to search next 16 bytes */ - - /* - * Determine source and destination string offsets from 16-byte - * alignment. Use relative offset difference between the two to - * determine which case below to use. - */ - .p2align 4 -LABEL(crosscache): - and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ - and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ - mov $0xffff, %edx /* for equivalent offset */ - xor %r8d, %r8d - and $0xf, %ecx /* offset of rsi */ - and $0xf, %eax /* offset of rdi */ - pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ - cmp %eax, %ecx - je LABEL(ashr_0) /* rsi and rdi relative offset same */ - ja LABEL(bigger) - mov %edx, %r8d /* r8d is offset flag for exit tail */ - xchg %ecx, %eax - xchg %rsi, %rdi -LABEL(bigger): - movdqa (%rdi), %xmm2 - movdqa (%rsi), %xmm1 - lea 15(%rax), %r9 - sub %rcx, %r9 - lea LABEL(unaligned_table)(%rip), %r10 - movslq (%r10, %r9,4), %r9 - pcmpeqb %xmm1, %xmm0 /* Any null chars? */ - lea (%r10, %r9), %r10 - _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ - -/* - * The following cases will be handled by ashr_0 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(0~15) n(0~15) 15(15+ n-n) ashr_0 - */ - .p2align 4 -LABEL(ashr_0): - - movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ -#else - movdqa (%rdi), %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ -#endif - psubb %xmm0, %xmm1 /* packed sub of comparison results*/ - pmovmskb %xmm1, %r9d - shr %cl, %edx /* adjust 0xffff for offset */ - shr %cl, %r9d /* adjust for 16-byte offset */ - sub %r9d, %edx - /* - * edx must be the same with r9d if in left byte (16-rcx) is equal to - * the start from (16-rax) and no null char was seen. - */ - jne LABEL(less32bytes) /* mismatch or null char */ - UPDATE_STRNCMP_COUNTER - mov $16, %rcx - mov $16, %r9 - - /* - * Now both strings are aligned at 16-byte boundary. Loop over strings - * checking 32-bytes per iteration. - */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - .p2align 4 -LABEL(ashr_0_use): - movdqa (%rdi,%rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - lea 16(%rdx), %rdx - jbe LABEL(ashr_0_exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - movdqa (%rdi,%rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - lea 16(%rdx), %rdx - jbe LABEL(ashr_0_exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - jmp LABEL(ashr_0_use) - - - .p2align 4 -LABEL(ashr_0_exit_use): - jnc LABEL(strcmp_exitz) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub %rcx, %r11 - jbe LABEL(strcmp_exitz) -#endif - lea -16(%rdx, %rcx), %rcx - movzbl (%rdi, %rcx), %eax - movzbl (%rsi, %rcx), %edx -#if defined 
USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx - movl (%rcx,%rax,4), %eax - movl (%rcx,%rdx,4), %edx -#endif - sub %edx, %eax - ret - - - -/* - * The following cases will be handled by ashr_1 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(15) n -15 0(15 +(n-15) - n) ashr_1 - */ - .p2align 4 -LABEL(ashr_1): - pslldq $15, %xmm2 /* shift first string to align with second */ - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ - psubb %xmm0, %xmm2 /* packed sub of comparison results*/ - pmovmskb %xmm2, %r9d - shr %cl, %edx /* adjust 0xffff for offset */ - shr %cl, %r9d /* adjust for 16-byte offset */ - sub %r9d, %edx - jnz LABEL(less32bytes) /* mismatch or null char seen */ - movdqa (%rdi), %xmm3 - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads*/ - mov $1, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 1(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_1_use): - add $16, %r10 - jg LABEL(nibble_ashr_1_use) - -LABEL(nibble_ashr_1_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $1, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_1_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $1, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_1_use) - - .p2align 4 -LABEL(nibble_ashr_1_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $1, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $14, %ecx - ja LABEL(nibble_ashr_1_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_2 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 - */ - .p2align 4 -LABEL(ashr_2): - pslldq $14, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - 
UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $2, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 2(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_2_use): - add $16, %r10 - jg LABEL(nibble_ashr_2_use) - -LABEL(nibble_ashr_2_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $2, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_2_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $2, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_2_use) - - .p2align 4 -LABEL(nibble_ashr_2_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $2, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $13, %ecx - ja LABEL(nibble_ashr_2_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_3 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * 
n(13~15) n -13 2(15 +(n-13) - n) ashr_3 - */ - .p2align 4 -LABEL(ashr_3): - pslldq $13, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $3, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 3(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - -LABEL(loop_ashr_3_use): - add $16, %r10 - jg LABEL(nibble_ashr_3_use) - -LABEL(nibble_ashr_3_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $3, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_3_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $3, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_3_use) - - .p2align 4 -LABEL(nibble_ashr_3_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $3, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, 
%rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $12, %ecx - ja LABEL(nibble_ashr_3_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_4 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 - */ - .p2align 4 -LABEL(ashr_4): - pslldq $12, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $4, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 4(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_4_use): - add $16, %r10 - jg LABEL(nibble_ashr_4_use) - -LABEL(nibble_ashr_4_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $4, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_4_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $4, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe 
LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_4_use) - - .p2align 4 -LABEL(nibble_ashr_4_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $4, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $11, %ecx - ja LABEL(nibble_ashr_4_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_5 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 - */ - .p2align 4 -LABEL(ashr_5): - pslldq $11, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $5, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 5(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_5_use): - add $16, %r10 - jg LABEL(nibble_ashr_5_use) - -LABEL(nibble_ashr_5_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $5, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_5_use) - - movdqa (%rdi, %rdx), %xmm0 - - palignr $5, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_5_use) - - .p2align 4 -LABEL(nibble_ashr_5_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $5, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $10, %ecx - ja LABEL(nibble_ashr_5_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_6 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 - */ - .p2align 4 -LABEL(ashr_6): - pslldq $10, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - 
UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $6, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 6(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_6_use): - add $16, %r10 - jg LABEL(nibble_ashr_6_use) - -LABEL(nibble_ashr_6_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $6, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_6_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $6, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_6_use) - - .p2align 4 -LABEL(nibble_ashr_6_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $6, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $9, %ecx - ja LABEL(nibble_ashr_6_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_7 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * 
n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 - */ - .p2align 4 -LABEL(ashr_7): - pslldq $9, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $7, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 7(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_7_use): - add $16, %r10 - jg LABEL(nibble_ashr_7_use) - -LABEL(nibble_ashr_7_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $7, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_7_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $7, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_7_use) - - .p2align 4 -LABEL(nibble_ashr_7_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $7, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L 
- cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $8, %ecx - ja LABEL(nibble_ashr_7_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_8 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 - */ - .p2align 4 -LABEL(ashr_8): - pslldq $8, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $8, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 8(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_8_use): - add $16, %r10 - jg LABEL(nibble_ashr_8_use) - -LABEL(nibble_ashr_8_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $8, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_8_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $8, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, 
%r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_8_use) - - .p2align 4 -LABEL(nibble_ashr_8_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $8, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $7, %ecx - ja LABEL(nibble_ashr_8_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_9 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 - */ - .p2align 4 -LABEL(ashr_9): - pslldq $7, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $9, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 9(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_9_use): - add $16, %r10 - jg LABEL(nibble_ashr_9_use) - -LABEL(nibble_ashr_9_restart_use): - movdqa (%rdi, %rdx), %xmm0 - - palignr $9, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_9_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $9, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_9_use) - - .p2align 4 -LABEL(nibble_ashr_9_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $9, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $6, %ecx - ja LABEL(nibble_ashr_9_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_10 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 - */ - .p2align 4 -LABEL(ashr_10): - pslldq $6, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - 
UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $10, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 10(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_10_use): - add $16, %r10 - jg LABEL(nibble_ashr_10_use) - -LABEL(nibble_ashr_10_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $10, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_10_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $10, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_10_use) - - .p2align 4 -LABEL(nibble_ashr_10_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $10, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $5, %ecx - ja LABEL(nibble_ashr_10_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_11 - * rcx(offset of rsi) rax(offset of rdi) relative offset 
corresponding case - * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 - */ - .p2align 4 -LABEL(ashr_11): - pslldq $5, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $11, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 11(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_11_use): - add $16, %r10 - jg LABEL(nibble_ashr_11_use) - -LABEL(nibble_ashr_11_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $11, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_11_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $11, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_11_use) - - .p2align 4 -LABEL(nibble_ashr_11_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $11, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined 
USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $4, %ecx - ja LABEL(nibble_ashr_11_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_12 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 - */ - .p2align 4 -LABEL(ashr_12): - pslldq $4, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $12, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 12(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_12_use): - add $16, %r10 - jg LABEL(nibble_ashr_12_use) - -LABEL(nibble_ashr_12_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $12, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_12_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $12, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined 
USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_12_use) - - .p2align 4 -LABEL(nibble_ashr_12_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $12, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $3, %ecx - ja LABEL(nibble_ashr_12_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_13 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 - */ - .p2align 4 -LABEL(ashr_13): - pslldq $3, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $13, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. 
- */ - lea 13(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_13_use): - add $16, %r10 - jg LABEL(nibble_ashr_13_use) - -LABEL(nibble_ashr_13_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $13, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_13_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $13, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_13_use) - - .p2align 4 -LABEL(nibble_ashr_13_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $13, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $2, %ecx - ja LABEL(nibble_ashr_13_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_14 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 - */ - .p2align 4 -LABEL(ashr_14): - pslldq $2, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - movdqa 
(%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $14, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 14(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - sub $0x1000, %r10 /* subtract 4K pagesize */ - - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_14_use): - add $16, %r10 - jg LABEL(nibble_ashr_14_use) - -LABEL(nibble_ashr_14_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $14, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_14_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $14, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_14_use) - - .p2align 4 -LABEL(nibble_ashr_14_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $14, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $1, %ecx - ja LABEL(nibble_ashr_14_restart_use) - - jmp LABEL(nibble_ashr_exit_use) - -/* - * The following cases will be handled by ashr_15 - * rcx(offset of rsi) rax(offset of rdi) 
relative offset corresponding case - * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 - */ - .p2align 4 -LABEL(ashr_15): - pslldq $1, %xmm2 - TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, %xmm2 - psubb %xmm0, %xmm2 - pmovmskb %xmm2, %r9d - shr %cl, %edx - shr %cl, %r9d - sub %r9d, %edx - jnz LABEL(less32bytes) - - movdqa (%rdi), %xmm3 - - UPDATE_STRNCMP_COUNTER - - mov $16, %rcx /* index for loads */ - mov $15, %r9d /* byte position left over from less32bytes case */ - /* - * Setup %r10 value allows us to detect crossing a page boundary. - * When %r10 goes positive we have crossed a page boundary and - * need to do a nibble. - */ - lea 15(%rdi), %r10 - and $0xfff, %r10 /* offset into 4K page */ - - sub $0x1000, %r10 /* subtract 4K pagesize */ - - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ - - .p2align 4 -LABEL(loop_ashr_15_use): - add $16, %r10 - jg LABEL(nibble_ashr_15_use) - -LABEL(nibble_ashr_15_restart_use): - movdqa (%rdi, %rdx), %xmm0 - palignr $15, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - - add $16, %rdx - add $16, %r10 - jg LABEL(nibble_ashr_15_use) - - movdqa (%rdi, %rdx), %xmm0 - palignr $15, -16(%rdi, %rdx), %xmm0 -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a, (%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - jbe LABEL(exit_use) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub $16, %r11 - jbe LABEL(strcmp_exitz) -#endif - add $16, %rdx - jmp LABEL(loop_ashr_15_use) - - .p2align 4 -LABEL(nibble_ashr_15_use): - sub $0x1000, %r10 - movdqa -16(%rdi, %rdx), %xmm0 - psrldq $15, %xmm0 - pcmpistri $0x3a,%xmm0, %xmm0 
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - cmp %r11, %rcx - jae LABEL(nibble_ashr_exit_use) -#endif - cmp $0, %ecx - ja LABEL(nibble_ashr_15_restart_use) - -LABEL(nibble_ashr_exit_use): -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpistri $0x1a,(%rsi,%rdx), %xmm0 -#else - movdqa (%rsi,%rdx), %xmm1 - TOLOWER (%xmm0, %xmm1) - pcmpistri $0x1a, %xmm1, %xmm0 -#endif - .p2align 4 -LABEL(exit_use): - jnc LABEL(strcmp_exitz) -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub %rcx, %r11 - jbe LABEL(strcmp_exitz) -#endif - add %rcx, %rdx - lea -16(%rdi, %r9), %rdi - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %edx - test %r8d, %r8d - jz LABEL(ret_use) - xchg %eax, %edx -LABEL(ret_use): -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx - movl (%rcx,%rdx,4), %edx - movl (%rcx,%rax,4), %eax -#endif - - sub %edx, %eax - ret - -LABEL(less32bytes): - lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ - lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ - test %r8d, %r8d - jz LABEL(ret) - xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ - - .p2align 4 -LABEL(ret): -LABEL(less16bytes): - bsf %rdx, %rdx /* find and store bit index in %rdx */ - -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L - sub %rdx, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzbl (%rsi, %rdx), %ecx - movzbl (%rdi, %rdx), %eax - -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx - movl (%rdx,%rcx,4), %ecx - movl (%rdx,%rax,4), %eax -#endif - - sub %ecx, %eax - ret - -LABEL(strcmp_exitz): - xor %eax, %eax - ret - - .p2align 4 - // XXX Same as code above -LABEL(Byte0): - movzbl (%rsi), %ecx - movzbl (%rdi), %eax - -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx - movl (%rdx,%rcx,4), 
%ecx - movl (%rdx,%rax,4), %eax -#endif - - sub %ecx, %eax - ret - cfi_endproc - .size STRCMP_SSE42, .-STRCMP_SSE42 - -#undef UCLOW_reg -#undef UCHIGH_reg -#undef LCQWORD_reg -#undef TOLOWER - - /* Put all SSE 4.2 functions together. */ - .section .rodata.SECTION,"a",@progbits - .p2align 3 -LABEL(unaligned_table): - .int LABEL(ashr_1) - LABEL(unaligned_table) - .int LABEL(ashr_2) - LABEL(unaligned_table) - .int LABEL(ashr_3) - LABEL(unaligned_table) - .int LABEL(ashr_4) - LABEL(unaligned_table) - .int LABEL(ashr_5) - LABEL(unaligned_table) - .int LABEL(ashr_6) - LABEL(unaligned_table) - .int LABEL(ashr_7) - LABEL(unaligned_table) - .int LABEL(ashr_8) - LABEL(unaligned_table) - .int LABEL(ashr_9) - LABEL(unaligned_table) - .int LABEL(ashr_10) - LABEL(unaligned_table) - .int LABEL(ashr_11) - LABEL(unaligned_table) - .int LABEL(ashr_12) - LABEL(unaligned_table) - .int LABEL(ashr_13) - LABEL(unaligned_table) - .int LABEL(ashr_14) - LABEL(unaligned_table) - .int LABEL(ashr_15) - LABEL(unaligned_table) - .int LABEL(ashr_0) - LABEL(unaligned_table) - -#undef LABEL -#undef GLABEL -#undef SECTION -#undef movdqa -#undef movdqu -#undef pmovmskb -#undef pcmpistri -#undef psubb -#undef pcmpeqb -#undef psrldq -#undef pslldq -#undef palignr -#undef pxor -#undef D diff --git a/sysdeps/x86_64/multiarch/strcmp-sse4_2.S b/sysdeps/x86_64/multiarch/strcmp-sse4_2.S index 2c916bafa0..963e208ccb 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse4_2.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse4_2.S @@ -17,5 +17,1766 @@ <https://www.gnu.org/licenses/>. */ #if IS_IN (libc) -# include "strcmp-sse42.S" +# include <sysdep.h> + +# define STRCMP_ISA _sse42 +# include "strcmp-naming.h" + +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" +# endif + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. 
*/ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 +# else +# define UPDATE_STRNCMP_COUNTER +# endif + +# define SECTION sse4.2 + +# define LABEL(l) .L##l + +/* We use 0x1a: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to find out if two 16byte data elements are the same + and the offset of the first different byte. There are 4 cases: + + 1. Both 16byte data elements are valid and identical. + 2. Both 16byte data elements have EOS and identical. + 3. Both 16byte data elements are valid and they differ at offset X. + 4. At least one 16byte data element has EOS at offset X. Two 16byte + data elements must differ at or before offset X. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: + + case ECX CFlag ZFlag SFlag + 1 16 0 0 0 + 2 16 0 1 1 + 3 X 1 0 0 + 4 0 <= X 1 0/1 0/1 + + We exit from the loop for cases 2, 3 and 4 with jbe which branches + when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for + case 2. */ + + /* Put all SSE 4.2 functions together. */ + .section .text.SECTION,"ax",@progbits + .align 16 + .type STRCMP, @function + .globl STRCMP +# ifdef USE_AS_STRCASECMP_L +ENTRY (STRCASECMP) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + + /* Either 1 or 5 bytes (depending on whether CET is enabled). */ + .p2align 4 +END (STRCASECMP) + /* FALLTHROUGH to strcasecmp_l. */ +# endif +# ifdef USE_AS_STRNCASECMP_L +ENTRY (STRCASECMP) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + + /* Either 1 or 5 bytes (depending on whether CET is enabled). */ + .p2align 4 +END (STRCASECMP) + /* FALLTHROUGH to strncasecmp_l. */ +# endif + + +# define arg arg + +STRCMP: + cfi_startproc + _CET_ENDBR + CALL_MCOUNT + +/* + * This implementation uses SSE to compare up to 16 bytes at a time.
+ */ +# ifdef USE_AS_STRCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP +# else + mov (%rdx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strcasecmp_l_nonascii +# endif +# ifdef USE_AS_STRNCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP +# else + mov (%rcx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strncasecmp_l_nonascii +# endif + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + test %RDX_LP, %RDX_LP + je LABEL(strcmp_exitz) + cmp $1, %RDX_LP + je LABEL(Byte0) + mov %RDX_LP, %R11_LP +# endif + mov %esi, %ecx + mov %edi, %eax +/* Use 64bit AND here to avoid long NOP padding. 
*/ + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +LABEL(lcase_min): + .quad 0x3f3f3f3f3f3f3f3f + .quad 0x3f3f3f3f3f3f3f3f +LABEL(lcase_max): + .quad 0x9999999999999999 + .quad 0x9999999999999999 +LABEL(case_add): + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + movdqa LABEL(lcase_min)(%rip), %xmm4 +# define LCASE_MIN_reg %xmm4 + movdqa LABEL(lcase_max)(%rip), %xmm5 +# define LCASE_MAX_reg %xmm5 + movdqa LABEL(case_add)(%rip), %xmm6 +# define CASE_ADD_reg %xmm6 +# endif + cmp $0x30, %ecx + ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define TOLOWER(reg1, reg2) \ + movdqa LCASE_MIN_reg, %xmm7; \ + movdqa LCASE_MIN_reg, %xmm8; \ + paddb reg1, %xmm7; \ + paddb reg2, %xmm8; \ + pcmpgtb LCASE_MAX_reg, %xmm7; \ + pcmpgtb LCASE_MAX_reg, %xmm8; \ + pandn CASE_ADD_reg, %xmm7; \ + pandn CASE_ADD_reg, %xmm8; \ + paddb %xmm7, reg1; \ + paddb %xmm8, reg2 + + TOLOWER (%xmm1, %xmm2) +# else +# define TOLOWER(reg1, reg2) +# endif + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes)/* If not, find different value or null char */ +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz)/* finish comparison */ +# endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine source and destination string offsets from 16-byte + * alignment. Use relative offset difference between the two to + * determine which case below to use. + */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + lea 15(%rax), %r9 + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + lea (%r10, %r9), %r10 + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +LABEL(ashr_0): + + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ +# else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ +# endif + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx must be the same with r9d if in left byte (16-rcx) is equal to + * the start from (16-rax) and no null char was seen. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32-bytes per iteration. + */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + .p2align 4 +LABEL(ashr_0_use): + movdqa (%rdi,%rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + lea 16(%rdx), %rdx + jbe LABEL(ashr_0_exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + movdqa (%rdi,%rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + lea 16(%rdx), %rdx + jbe LABEL(ashr_0_exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + jmp LABEL(ashr_0_use) + + + .p2align 4 +LABEL(ashr_0_exit_use): + jnc LABEL(strcmp_exitz) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rcx, %r11 + jbe LABEL(strcmp_exitz) +# endif + lea -16(%rdx, %rcx), %rcx + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx +# 
if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx + movl (%rcx,%rax,4), %eax + movl (%rcx,%rdx,4), %edx +# endif + sub %edx, %eax + ret + + + +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +LABEL(ashr_1): + pslldq $15, %xmm2 /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads*/ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_1_use): + add $16, %r10 + jg LABEL(nibble_ashr_1_use) + +LABEL(nibble_ashr_1_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $1, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_1_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $1, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_1_use) + + .p2align 4 +LABEL(nibble_ashr_1_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $1, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $14, %ecx + ja LABEL(nibble_ashr_1_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +LABEL(ashr_2): + pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), 
%xmm3 + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_2_use): + add $16, %r10 + jg LABEL(nibble_ashr_2_use) + +LABEL(nibble_ashr_2_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $2, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_2_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $2, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_2_use) + + .p2align 4 +LABEL(nibble_ashr_2_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $2, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $13, %ecx + ja LABEL(nibble_ashr_2_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset 
corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +LABEL(ashr_3): + pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + +LABEL(loop_ashr_3_use): + add $16, %r10 + jg LABEL(nibble_ashr_3_use) + +LABEL(nibble_ashr_3_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $3, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_3_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $3, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_3_use) + + .p2align 4 +LABEL(nibble_ashr_3_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $3, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $12, %ecx + ja LABEL(nibble_ashr_3_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +LABEL(ashr_4): + pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_4_use): + add $16, %r10 + jg LABEL(nibble_ashr_4_use) + +LABEL(nibble_ashr_4_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $4, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_4_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $4, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_4_use) + + .p2align 4 +LABEL(nibble_ashr_4_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $4, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $11, %ecx + ja LABEL(nibble_ashr_4_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +LABEL(ashr_5): + pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_5_use): + add $16, %r10 + jg LABEL(nibble_ashr_5_use) + +LABEL(nibble_ashr_5_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $5, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_5_use) + + movdqa (%rdi, %rdx), %xmm0 + + palignr $5, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_5_use) + + .p2align 4 +LABEL(nibble_ashr_5_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $5, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $10, %ecx + ja LABEL(nibble_ashr_5_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + */ + .p2align 4 +LABEL(ashr_6): + pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), 
%xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_6_use): + add $16, %r10 + jg LABEL(nibble_ashr_6_use) + +LABEL(nibble_ashr_6_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $6, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_6_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $6, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_6_use) + + .p2align 4 +LABEL(nibble_ashr_6_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $6, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $9, %ecx + ja LABEL(nibble_ashr_6_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset 
corresponding case + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + */ + .p2align 4 +LABEL(ashr_7): + pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 7(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_7_use): + add $16, %r10 + jg LABEL(nibble_ashr_7_use) + +LABEL(nibble_ashr_7_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $7, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_7_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $7, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_7_use) + + .p2align 4 +LABEL(nibble_ashr_7_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $7, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +# if defined 
USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $8, %ecx + ja LABEL(nibble_ashr_7_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 + */ + .p2align 4 +LABEL(ashr_8): + pslldq $8, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $8, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 8(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_8_use): + add $16, %r10 + jg LABEL(nibble_ashr_8_use) + +LABEL(nibble_ashr_8_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $8, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_8_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $8, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined 
USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_8_use) + + .p2align 4 +LABEL(nibble_ashr_8_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $8, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $7, %ecx + ja LABEL(nibble_ashr_8_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 + */ + .p2align 4 +LABEL(ashr_9): + pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $9, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 9(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_9_use): + add $16, %r10 + jg LABEL(nibble_ashr_9_use) + +LABEL(nibble_ashr_9_restart_use): + movdqa (%rdi, %rdx), %xmm0 + + palignr $9, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_9_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $9, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_9_use) + + .p2align 4 +LABEL(nibble_ashr_9_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $9, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $6, %ecx + ja LABEL(nibble_ashr_9_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 + */ + .p2align 4 +LABEL(ashr_10): + pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa 
(%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $10, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 10(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_10_use): + add $16, %r10 + jg LABEL(nibble_ashr_10_use) + +LABEL(nibble_ashr_10_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $10, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_10_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $10, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_10_use) + + .p2align 4 +LABEL(nibble_ashr_10_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $10, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $5, %ecx + ja LABEL(nibble_ashr_10_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of 
rdi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 + */ + .p2align 4 +LABEL(ashr_11): + pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $11, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 11(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_11_use): + add $16, %r10 + jg LABEL(nibble_ashr_11_use) + +LABEL(nibble_ashr_11_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $11, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_11_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $11, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_11_use) + + .p2align 4 +LABEL(nibble_ashr_11_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $11, %xmm0 + pcmpistri 
$0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $4, %ecx + ja LABEL(nibble_ashr_11_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 + */ + .p2align 4 +LABEL(ashr_12): + pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $12, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 12(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_12_use): + add $16, %r10 + jg LABEL(nibble_ashr_12_use) + +LABEL(nibble_ashr_12_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $12, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_12_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $12, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 
+# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_12_use) + + .p2align 4 +LABEL(nibble_ashr_12_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $12, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $3, %ecx + ja LABEL(nibble_ashr_12_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 + */ + .p2align 4 +LABEL(ashr_13): + pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $13, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 13(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_13_use): + add $16, %r10 + jg LABEL(nibble_ashr_13_use) + +LABEL(nibble_ashr_13_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $13, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_13_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $13, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_13_use) + + .p2align 4 +LABEL(nibble_ashr_13_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $13, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $2, %ecx + ja LABEL(nibble_ashr_13_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 + */ + .p2align 4 +LABEL(ashr_14): + pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + 
movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $14, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 14(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_14_use): + add $16, %r10 + jg LABEL(nibble_ashr_14_use) + +LABEL(nibble_ashr_14_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $14, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_14_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $14, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_14_use) + + .p2align 4 +LABEL(nibble_ashr_14_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $14, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $1, %ecx + ja LABEL(nibble_ashr_14_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) 
rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 + */ + .p2align 4 +LABEL(ashr_15): + pslldq $1, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $15, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 15(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_15_use): + add $16, %r10 + jg LABEL(nibble_ashr_15_use) + +LABEL(nibble_ashr_15_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $15, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_15_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $15, -16(%rdi, %rdx), %xmm0 +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + jbe LABEL(exit_use) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rdx + jmp LABEL(loop_ashr_15_use) + + .p2align 4 +LABEL(nibble_ashr_15_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $15, %xmm0 + 
pcmpistri $0x3a,%xmm0, %xmm0 +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +# endif + cmp $0, %ecx + ja LABEL(nibble_ashr_15_restart_use) + +LABEL(nibble_ashr_exit_use): +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +# else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +# endif + .p2align 4 +LABEL(exit_use): + jnc LABEL(strcmp_exitz) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rcx, %r11 + jbe LABEL(strcmp_exitz) +# endif + add %rcx, %rdx + lea -16(%rdi, %r9), %rdi + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + test %r8d, %r8d + jz LABEL(ret_use) + xchg %eax, %edx +LABEL(ret_use): +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx + movl (%rcx,%rdx,4), %edx + movl (%rcx,%rax,4), %eax +# endif + + sub %edx, %eax + ret + +LABEL(less32bytes): + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ + test %r8d, %r8d + jz LABEL(ret) + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ + + .p2align 4 +LABEL(ret): +LABEL(less16bytes): + bsf %rdx, %rdx /* find and store bit index in %rdx */ + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rdx, %r11 + jbe LABEL(strcmp_exitz) +# endif + movzbl (%rsi, %rdx), %ecx + movzbl (%rdi, %rdx), %eax + +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +# endif + + sub %ecx, %eax + ret + +LABEL(strcmp_exitz): + xor %eax, %eax + ret + + .p2align 4 + // XXX Same as code above +LABEL(Byte0): + movzbl (%rsi), %ecx + movzbl (%rdi), %eax + +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq 
_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +# endif + + sub %ecx, %eax + ret + cfi_endproc + .size STRCMP, .-STRCMP + +# undef UCLOW_reg +# undef UCHIGH_reg +# undef LCQWORD_reg +# undef TOLOWER + + /* Put all SSE 4.2 functions together. */ + .section .rodata.SECTION,"a",@progbits + .p2align 3 +LABEL(unaligned_table): + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_9) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + .int LABEL(ashr_0) - LABEL(unaligned_table) + +# undef LABEL +# undef GLABEL +# undef SECTION +# undef movdqa +# undef movdqu +# undef pmovmskb +# undef pcmpistri +# undef psubb +# undef pcmpeqb +# undef psrldq +# undef pslldq +# undef palignr +# undef pxor +# undef D #endif diff --git a/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S b/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S index 08e23548c3..1ce5c4e93f 100644 --- a/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S +++ b/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S @@ -16,6 +16,5 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#define STRCMP_SSE42 __strncasecmp_l_sse42 #define USE_AS_STRNCASECMP_L -#include "strcmp-sse42.S" +#include "strcmp-sse4_2.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S index 310a6dbe77..2a02f0c2a6 100644 --- a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S +++ b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S @@ -16,8 +16,5 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) -# define STRCMP_SSE42 __strncmp_sse42 -# define USE_AS_STRNCMP -# include "strcmp-sse42.S" -#endif +#define USE_AS_STRNCMP +#include "strcmp-sse4_2.S"